## Taking the role of compiler: Smoothing filter25. Nov '13

In our Advanced Computer Architectures 1 labs at TU Berlin we had to play the role of a compiler. Our original piece of code written in C, commonly used as low-pass filter in digital signal processing:

#include <stdio.h>

#define N_SAMPLES   10
#define N_COEFFS    3

double      sample[N_SAMPLES] = {1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10};
double      coeff[N_COEFFS]= {0.5, 1, 0.5};
double      result[N_SAMPLES];

void smooth(double sample[], double coeff[], double result[], int n)
{
int i, j;
double norm=0.0;

for (i=0; i<N_COEFFS; i++)
norm+= coeff[i];

for (i=0; i<n; i++){
if (i==0 || i==n-1){
result[i] = sample[i];
}else{
result[i]=0.0;
for (j=0; j<N_COEFFS; j++)
result[i] += sample[i-1+j]*coeff[j];
result[i]/=norm;
}
}
}

int main(int argc, char *arvg[])
{
int i;

if (N_SAMPLES>=N_COEFFS)
smooth(sample, coeff, result, N_SAMPLES);

for (i=0; i<N_SAMPLES; i++)
printf("%f\n", result[i]);
}

I took the hard way, write assembly and invoke it from C code. The resulting assembly looked like this:

.global N_SAMPLES
.global N_COEFFS
.global result
.global sample
.global smooth
.global norm
.global coeff

.data

N_COEFFS: .word 3
coeff: .double 0.5, 1.0, 0.5
N_SAMPLES: .word 5
sample: .double 1.0, 2.0, 1.0, 2.0, 1.0
result: .double 0.0, 0.0, 0.0, 0.0, 0.0

one: .double 1.0

.text

smooth:
addi $t1,$zero, 8           ; Reset second offset
addi $t0,$zero, 0           ; Reset first offset
addi $t2,$zero, 16          ; Reset third offset

lw $t9, N_SAMPLES($zero)     ; Load sample count
lw $t8, N_COEFFS($zero)      ; Load coeff count

sub $t8,$t9, $t8 ; Subtract number of coeffs from number of samples sltu$t8, $zero,$t8         ; If samples-coeffs < 0
beqz $t8, bail ; If not enough samples bail out l.d$f20, coeff($zero) ; Load first coefficient c.lt.d$f20, $f0 bc1f coeff1ok sub.d$f20, $f0,$f20
coeff1ok:

l.d $f22, coeff($t1)         ; Load second coefficient
c.lt.d $f22,$f0
bc1f coeff2ok
sub.d $f22,$f0, $f22 coeff2ok: add.d$f2, $f20,$f22        ; Calculate norm

l.d $f24, coeff($t2)         ; Load third coefficient
c.lt.d $f24,$f0
bc1f coeff3ok
sub.d $f24,$f0, $f24 coeff3ok: add.d$f2, $f2,$f24         ; Add third coefficient

l.d $f26, one($zero)         ; Load 1.0
div.d $f2,$f26, $f2 ; Invert norm l.d$f4, sample($zero) ; Load first sample s.d$f4, result($zero) ; Store first sample mul.d$f20, $f20,$f2        ; Calculate first coefficient

addi $t4,$zero, 1           ; Store 1
sub $t8,$t9, $t4 ; Calculate the index of last sample sll$t8, $t8, 3 ; Multiply it by 8 l.d$f4, sample($t8) ; Load the last sample s.d$f4, result($t8) ; Store the last sample mul.d$f22, $f22,$f2        ; Calculate second coefficient
mul.d $f24,$f24, $f2 ; Calculate third coefficient smoothloop: l.d$f8, sample($t0) ; Load first sample l.d$f10, sample($t1) ; Load second sample mul.d$f14, $f8,$f20    ; Multiply first coefficient and first sample

l.d $f12, sample($t2)    ; Load third sample

mul.d $f16,$f10, $f22 ; Multiply second coefficient and second sample mul.d$f18, $f12,$f24   ; Multiply third coefficient and third sample

addi $t0,$t0, 8         ; Increment first pointer
addi $t1,$t1, 8         ; Increment second pointer
addi $t2,$t2, 8         ; Increment third pointer

add.d $f4,$f16, $f14 ; Sum first two products add.d$f4, $f4,$f18     ; Sum with third product

s.d $f4, result($t0)     ; Store multiplication-accumulation

bne $t8,$t1, smoothloop ; Branch if pointer is lower than last sample offset

bail:
j $ra Frontend written in C to invoke the assembly: #include <stdio.h> extern int N_SAMPLES; extern int N_COEFFS; extern double sample[]; extern double result[]; extern double coeff[]; extern void smooth(); extern double norm; int main(int argc, char **argv) { int i; smooth(); printf("N_SAMPLES:%d\n", N_SAMPLES); printf("N_COEFFS:%d\n", N_COEFFS); printf("--\n"); for (i = 0; i < N_SAMPLES; i++) { printf("sample[%d]: %f\n", i, sample[i]); } printf("--\n"); for (i = 0; i < N_COEFFS; i++) { printf("coeff[%d]: %f\n", i, coeff[i]); } printf("--\n"); for (i = 0; i < N_SAMPLES; i++) { printf("result[%d]: %f\n", i, result[i]); } } To compile it I used following Makefile: smooth: smooth-frontend.c smooth.s mips-openwrt-linux-uclibc-gcc -march=mips64r2 -static -o$@ $? run: qemu-mips smooth clean: rm -fv smooth And finally to transform this GNU Compiler Collection compatible piece of assembly to something that WinMIPS64 would understand I used following Python snippet 2: for line in open("smooth.s"): line = line.strip() if line.startswith("#"): continue if not line: continue if line.startswith(".global"): continue if line.startswith("addi ") or line.startswith("sub ") or line.startswith("sll "): line = "d" + line line = line.replace("$f", "f")
if line.startswith("j $ra"): print "nop" print "halt" continue print line The final piece of code fed to WinMIPS64 was this: .data N_COEFFS: .word 3 coeff: .double 0.5, 1.0, 0.5 N_SAMPLES: .word 5 sample: .double 1.0, 2.0, 1.0, 2.0, 1.0 result: .double 0.0, 0.0, 0.0, 0.0, 0.0 one: .double 1.0 .text smooth: daddi$t1, $zero, 8 ; Reset second offset daddi$t0, $zero, 0 ; Reset first offset daddi$t2, $zero, 16 ; Reset third offset lw$t9, N_SAMPLES($zero) ; Load sample count lw$t8, N_COEFFS($zero) ; Load coeff count dsub$t8, $t9,$t8            ; Subtract number of coeffs from number of samples
sltu $t8,$zero, $t8 ; If samples-coeffs < 0 beqz$t8, bail               ; If not enough samples bail out
l.d f20, coeff($zero) ; Load first coefficient c.lt.d f20, f0 bc1f coeff1ok sub.d f20, f0, f20 coeff1ok: l.d f22, coeff($t1)         ; Load second coefficient
c.lt.d f22, f0
bc1f coeff2ok
sub.d f22, f0, f22
coeff2ok:
add.d f2, f20, f22        ; Calculate norm
l.d f24, coeff($t2) ; Load third coefficient c.lt.d f24, f0 bc1f coeff3ok sub.d f24, f0, f24 coeff3ok: add.d f2, f2, f24 ; Add third coefficient l.d f26, one($zero)         ; Load 1.0
div.d f2, f26, f2         ; Invert norm
l.d f4, sample($zero) ; Load first sample s.d f4, result($zero)       ; Store first sample
mul.d f20, f20, f2        ; Calculate first coefficient
daddi $t4,$zero, 1           ; Store 1
dsub $t8,$t9, $t4 ; Calculate the index of last sample dsll$t8, $t8, 3 ; Multiply it by 8 l.d f4, sample($t8)         ; Load the last sample
s.d f4, result($t8) ; Store the last sample mul.d f22, f22, f2 ; Calculate second coefficient mul.d f24, f24, f2 ; Calculate third coefficient smoothloop: l.d f8, sample($t0)     ; Load first sample
l.d f10, sample($t1) ; Load second sample mul.d f14, f8, f20 ; Multiply first coefficient and first sample l.d f12, sample($t2)    ; Load third sample
mul.d f16, f10, f22   ; Multiply second coefficient and second sample
add.d f4, f16, f14    ; Sum first two products
mul.d f18, f12, f24   ; Multiply third coefficient and third sample
daddi $t0,$t0, 8         ; Increment first pointer
daddi $t1,$t1, 8         ; Increment second pointer
daddi $t2,$t2, 8         ; Increment third pointer
add.d f4, f4, f18     ; Sum with third product
s.d f4, result($t0) ; Store multiplication-accumulation bne$t8, \$t1, smoothloop ; Branch if pointer is lower than last sample offset
bail:
nop
halt
1

http://www.aes.tu-berlin.de/menue/courses/aca/

2

http://indigo.ie/~mscott/

WinMIPS64 Python computer architecture C TU Berlin compilers assembly