Taking the role of a compiler: Smoothing filter
25. Nov '13
In our Advanced Computer Architectures 1 labs at TU Berlin we had to play the role of a compiler. Our starting point was the following piece of C code, a smoothing filter that is commonly used as a low-pass filter in digital signal processing:
#include <stdio.h>
/* Number of elements in the input and output arrays. */
#define N_SAMPLES 10
/* Number of filter coefficients (width of the smoothing window). */
#define N_COEFFS 3
/* Input signal handed to smooth(). */
double sample[N_SAMPLES] = {1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10};
/* Filter weights; smooth() divides by their sum, so they need not add up to 1. */
double coeff[N_COEFFS]= {0.5, 1, 0.5};
/* Output buffer, one element per input sample; printed by main(). */
double result[N_SAMPLES];
/*
 * Weighted moving-average filter.
 *
 * sample: input values, n elements
 * coeff:  N_COEFFS filter weights
 * result: output values, n elements
 * n:      number of samples
 *
 * The first and the last sample are copied through unchanged. Every other
 * output is the weighted sum of the window that starts one sample to the
 * left of the current position, divided by the sum of all weights.
 */
void smooth(double sample[], double coeff[], double result[], int n)
{
    int pos, tap;
    double norm = 0.0;

    /* Normalisation factor: the sum of the filter weights. */
    for (tap = 0; tap < N_COEFFS; tap++) {
        norm += coeff[tap];
    }

    for (pos = 0; pos < n; pos++) {
        /* Border samples lack a neighbour on one side: pass them through. */
        if (pos == 0 || pos == n - 1) {
            result[pos] = sample[pos];
            continue;
        }

        /* Accumulate directly in the output slot, then normalise it. */
        result[pos] = 0.0;
        for (tap = 0; tap < N_COEFFS; tap++) {
            result[pos] += sample[pos - 1 + tap] * coeff[tap];
        }
        result[pos] /= norm;
    }
}
/*
 * Runs the filter over the global sample array and prints every element of
 * result, one value per line.
 */
int main(int argc, char *arvg[])
{
    int k = 0;

    /* The filter is only run when there are at least as many samples as coefficients. */
    if (N_SAMPLES >= N_COEFFS) {
        smooth(sample, coeff, result, N_SAMPLES);
    }

    while (k < N_SAMPLES) {
        printf("%f\n", result[k]);
        k++;
    }
}
I took the hard way: writing the assembly by hand and invoking it from C code. The resulting assembly looked like this:
# smooth -- three-tap smoothing filter, callable from C as a function without
# arguments.
#
# Reads N_SAMPLES doubles from `sample` and writes N_SAMPLES doubles to
# `result`. The first and the last sample are copied unchanged; every other
# result[i] is the weighted sum of sample[i-1], sample[i] and sample[i+1].
# The three weights are the values in `coeff`, each pre-multiplied by the
# reciprocal of their sum so that the loop needs no division.
#
# Exactly three coefficients are read; N_COEFFS is only used for the
# "enough samples" guard. A coefficient that compares below 0.0 is negated
# before use, so the filter works with coefficient magnitudes.
#
# Registers: $t0/$t1/$t2 are the byte offsets of the left/centre/right sample
# of the current window, $t8 is the byte offset of the last sample and $t9 the
# sample count. $f20/$f22/$f24 hold the normalised coefficients, $f2 the
# reciprocal of the norm.
# NOTE(review): $f20-$f26 are overwritten without being saved -- confirm this
# is acceptable under the calling convention used by the C caller.
# NOTE(review): `norm` is exported below but no label of that name is defined
# in this file.
.global N_SAMPLES
.global N_COEFFS
.global result
.global sample
.global smooth
.global norm
.global coeff
.data
N_COEFFS: .word 3
coeff: .double 0.5, 1.0, 0.5
N_SAMPLES: .word 5
sample: .double 1.0, 2.0, 1.0, 2.0, 1.0
result: .double 0.0, 0.0, 0.0, 0.0, 0.0
one: .double 1.0
zero: .double 0.0
.text
smooth:
addi $t1, $zero, 8 ; Reset second offset
addi $t0, $zero, 0 ; Reset first offset
addi $t2, $zero, 16 ; Reset third offset
lw $t9, N_SAMPLES($zero) ; Load sample count
lw $t8, N_COEFFS($zero) ; Load coeff count
slt $t8, $t9, $t8 ; Set if samples < coeffs (signed compare)
bnez $t8, bail ; If not enough samples bail out
l.d $f0, zero($zero) ; Load 0.0 for the sign checks, $f0 is arbitrary on entry
l.d $f20, coeff($zero) ; Load first coefficient
c.lt.d $f20, $f0 ; Is the first coefficient negative?
bc1f coeff1ok ; Keep it as it is if not
sub.d $f20, $f0, $f20 ; Negate it: 0.0 - coefficient
coeff1ok:
l.d $f22, coeff($t1) ; Load second coefficient
c.lt.d $f22, $f0 ; Is the second coefficient negative?
bc1f coeff2ok ; Keep it as it is if not
sub.d $f22, $f0, $f22 ; Negate it: 0.0 - coefficient
coeff2ok:
add.d $f2, $f20, $f22 ; Calculate norm
l.d $f24, coeff($t2) ; Load third coefficient
c.lt.d $f24, $f0 ; Is the third coefficient negative?
bc1f coeff3ok ; Keep it as it is if not
sub.d $f24, $f0, $f24 ; Negate it: 0.0 - coefficient
coeff3ok:
add.d $f2, $f2, $f24 ; Add third coefficient
l.d $f26, one($zero) ; Load 1.0
div.d $f2, $f26, $f2 ; Invert norm
l.d $f4, sample($zero) ; Load first sample
s.d $f4, result($zero) ; Store first sample
mul.d $f20, $f20, $f2 ; Normalise first coefficient
addi $t4, $zero, 1 ; Store 1
sub $t8, $t9, $t4 ; Calculate the index of last sample
sll $t8, $t8, 3 ; Multiply it by 8
l.d $f4, sample($t8) ; Load the last sample
s.d $f4, result($t8) ; Store the last sample
mul.d $f22, $f22, $f2 ; Normalise second coefficient
mul.d $f24, $f24, $f2 ; Normalise third coefficient
smoothloop:
l.d $f8, sample($t0) ; Load first sample
l.d $f10, sample($t1) ; Load second sample
mul.d $f14, $f8, $f20 ; Multiply first coefficient and first sample
l.d $f12, sample($t2) ; Load third sample
mul.d $f16, $f10, $f22 ; Multiply second coefficient and second sample
mul.d $f18, $f12, $f24 ; Multiply third coefficient and third sample
addi $t0, $t0, 8 ; Increment first pointer, it now addresses the window centre
addi $t1, $t1, 8 ; Increment second pointer
addi $t2, $t2, 8 ; Increment third pointer
add.d $f4, $f16, $f14 ; Sum first two products
add.d $f4, $f4, $f18 ; Sum with third product
s.d $f4, result($t0) ; Store multiplication-accumulation at the window centre
bne $t8, $t1, smoothloop ; Loop until the next centre offset equals the last sample offset
bail:
j $ra
Frontend written in C to invoke the assembly:
#include <stdio.h>
/* Symbols declared here are expected to be provided by the assembly file (smooth.s). */
extern int N_SAMPLES;     /* number of elements in sample[] and result[] */
extern int N_COEFFS;      /* number of elements in coeff[] */
extern double sample[];   /* filter input */
extern double result[];   /* filter output, printed after smooth() has run */
extern double coeff[];    /* filter coefficients */
extern void smooth();     /* the filter routine; called without arguments */
extern double norm;       /* declared but not referenced anywhere in this file */
/*
 * Calls the assembly routine and then dumps the sizes, the input samples,
 * the coefficients and the filtered result to standard output.
 */
int main(int argc, char **argv) {
    int idx;

    /* Run the filter first so that result[] is filled in before it is printed. */
    smooth();

    printf("N_SAMPLES:%d\n", N_SAMPLES);
    printf("N_COEFFS:%d\n", N_COEFFS);

    printf("--\n");
    idx = 0;
    while (idx < N_SAMPLES) {
        printf("sample[%d]: %f\n", idx, sample[idx]);
        idx++;
    }

    printf("--\n");
    idx = 0;
    while (idx < N_COEFFS) {
        printf("coeff[%d]: %f\n", idx, coeff[idx]);
        idx++;
    }

    printf("--\n");
    idx = 0;
    while (idx < N_SAMPLES) {
        printf("result[%d]: %f\n", idx, result[idx]);
        idx++;
    }
}
To compile it I used the following Makefile:
# Builds the C frontend together with the hand-written assembly, runs the
# resulting binary under qemu-mips, or removes it.

# run and clean do not produce files of the same name.
.PHONY: run clean

# $^ hands every prerequisite to the compiler. $? would pass only the files
# newer than the target, so touching a single source would break the link.
smooth: smooth-frontend.c smooth.s
	mips-openwrt-linux-uclibc-gcc -march=mips64r2 -static -o $@ $^

# Make sure the binary is up to date before running it.
run: smooth
	qemu-mips smooth

clean:
	rm -fv smooth
And finally, to transform this GNU Compiler Collection compatible piece of assembly into something that WinMIPS64 would understand, I used the following Python 2 snippet:
def translate_line(line):
    """Convert one line of smooth.s into the lines to emit for WinMIPS64.

    Returns a list of output lines:
    - an empty list for blank lines, lines starting with "#" and ".global"
      directives, which are dropped;
    - ["nop", "halt"] for a line starting with "j $ra", so the program stops
      instead of returning;
    - otherwise a single line in which addi/sub/sll are prefixed with "d" and
      every "$f" is replaced by "f".
    """
    line = line.strip()
    if not line or line.startswith("#") or line.startswith(".global"):
        return []
    # The trailing space keeps floating-point mnemonics such as "sub.d" untouched.
    if line.startswith(("addi ", "sub ", "sll ")):
        line = "d" + line
    line = line.replace("$f", "f")
    if line.startswith("j $ra"):
        return ["nop", "halt"]
    return [line]


if __name__ == "__main__":
    # "with" closes the file; print() with one argument works on Python 2 and 3.
    with open("smooth.s") as source:
        for raw_line in source:
            for converted in translate_line(raw_line):
                print(converted)
The final piece of code fed to WinMIPS64 was this:
; WinMIPS64 version of the three-tap smoothing filter.
;
; Reads N_SAMPLES doubles from `sample` and writes N_SAMPLES doubles to
; `result`. The first and the last sample are copied unchanged; every other
; result[i] is the weighted sum of sample[i-1], sample[i] and sample[i+1].
; The three weights are the values in `coeff`, each pre-multiplied by the
; reciprocal of their sum so that the loop needs no division.
;
; Exactly three coefficients are read; N_COEFFS is only used for the
; "enough samples" guard. A coefficient that compares below f0 is negated
; before use.
; NOTE(review): f0 is never written in this listing, the sign checks assume
; it reads as 0.0 -- confirm the simulator's reset state.
;
; Registers: $t0/$t1/$t2 are the byte offsets of the left/centre/right sample
; of the current window, $t8 is the byte offset of the last sample and $t9 the
; sample count. f20/f22/f24 hold the normalised coefficients, f2 the
; reciprocal of the norm.
.data
N_COEFFS: .word 3
coeff: .double 0.5, 1.0, 0.5
N_SAMPLES: .word 5
sample: .double 1.0, 2.0, 1.0, 2.0, 1.0
result: .double 0.0, 0.0, 0.0, 0.0, 0.0
one: .double 1.0
.text
smooth:
daddi $t1, $zero, 8 ; Reset second offset
daddi $t0, $zero, 0 ; Reset first offset
daddi $t2, $zero, 16 ; Reset third offset
lw $t9, N_SAMPLES($zero) ; Load sample count
lw $t8, N_COEFFS($zero) ; Load coeff count
slt $t8, $t9, $t8 ; Set if samples < coeffs (signed compare)
bnez $t8, bail ; If not enough samples bail out
l.d f20, coeff($zero) ; Load first coefficient
c.lt.d f20, f0 ; Is the first coefficient below f0?
bc1f coeff1ok ; Keep it as it is if not
sub.d f20, f0, f20 ; Negate it: f0 - coefficient
coeff1ok:
l.d f22, coeff($t1) ; Load second coefficient
c.lt.d f22, f0 ; Is the second coefficient below f0?
bc1f coeff2ok ; Keep it as it is if not
sub.d f22, f0, f22 ; Negate it: f0 - coefficient
coeff2ok:
add.d f2, f20, f22 ; Calculate norm
l.d f24, coeff($t2) ; Load third coefficient
c.lt.d f24, f0 ; Is the third coefficient below f0?
bc1f coeff3ok ; Keep it as it is if not
sub.d f24, f0, f24 ; Negate it: f0 - coefficient
coeff3ok:
add.d f2, f2, f24 ; Add third coefficient
l.d f26, one($zero) ; Load 1.0
div.d f2, f26, f2 ; Invert norm
l.d f4, sample($zero) ; Load first sample
s.d f4, result($zero) ; Store first sample
mul.d f20, f20, f2 ; Normalise first coefficient
daddi $t4, $zero, 1 ; Store 1
dsub $t8, $t9, $t4 ; Calculate the index of last sample
dsll $t8, $t8, 3 ; Multiply it by 8
l.d f4, sample($t8) ; Load the last sample
s.d f4, result($t8) ; Store the last sample
mul.d f22, f22, f2 ; Normalise second coefficient
mul.d f24, f24, f2 ; Normalise third coefficient
smoothloop:
l.d f8, sample($t0) ; Load first sample
l.d f10, sample($t1) ; Load second sample
mul.d f14, f8, f20 ; Multiply first coefficient and first sample
l.d f12, sample($t2) ; Load third sample
mul.d f16, f10, f22 ; Multiply second coefficient and second sample
add.d f4, f16, f14 ; Sum first two products
mul.d f18, f12, f24 ; Multiply third coefficient and third sample
daddi $t0, $t0, 8 ; Increment first pointer, it now addresses the window centre
daddi $t1, $t1, 8 ; Increment second pointer
daddi $t2, $t2, 8 ; Increment third pointer
add.d f4, f4, f18 ; Sum with third product
s.d f4, result($t0) ; Store multiplication-accumulation at the window centre
bne $t8, $t1, smoothloop ; Loop until the next centre offset equals the last sample offset
bail:
nop
halt