Taking the role of compiler: Smoothing filter25. Nov '13

In our Advanced Computer Architectures 1 labs at TU Berlin we had to play the role of a compiler. Our original piece of code written in C, commonly used as low-pass filter in digital signal processing:

#include <stdio.h>

#define N_SAMPLES   10
#define N_COEFFS    3

double      sample[N_SAMPLES] = {1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10};
double      coeff[N_COEFFS]= {0.5, 1, 0.5};
double      result[N_SAMPLES];

void smooth(double sample[], double coeff[], double result[], int n)
{
        int i, j;
        double norm=0.0;

        for (i=0; i<N_COEFFS; i++)
                norm+= coeff[i];

        for (i=0; i<n; i++){
                if (i==0 || i==n-1){
                        result[i] = sample[i];
                }else{
                        result[i]=0.0;
                        for (j=0; j<N_COEFFS; j++)
                                result[i] += sample[i-1+j]*coeff[j];
                        result[i]/=norm;
                }
        }
}

int main(int argc, char *arvg[])
{
        int i;

        if (N_SAMPLES>=N_COEFFS)
                smooth(sample, coeff, result, N_SAMPLES);

        for (i=0; i<N_SAMPLES; i++)
                printf("%f\n", result[i]);
}

I took the hard way, write assembly and invoke it from C code. The resulting assembly looked like this:

.global N_SAMPLES
.global N_COEFFS
.global result
.global sample
.global smooth
.global norm
.global coeff

.data

N_COEFFS: .word 3
coeff: .double 0.5, 1.0, 0.5
N_SAMPLES: .word 5
sample: .double 1.0, 2.0, 1.0, 2.0, 1.0
result: .double 0.0, 0.0, 0.0, 0.0, 0.0

one: .double 1.0

.text

smooth:
    addi $t1, $zero, 8           ; Reset second offset
    addi $t0, $zero, 0           ; Reset first offset
    addi $t2, $zero, 16          ; Reset third offset

    lw $t9, N_SAMPLES($zero)     ; Load sample count
    lw $t8, N_COEFFS($zero)      ; Load coeff count

    sub $t8, $t9, $t8            ; Subtract number of coeffs from number of samples
    sltu $t8, $zero, $t8         ; If samples-coeffs < 0
    beqz $t8, bail               ; If not enough samples bail out

    l.d $f20, coeff($zero)       ; Load first coefficient
    c.lt.d $f20, $f0
    bc1f coeff1ok
    sub.d $f20, $f0, $f20
    coeff1ok:

    l.d $f22, coeff($t1)         ; Load second coefficient
    c.lt.d $f22, $f0
    bc1f coeff2ok
    sub.d $f22, $f0, $f22
    coeff2ok:

    add.d $f2, $f20, $f22        ; Calculate norm

    l.d $f24, coeff($t2)         ; Load third coefficient
    c.lt.d $f24, $f0
    bc1f coeff3ok
    sub.d $f24, $f0, $f24
    coeff3ok:

    add.d $f2, $f2, $f24         ; Add third coefficient

    l.d $f26, one($zero)         ; Load 1.0
    div.d $f2, $f26, $f2         ; Invert norm

    l.d $f4, sample($zero)       ; Load first sample
    s.d $f4, result($zero)       ; Store first sample


    mul.d $f20, $f20, $f2        ; Calculate first coefficient

    addi $t4, $zero, 1           ; Store 1
    sub $t8, $t9, $t4            ; Calculate the index of last sample
    sll $t8, $t8, 3              ; Multiply it by 8
    l.d $f4, sample($t8)         ; Load the last sample
    s.d $f4, result($t8)         ; Store the last sample



    mul.d $f22, $f22, $f2        ; Calculate second coefficient
    mul.d $f24, $f24, $f2        ; Calculate third coefficient


    smoothloop:
        l.d $f8, sample($t0)     ; Load first sample
        l.d $f10, sample($t1)    ; Load second sample

        mul.d $f14, $f8, $f20    ; Multiply first coefficient and first sample

        l.d $f12, sample($t2)    ; Load third sample

        mul.d $f16, $f10, $f22   ; Multiply second coefficient and second sample
        mul.d $f18, $f12, $f24   ; Multiply third coefficient and third sample

        addi $t0, $t0, 8         ; Increment first pointer
        addi $t1, $t1, 8         ; Increment second pointer
        addi $t2, $t2, 8         ; Increment third pointer


        add.d $f4, $f16, $f14    ; Sum first two products
        add.d $f4, $f4, $f18     ; Sum with third product

        s.d $f4, result($t0)     ; Store multiplication-accumulation


        bne $t8, $t1, smoothloop ; Branch if pointer is lower than last sample offset

bail:
    j $ra

Frontend written in C to invoke the assembly:

#include <stdio.h>

extern int N_SAMPLES;
extern int N_COEFFS;
extern double sample[];
extern double result[];
extern double coeff[];
extern void smooth();
extern double norm;

int main(int argc, char **argv) {
    int i;
    smooth();

    printf("N_SAMPLES:%d\n", N_SAMPLES);
    printf("N_COEFFS:%d\n", N_COEFFS);
    printf("--\n");
    for (i = 0; i < N_SAMPLES; i++) {
        printf("sample[%d]: %f\n", i, sample[i]);
    }
    printf("--\n");
    for (i = 0; i < N_COEFFS; i++) {
        printf("coeff[%d]: %f\n", i, coeff[i]);
    }
    printf("--\n");
    for (i = 0; i < N_SAMPLES; i++) {
        printf("result[%d]: %f\n", i, result[i]);
    }
}

To compile it I used following Makefile:

smooth: smooth-frontend.c smooth.s
        mips-openwrt-linux-uclibc-gcc -march=mips64r2 -static -o $@ $?

run:
        qemu-mips smooth

clean:
        rm -fv smooth

And finally to transform this GNU Compiler Collection compatible piece of assembly to something that WinMIPS64 would understand I used following Python snippet 2:

for line in open("smooth.s"):
    line = line.strip()
    if line.startswith("#"):
        continue
    if not line:
        continue
    if line.startswith(".global"):
        continue
    if line.startswith("addi ") or line.startswith("sub ") or line.startswith("sll "):
        line = "d" + line
    line = line.replace("$f", "f")
    if line.startswith("j $ra"):
        print "nop"
        print "halt"
        continue
    print line

The final piece of code fed to WinMIPS64 was this:

.data
N_COEFFS: .word 3
coeff: .double 0.5, 1.0, 0.5
N_SAMPLES: .word 5
sample: .double 1.0, 2.0, 1.0, 2.0, 1.0
result: .double 0.0, 0.0, 0.0, 0.0, 0.0
one: .double 1.0
.text
smooth:
daddi $t1, $zero, 8           ; Reset second offset
daddi $t0, $zero, 0           ; Reset first offset
daddi $t2, $zero, 16          ; Reset third offset
lw $t9, N_SAMPLES($zero)     ; Load sample count
lw $t8, N_COEFFS($zero)      ; Load coeff count
dsub $t8, $t9, $t8            ; Subtract number of coeffs from number of samples
sltu $t8, $zero, $t8         ; If samples-coeffs < 0
beqz $t8, bail               ; If not enough samples bail out
l.d f20, coeff($zero)       ; Load first coefficient
c.lt.d f20, f0
bc1f coeff1ok
sub.d f20, f0, f20
coeff1ok:
l.d f22, coeff($t1)         ; Load second coefficient
c.lt.d f22, f0
bc1f coeff2ok
sub.d f22, f0, f22
coeff2ok:
add.d f2, f20, f22        ; Calculate norm
l.d f24, coeff($t2)         ; Load third coefficient
c.lt.d f24, f0
bc1f coeff3ok
sub.d f24, f0, f24
coeff3ok:
add.d f2, f2, f24         ; Add third coefficient
l.d f26, one($zero)         ; Load 1.0
div.d f2, f26, f2         ; Invert norm
l.d f4, sample($zero)       ; Load first sample
s.d f4, result($zero)       ; Store first sample
mul.d f20, f20, f2        ; Calculate first coefficient
daddi $t4, $zero, 1           ; Store 1
dsub $t8, $t9, $t4            ; Calculate the index of last sample
dsll $t8, $t8, 3              ; Multiply it by 8
l.d f4, sample($t8)         ; Load the last sample
s.d f4, result($t8)         ; Store the last sample
mul.d f22, f22, f2        ; Calculate second coefficient
mul.d f24, f24, f2        ; Calculate third coefficient
smoothloop:
l.d f8, sample($t0)     ; Load first sample
l.d f10, sample($t1)    ; Load second sample
mul.d f14, f8, f20    ; Multiply first coefficient and first sample
l.d f12, sample($t2)    ; Load third sample
mul.d f16, f10, f22   ; Multiply second coefficient and second sample
add.d f4, f16, f14    ; Sum first two products
mul.d f18, f12, f24   ; Multiply third coefficient and third sample
daddi $t0, $t0, 8         ; Increment first pointer
daddi $t1, $t1, 8         ; Increment second pointer
daddi $t2, $t2, 8         ; Increment third pointer
add.d f4, f4, f18     ; Sum with third product
s.d f4, result($t0)     ; Store multiplication-accumulation
bne $t8, $t1, smoothloop ; Branch if pointer is lower than last sample offset
bail:
nop
halt
1

http://www.aes.tu-berlin.de/menue/courses/aca/

2

http://indigo.ie/~mscott/

WinMIPS64 Python computer architecture C TU Berlin compilers assembly