Code: Select all

```
; ---------------------------------------------------------------------------
; pwm_sdm_mod_ord2
;
; Stereo modulator loop: reads interleaved 16-bit samples (L, R, L, R, ...)
; from `source`, combines each with two words of per-channel state (z1, z2),
; and stores one 32-bit word per channel per sample pair to `dest`.
;
; Arguments (per the signature comment on the .function line):
;   r0 = source, r1 = dest, r2 = ctx
;
; Context words read on entry (byte offsets from ctx, names from the
; original register comments):
;   +0  z1_L      +4  z1_R      +8  z2_L      +12 z2_R
;   +16 pwm_bits  +20 pwm_offset               +24 nsamples
; Context words written back at `done`: +0 .. +12 (z1_L, z1_R, z2_L, z2_R).
;
; Register roles inside the loop:
;   r3 / r0   current L / R sample (working value)
;   r13 / r14 preloaded next L / R sample
;   r2 / r1   z1_L / z1_R        r5 / r4  z2_L / z2_R
;   r6  out_shift   r11 quant_mask   r12 pwm_offset
;   r7  dest pointer   r9 source pointer   r10 samples remaining
;
; Left- and right-channel instructions are interleaved pairwise throughout.
;
; NOTE(review): instruction semantics below (adds/subs, bic, rsub,
; addcmpb*) are described as they appear to be used here - confirm against
; the ISA reference before relying on them.
; ---------------------------------------------------------------------------
.function pwm_sdm_mod_ord2 ; (int16_t *source, uint32_t *dest, SDM_CONTEXT_T *ctx)
.align 32 ; ICache line width
; Save r6-r14 and lr; they are restored (lr into pc) by the ldm at the end.
stm r6-r14,lr,(--sp)
mov r9,r0 ; r9 = in
mov r8,r2 ; r8 = ctx
mov r7,r1 ; r7 = dest
ld r2,(r8) ; r2 = z1_L
ld r1,(r8+4) ; r1 = z1_R
ld r5,(r8+8) ; r5 = z2_L
ld r4,(r8+12) ; r4 = z2_R
ld r6,(r8+16) ; r6 = pwm_bits
ld r12,(r8+20) ; r12 = pwm_offset
ld r10,(r8+24) ; r10 = nsamples
; r0 is free now that source has been copied to r9; use it for the constant 1.
mov r0,1
rsub r6,32 ; r6 = out_shift (presumably 32 - pwm_bits)
; r11 = 1 << out_shift
shl r11,r0,r6
sub r11,1 ; r11 = quant_mask = (1 << out_shift) - 1
; Nothing to do when nsamples <= 0 (presumably: r10 += 0, branch if r10 <= 0).
; The state loaded above is then stored back unchanged at `done`.
addcmpble r10,0,0,done
; Prime the pipeline with the first sample pair.
ldsh r3,(r9++) ; r3 = (L)
ldsh r0,(r9++) ; r0 = (R)
; Main loop entry: fetch the next sample pair while the current one is processed.
; NOTE(review): with nsamples == 1 this preload still executes once, reading
; one sample pair past the requested input - confirm that is acceptable.
loop:
ldsh r13,(r9++) ; r13 = (L+1)
ldsh r14,(r9++) ; r14 = (R+1)
; Entry point for the final sample pair: it is already in r3/r0, so no preload.
loop_nopreload:
shl r3,16 ; L <<= 16
shl r0,16 ; R <<= 16
adds r3,r3,r2 ; L += z1_L
adds r0,r0,r1 ; R += z1_R
adds r3,r3,r2 ; L += z1_L (second time: 2*z1_L added in total)
adds r0,r0,r1 ; R += z1_R (second time: 2*z1_R added in total)
; NOTE(review): the operands of the subs pairs below are (dst=r3, r5, r3) and
; (dst=r0, r4, r0); the original comments describe them as "L -= ..." -
; confirm the subtraction direction for this operand order.
subs r3,r5,r3 ; L -= z2_L (per original comment)
subs r0,r4,r0 ; R -= z2_R (per original comment)
bic r5,r3,r11 ; r5 = L with the quant_mask bits cleared (presumably L & ~quant_mask)
bic r4,r0,r11 ; r4 = R with the quant_mask bits cleared (presumably R & ~quant_mask)
subs r3,r5,r3 ; L -= masked L held in r5 (original comment said z2_R; r5 is the left channel)
subs r0,r4,r0 ; R -= masked R held in r4
asr r5,r6 ; masked L >>= out_shift (r6)
asr r4,r6 ; masked R >>= out_shift (r6)
add r5,r12 ; output L += pwm_offset
add r4,r12 ; output R += pwm_offset
st r5,(r7++) ; output L -> (out)
mov r5,r2 ; z2_L = z1_L
st r4,(r7++) ; output R -> (out)
mov r4,r1 ; z2_R = z1_R
mov r2,r3 ; z1_L = L
mov r1,r0 ; z1_R = R
mov r3,r13 ; L = L+1 (preloaded next sample)
mov r0,r14 ; R = R+1 (preloaded next sample)
; Presumably: r10 += -1, branch if r10 > 1 - at least two pairs remain, so preload again.
addcmpbgt r10,-1,1,loop
; Presumably: r10 += 0, branch if r10 > 0 - exactly one pair remains and it is already loaded.
addcmpbgt r10,0,0,loop_nopreload
; Write the four state words back to the context, then restore registers and return.
done:
st r2,(r8)
st r1,(r8+4)
st r5,(r8+8)
st r4,(r8+12)
ldm r6-r14,pc,(sp++)
.endfn
```

As far as I can tell from the debug tool output, this routine has no ALU stalls except for 1 cycle during predicted branches inside the inner loop, and it has register load stalls only when SDRAM is busy.

I think I like this architecture. The last time I did DSP was with a 1GHz TMS320C6210 in university.