Quite excitingly, it worked the first time. Of course, every single thing I try still needs to have its generated assembly looked over carefully.

For those who are interested,
Code: Select all
// Minimal test routine for the QPU back-end: fills 16 consecutive ints
// starting at address 0x0888c000 with the values -15 .. 0.
// extern "C" keeps the emitted symbol name as plain `entry`.
extern "C" {
void entry(void)
{
// Hard-coded destination address (143179776 decimal).
// NOTE(review): the assembled listing later in this post is also located at
// 0x0888c000, so these stores land on the start stub and the beginning of
// entry itself - presumably acceptable for this one-shot test; confirm.
int *A = (int *)0x0888c000;
// A[count] = count - 15, so A[0] == -15 and A[15] == 0.
for (int count = 0; count < 16; count++)
A[count] = count - 15;
}
}
This goes through Clang to produce LLVM IR (at -O3, hence the loop being fully unrolled):
Code: Select all
; Fully unrolled form of the C loop: 16 constant i32 stores, one per array
; element, followed by the return.
; 143179776 is 0x0888c000 in decimal; each subsequent address is 4 bytes higher.
; The align value on each store matches the largest power of two that divides
; that constant address (16384 for the base, then 4/8/16/32 for the offsets).
define void @entry() #0 {
store i32 -15, i32* inttoptr (i64 143179776 to i32*), align 16384, !tbaa !1
store i32 -14, i32* inttoptr (i64 143179780 to i32*), align 4, !tbaa !1
store i32 -13, i32* inttoptr (i64 143179784 to i32*), align 8, !tbaa !1
store i32 -12, i32* inttoptr (i64 143179788 to i32*), align 4, !tbaa !1
store i32 -11, i32* inttoptr (i64 143179792 to i32*), align 16, !tbaa !1
store i32 -10, i32* inttoptr (i64 143179796 to i32*), align 4, !tbaa !1
store i32 -9, i32* inttoptr (i64 143179800 to i32*), align 8, !tbaa !1
store i32 -8, i32* inttoptr (i64 143179804 to i32*), align 4, !tbaa !1
store i32 -7, i32* inttoptr (i64 143179808 to i32*), align 32, !tbaa !1
store i32 -6, i32* inttoptr (i64 143179812 to i32*), align 4, !tbaa !1
store i32 -5, i32* inttoptr (i64 143179816 to i32*), align 8, !tbaa !1
store i32 -4, i32* inttoptr (i64 143179820 to i32*), align 4, !tbaa !1
store i32 -3, i32* inttoptr (i64 143179824 to i32*), align 16, !tbaa !1
store i32 -2, i32* inttoptr (i64 143179828 to i32*), align 4, !tbaa !1
store i32 -1, i32* inttoptr (i64 143179832 to i32*), align 8, !tbaa !1
store i32 0, i32* inttoptr (i64 143179836 to i32*), align 4, !tbaa !1
ret void
}
That goes through my back-end to produce pseudo QPU assembly:
Code: Select all
// Back-end output for @entry in pseudo QPU assembly.
// Each IR store appears here as three pseudo-instructions:
//   il acc0, <address>   - destination address as an immediate
//   il acc1, <value>     - value to store as an immediate
//   store_word acc1, acc0, 0
// NOTE(review): `il` looks like a load-immediate and the trailing 0 on
// store_word looks like a byte offset - confirm against the assembler's
// pseudo-op definitions.
.file "test.ll"
.text
.globl entry
.align 2
.type entry,@function
.set reorder // @entry
// BB#0:
il acc0, 143179776
il acc1, -15
store_word acc1, acc0, 0
il acc0, 143179780
il acc1, -14
<snip>
store_word acc1, acc0, 0
il acc0, 143179832
il acc1, -1
store_word acc1, acc0, 0
il acc0, 143179836
il acc1, 0
store_word acc1, acc0, 0
// Corresponds to `ret void` in the IR.
// NOTE(review): presumably a branch through the link register; verify.
bla wra_nop, wrb_nop, lr
.set noreorder
$tmp0:
.size entry, ($tmp0)-entry
That goes into my assembler, where the pseudo-instructions are expanded into sequences of real instructions, the whole thing is rescheduled, nops are inserted where needed, and the result is assembled. (Note that I've inserted a tiny start-up stub to set the stack pointer (ra30) and end the program correctly.)
Code: Select all
/*0888c000*/ /*start:*/ /* */
/*0888c000*/ 0x0888c038, 0xf0f009e7, /* bra wra_nop, wrb_nop, 0888c038 ( 143179832 ) */
/*0888c008*/ 0x0888cff0, 0xe0020780, /* il ra30, 0888cff0 ( 143183856 ) */
/*0888c010*/ 0x00000000, 0x10000820, /* nop; */
/*0888c018*/ 0x00000000, 0x10000820, /* nop; */
/*0888c020*/ 0x15000000, 0x30020820, /* or acc0, acc0, acc0; ProgramEnd */
/*0888c028*/ 0x15000000, 0x10020820, /* or acc0, acc0, acc0; */
/*0888c030*/ 0x15000000, 0x10020820, /* or acc0, acc0, acc0; */
/*0888c038*/ /*entry:*/ /* */
/*0888c038*/ 0xfffffff1, 0xe0021840, /* il acc1, fffffff1 ( -15 ) */
/*0888c040*/ 0x0888c000, 0xe0021800, /* il acc0, 0888c000 ( 143179776 ) */
/*0888c048*/ 0x00000a00, 0xe0021c40, /* il vpmvcd_wr_setup, 00000a00 ( 2560 ) */
/*0888c050*/ 0x15000249, 0x10020c20, /* or wra_vpm_dat, acc1, acc1; */
/*0888c058*/ 0xfffffff2, 0xe0021840, /* il acc1, fffffff2 ( -14 ) */
/*0888c060*/ 0x80814000, 0xe0021c40, /* il vpmvcd_wr_setup, 80814000 ( -2139013120 ) */
/*0888c068*/ 0x0c0001c7, 0xd0021ca0, /* add vpm_st_addr, acc0, 00000000 (0); */
/*0888c070*/ 0x0888c004, 0xe0021800, /* il acc0, 0888c004 ( 143179780 ) */
/*0888c078*/ 0x15032fff, 0x100209e0, /* or wra_nop, vpm_st_wait, vpm_st_wait; */
/*0888c080*/ 0x00000a00, 0xe0021c40, /* il vpmvcd_wr_setup, 00000a00 ( 2560 ) */
/*0888c088*/ 0x15000249, 0x10020c20, /* or wra_vpm_dat, acc1, acc1; */
/*0888c090*/ 0xfffffff3, 0xe0021840, /* il acc1, fffffff3 ( -13 ) */
/*0888c098*/ 0x80814000, 0xe0021c40, /* il vpmvcd_wr_setup, 80814000 ( -2139013120 ) */
/*0888c0a0*/ 0x0c0001c7, 0xd0021ca0, /* add vpm_st_addr, acc0, 00000000 (0); */
<snip>
/*0888c390*/ 0x00000a00, 0xe0021c40, /* il vpmvcd_wr_setup, 00000a00 ( 2560 ) */
/*0888c398*/ 0x15000249, 0x10020c20, /* or wra_vpm_dat, acc1, acc1; */
/*0888c3a0*/ 0x00000000, 0xf0f7e9e7, /* bra wra_nop, wrb_nop, ra31, 00000000 ( 0 ) */
/*0888c3a8*/ 0x80814000, 0xe0021c40, /* il vpmvcd_wr_setup, 80814000 ( -2139013120 ) */
/*0888c3b0*/ 0x0c0001c7, 0xd0021ca0, /* add vpm_st_addr, acc0, 00000000 (0); */
/*0888c3b8*/ 0x15032fff, 0x100209e0, /* or wra_nop, vpm_st_wait, vpm_st_wait; */
/*0888c3c0*/ /*$tmp0:*/ /* */
And then it gets run live on the Pi:
Code: Select all
pi@raspberrypi ~/dma $ sudo ./mapper /dev/mem
phys addr 0888c000
mapped in at 0x888c000
status 00000600
status 00000600
status 00000700
status 00000700
0x888c000 fffffff1
0x888c004 fffffff2
0x888c008 fffffff3
0x888c00c fffffff4
0x888c010 fffffff5
0x888c014 fffffff6
0x888c018 fffffff7
0x888c01c fffffff8
0x888c020 fffffff9
0x888c024 fffffffa
0x888c028 fffffffb
0x888c02c fffffffc
0x888c030 fffffffd
0x888c034 fffffffe
0x888c038 ffffffff
0x888c03c 00000000
Things which are working right now:
- 32-bit integer arithmetic and logic (excluding operations which expect carry-in/out)
- integer multiplication is 24-bit
- loading and storing of X words of data
- 16-way conditional control flow (branches)
- function calling and most stack operations
- register, flag and branch delay slot hazards
- out-of-order scheduling within a basic block to avoid inserting nops
- load/store reordering
We're getting there. I'd like to fork this out into a new thread now...