Source and assembly code for these benchmark runs are shown below. The 32 bit compilation uses 12 scalar add and multiply instructions, or 10 when using fused multiply-accumulate instructions of the NEON type, but limited to scalar operation (SISD - Single Instruction, Single Data). All the others use NEON or 64 bit vector SIMD (Single Instruction, Multiple Data) instructions, carrying out four calculations simultaneously at single precision, with 128 operations in the execution loops, or half that number at double precision. Each has its own variation of fused multiply and add or subtract instructions.

In the original single precision benchmarks, the NEON version produced significantly faster performance, where the compiler converted the 32 intrinsic calculating functions into 22 instructions, using those fused operations, for a total in-loop count of 27 instructions. Performance of the first 64 bit version was degraded by its use of only 12 vector registers for a programming function involving 23 variables, necessitating frequent load instructions. The gcc 7 compiler made use of 25 vector registers, with out-of-loop loads, to achieve similar performance to the hand-coded NEON benchmark. Both the 64 bit double precision benchmarks included the more efficient code, with external data loading, but best speed was, as expected, half that of the single precision SIMD calculations.

Code: Select all

```
######################################################################
MP-MFLOPS on Raspberry Pi 3B+
Function triadplus2
for(i=0; i<n; i++)
x[i] = (x[i]+a)*b-(x[i]+c)*d+(x[i]+e)*f-(x[i]+g)*h+(x[i]+j)*k
-(x[i]+l)*m+(x[i]+o)*p-(x[i]+q)*r+(x[i]+s)*t-(x[i]+u)*v+(x[i]+w)*y;
######################################################################
gcc 4.9 32 bit
SP MFLOPS DP MFLOPS
797 1t 3134 4T 798 1T 3119 4T
.L21: .L21:
flds s23, [r3] fldd d17, [r1]
fadds s15, s8, s23 faddd d16, d17, d2
fadds s24, s10, s23 faddd d18, d17, d0
fadds s31, s6, s23 faddd d25, d17, d4
fadds s30, s4, s23 faddd d24, d17, d6
fnmuls s15, s15, s7 fnmuld d16, d3, d16
fadds s29, s3, s23 faddd d23, d17, d15
fadds s28, s1, s23 faddd d22, d17, d13
fadds s27, s0, s23 faddd d21, d17, d11
vfma.f32 s15, s9, s24 faddd d20, d17, d9
fadds s26, s17, s23 faddd d19, d17, d31
fadds s25, s18, s23 vfma.f64 d16, d18, d1
fadds s24, s20, s23 faddd d18, d17, d29
fadds s23, s21, s23 faddd d17, d17, d27
vfma.f32 s15, s5, s31 vfma.f64 d16, d25, d5
vfma.f32 s15, s14, s30 vfms.f64 d16, d24, d7
vfma.f32 s15, s2, s29 vfma.f64 d16, d23, d14
vfma.f32 s15, s13, s28 vfms.f64 d16, d22, d12
vfma.f32 s15, s16, s27 vfma.f64 d16, d21, d10
vfma.f32 s15, s12, s26 vfms.f64 d16, d20, d8
vfma.f32 s15, s19, s25 vfma.f64 d16, d19, d30
vfma.f32 s15, s11, s24 vfms.f64 d16, d18, d26
vfma.f32 s15, s22, s23 vfma.f64 d16, d17, d28
fstmias r3!, {s15} fstmiad r1!, {d16}
cmp r3, r2 cmp r1, r0
bne .L9 bne .L21
######################################################################
gcc 6 64 bit
SP MFLOPS DP MFLOPS
1793 1T to 6981 4T 1405 1T to 4398 4T
.L65: .L84:
ldr q16, [x2, x5] ldr q16, [x2, x0]
add w6, w6, 1 add w3, w3, 1
ldr q15, [sp, 64] cmp w3, w6
cmp w3, w6 fadd v15.2d, v16.2d, v14.2d
ldr q17, [sp, 80] fadd v17.2d, v16.2d, v12.2d
ldr q0, [sp, 112] fmul v15.2d, v15.2d, v13.2d
fadd v15.4s, v16.4s, v15.4s fmls v15.2d, v17.2d, v11.2d
fmul v15.4s, v15.4s, v17.4s fadd v17.2d, v16.2d, v10.2d
ldr q17, [sp, 96] fmla v15.2d, v17.2d, v9.2d
fadd v17.4s, v16.4s, v17.4s fadd v17.2d, v16.2d, v8.2d
fmls v15.4s, v17.4s, v0.4s fmls v15.2d, v17.2d, v31.2d
ldr q0, [sp, 128] fadd v17.2d, v16.2d, v30.2d
fadd v17.4s, v16.4s, v0.4s fmla v15.2d, v17.2d, v29.2d
ldr q0, [sp, 144] fadd v17.2d, v16.2d, v28.2d
fmla v15.4s, v17.4s, v0.4s fmls v15.2d, v17.2d, v0.2d
ldr q0, [sp, 160] fadd v17.2d, v16.2d, v27.2d
fadd v17.4s, v16.4s, v0.4s fmla v15.2d, v17.2d, v26.2d
ldr q0, [sp, 176] fadd v17.2d, v16.2d, v25.2d
fmls v15.4s, v17.4s, v0.4s fmls v15.2d, v17.2d, v24.2d
ldr q0, [sp, 192] fadd v17.2d, v16.2d, v23.2d
fadd v17.4s, v16.4s, v0.4s fmla v15.2d, v17.2d, v22.2d
ldr q0, [sp, 208] fadd v17.2d, v16.2d, v21.2d
fmla v15.4s, v17.4s, v0.4s fadd v16.2d, v16.2d, v19.2d
ldr q0, [sp, 224] fmls v15.2d, v17.2d, v20.2d
fadd v17.4s, v16.4s, v0.4s fmla v15.2d, v16.2d, v18.2d
ldr q0, [sp, 240] str q15, [x2, x0]
fmls v15.4s, v17.4s, v0.4s add x0, x0, 16
ldr q0, [sp, 256] bcc .L84
fadd v17.4s, v16.4s, v0.4s
ldr q0, [sp, 272]
fmla v15.4s, v17.4s, v0.4s
ldr q0, [sp, 288]
fadd v17.4s, v16.4s, v0.4s
fmls v15.4s, v17.4s, v14.4s
fadd v17.4s, v16.4s, v13.4s
fmla v15.4s, v17.4s, v12.4s
fadd v17.4s, v16.4s, v11.4s
fadd v16.4s, v16.4s, v9.4s
fmls v15.4s, v17.4s, v10.4s
fmla v15.4s, v16.4s, v8.4s
str q15, [x2, x5]
add x5, x5, 16
bhi .L65
######################################################################
gcc6 neon
SP MFLOPS C code
2999 1T to 11563 4T for(i=0; i<n; i=i+4)
.L41: {
ldr q1, [x1] x41 = vld1q_f32(ptrx1);
ldr q0, [sp, 64] z41 = vaddq_f32(x41, a41);
fadd v18.4s, v20.4s, v1.4s z41 = vmulq_f32(z41, b41);
fadd v17.4s, v22.4s, v1.4s z42 = vaddq_f32(x41, c41);
fadd v0.4s, v0.4s, v1.4s z42 = vmulq_f32(z42, d41);
fadd v16.4s, v24.4s, v1.4s z41 = vsubq_f32(z41, z42);
fadd v7.4s, v26.4s, v1.4s z42 = vaddq_f32(x41, e41);
fadd v6.4s, v28.4s, v1.4s z42 = vmulq_f32(z42, f41);
fadd v5.4s, v30.4s, v1.4s z41 = vaddq_f32(z41, z42);
fmul v0.4s, v0.4s, v19.4s z42 = vaddq_f32(x41, g41);
fadd v4.4s, v10.4s, v1.4s z42 = vmulq_f32(z42, h41);
fadd v3.4s, v12.4s, v1.4s z41 = vsubq_f32(z41, z42);
fadd v2.4s, v14.4s, v1.4s z42 = vaddq_f32(x41, j41);
fadd v1.4s, v8.4s, v1.4s z42 = vmulq_f32(z42, k41);
fmls v0.4s, v21.4s, v18.4s z41 = vaddq_f32(z41, z42);
fmla v0.4s, v23.4s, v17.4s z42 = vaddq_f32(x41, l41);
fmls v0.4s, v25.4s, v16.4s z42 = vmulq_f32(z42, m41);
fmla v0.4s, v27.4s, v7.4s z41 = vsubq_f32(z41, z42);
fmls v0.4s, v29.4s, v6.4s z42 = vaddq_f32(x41, o41);
fmla v0.4s, v31.4s, v5.4s z42 = vmulq_f32(z42, p41);
fmls v0.4s, v9.4s, v1.4s z41 = vaddq_f32(z41, z42);
fmla v0.4s, v4.4s, v11.4s z42 = vaddq_f32(x41, q41);
fmls v0.4s, v3.4s, v13.4s z42 = vmulq_f32(z42, r41);
fmla v0.4s, v2.4s, v15.4s z41 = vsubq_f32(z41, z42);
str q0, [x1], 16 z42 = vaddq_f32(x41, s41);
cmp x1, x0 z42 = vmulq_f32(z42, t41);
bne .L41 z41 = vaddq_f32(z41, z42);
z42 = vaddq_f32(x41, u41);
z42 = vmulq_f32(z42, v41);
z41 = vsubq_f32(z41, z42);
z42 = vaddq_f32(x41, w41);
z42 = vmulq_f32(z42, y41);
z41 = vaddq_f32(z41, z42);
vst1q_f32(ptrx1, z41);
ptrx1 = ptrx1 + 4;
}
######################################################################
gcc 7
SP MFLOPS DP MFLOPS
2800 1T to 10608 4T 1403 1T 4492 4T
.L51: .L44:
ldr q15, [x2, x3] ldr q15, [x3, x2]
add w4, w4, 1 add w4, w4, 1
cmp w4, w6 cmp w4, w5
fadd v0.4s, v15.4s, v14.4s fadd v7.2d, v15.2d, v14.2d
fadd v17.4s, v15.4s, v12.4s fadd v16.2d, v15.2d, v12.2d
fmul v0.4s, v0.4s, v13.4s fmul v7.2d, v7.2d, v13.2d
fmls v0.4s, v17.4s, v11.4s fmls v7.2d, v16.2d, v11.2d
fadd v17.4s, v15.4s, v10.4s fadd v16.2d, v15.2d, v10.2d
fmla v0.4s, v17.4s, v9.4s fmla v7.2d, v16.2d, v9.2d
fadd v17.4s, v15.4s, v8.4s fadd v16.2d, v15.2d, v8.2d
fmls v0.4s, v17.4s, v31.4s fmls v7.2d, v16.2d, v31.2d
fadd v17.4s, v15.4s, v30.4s fadd v16.2d, v15.2d, v30.2d
fmla v0.4s, v17.4s, v29.4s fmla v7.2d, v16.2d, v29.2d
fadd v17.4s, v15.4s, v16.4s fadd v16.2d, v15.2d, v28.2d
fmls v0.4s, v17.4s, v28.4s fmls v7.2d, v16.2d, v27.2d
fadd v17.4s, v15.4s, v27.4s fadd v16.2d, v15.2d, v26.2d
fmla v0.4s, v17.4s, v26.4s fmla v7.2d, v16.2d, v25.2d
fadd v17.4s, v15.4s, v25.4s fadd v16.2d, v15.2d, v24.2d
fmls v0.4s, v17.4s, v24.4s fmls v7.2d, v16.2d, v23.2d
fadd v17.4s, v15.4s, v23.4s fadd v16.2d, v15.2d, v22.2d
fmla v0.4s, v17.4s, v22.4s fmla v7.2d, v16.2d, v21.2d
fadd v17.4s, v15.4s, v21.4s fadd v16.2d, v15.2d, v20.2d
fadd v15.4s, v15.4s, v19.4s fadd v15.2d, v15.2d, v18.2d
fmls v0.4s, v17.4s, v20.4s fmls v7.2d, v16.2d, v19.2d
fmla v0.4s, v15.4s, v18.4s fmla v7.2d, v15.2d, v17.2d
str q0, [x2, x3] str q7, [x3, x2]
add x3, x3, 16 add x2, x2, 16
bcc .L51 bcc .L44
```