Metrics for the execution speeds of each of these programs were normalized so that each reports a score of 1.0 when run on a Raspberry Pi 4B in 64-bit mode. This was done using gcc 10.3 and go 1.16.3.

After this, the same two programs were compiled and run on a number of non-Raspberry Pi computers to see how much the relative speed between C and Go might change depending on architecture. The results of such a comparison can hopefully then be used to infer the maturity of Go on the Pi compared to other platforms.

Here are some preliminary results:

Code: Select all

```
$ grep "model name" /proc/cpuinfo | head -n1
model name : Intel(R) Pentium(R) 4 CPU 3.40GHz
$ ./realfft
realfft.go -- Perform real to complex Fourier transform
Version=5; N=4194304
run norm(xr-xrs) real sec norm(x-xs) complex sec
1 7.34091862e-13 4.42385340e+00 7.37445718e-13 8.72595406e+00
2 7.34091862e-13 4.35549045e+00 7.37445718e-13 8.59305906e+00
3 7.34091862e-13 4.35643291e+00 7.37445718e-13 8.59572983e+00
Best real=4.3555e+00 sec; Mtflops=5.2965e+01
Best complex=8.5931e+00 sec; Mtflops=5.3691e+01
Single-core speed is 0.9331 times a Pi 4B
$ ./rfft
rfft.c -- Perform real to complex Fourier transform
Version=5; N=4194304
run norm(xr-xrs) real sec norm(x-xs) complex sec
1 7.09627869e-13 4.36951400e+00 7.13269560e-13 8.79502800e+00
2 7.09627869e-13 4.28870000e+00 7.13269560e-13 8.65559600e+00
3 7.09627869e-13 4.29111500e+00 7.13269560e-13 8.65074200e+00
Best real=4.2887e+00 sec; Mtflops=5.3789e+01
Best complex=8.6507e+00 sec; Mtflops=5.3333e+01
Single-core speed is 0.8139 times a Pi 4B
```

Code: Select all

```
$ grep "model name" /proc/cpuinfo | head -n1
model name : AMD EPYC 7702 64-Core Processor
$ ./realfft
realfft.go -- Perform real to complex Fourier transform
Version=5; N=4194304
run norm(xr-xrs) real sec norm(x-xs) complex sec
1 7.34091862e-13 1.33623481e+00 7.37445718e-13 2.65256858e+00
2 7.34091862e-13 1.32155347e+00 7.37445718e-13 2.62144589e+00
3 7.34091862e-13 1.32319140e+00 7.37445718e-13 2.62295699e+00
Best real=1.3216e+00 sec; Mtflops=1.7456e+02
Best complex=2.6214e+00 sec; Mtflops=1.7600e+02
Single-core speed is 3.067 times a Pi 4B
$ ./rfft
rfft.c -- Perform real to complex Fourier transform
Version=5; N=4194304
run norm(xr-xrs) real sec norm(x-xs) complex sec
1 6.86626828e-13 1.02936000e+00 6.89441082e-13 2.06730100e+00
2 6.86626828e-13 1.01212500e+00 6.89441082e-13 2.02839800e+00
3 6.86626828e-13 1.01509700e+00 6.89441082e-13 2.03150100e+00
Best real=1.0121e+00 sec; Mtflops=2.2792e+02
Best complex=2.0284e+00 sec; Mtflops=2.2746e+02
Single-core speed is 3.46 times a Pi 4B
```

For reference, the source code for the two programs is as follows:

Code: Select all

```
/* realfft.go -- Perform real to complex Fourier transform
Written May 10, 2021 by Eric Olson */
package main
import ("fmt"; "os"; "time"; "math/cmplx";
"math"; "reflect"; "unsafe")
// ticStart is the instant captured by the most recent call to tic. Unlike a
// value rebuilt from Unix()/Nanosecond(), a time.Time returned by time.Now
// keeps its monotonic clock reading, so intervals measured from it are not
// disturbed when the wall clock is stepped (for example by NTP) during a run.
var ticStart time.Time

// tictime is the wall-clock time of the most recent tic in seconds since the
// Unix epoch. toc no longer reads it; it is still updated by tic so that any
// other code referring to it keeps working.
var tictime float64

// tic starts the stopwatch.
func tic() {
	ticStart = time.Now()
	tictime = float64(ticStart.Unix()) + 1.0e-9*float64(ticStart.Nanosecond())
}

// toc returns the number of seconds elapsed since the most recent call to
// tic, measured with the monotonic clock.
func toc() float64 {
	return time.Since(ticStart).Seconds()
}
// rstate is the state of the pseudo-random generator used to fill the test
// vectors: x is the value that is squared on every step, w is the running
// Weyl sequence and s is the increment added to w on every step.
type rstate struct {
	x, w, s uint64
}

// gs is the single generator state shared by rint32 and rseed.
var gs = rstate{s: 0xb5ad4eceda1ce2a9}

// rint32 advances gs by one step and returns the low 32 bits of the new x.
// A step squares x, adds the updated Weyl value w, and then swaps the upper
// and lower 32-bit halves of the result.
func rint32() uint32 {
	gs.w += gs.s
	sq := gs.x*gs.x + gs.w
	gs.x = sq<<32 | sq>>32
	return uint32(gs.x)
}

// rseed replaces the generator state. The lowest bit of s is forced to 1 so
// that the Weyl increment is always odd.
func rseed(x, w, s uint64) {
	gs = rstate{x: x, w: w, s: s | 1}
}
// cfft computes the forward discrete Fourier transform of n complex samples
// by recursive radix-2 splitting.
//
// The samples are read from x[0], x[s], x[2*s], ... and the n results are
// written contiguously to xhat[0:n]. n must be a power of two; if an odd
// length other than 1 is reached the program prints an error and exits with
// status 1.
func cfft(xhat, x []complex128, s, n int) {
	switch {
	case n == 1:
		xhat[0] = x[0]
		return
	case n%2 != 0:
		fmt.Printf("Error: cfft called with non-power-of-two argument!\n")
		os.Exit(1)
	}
	half := n / 2
	// Transform the even-indexed samples into the lower half of xhat and the
	// odd-indexed samples into the upper half.
	cfft(xhat, x, 2*s, half)
	cfft(xhat[half:], x[s:], 2*s, half)
	// Combine the two half-length transforms with the twiddle factors.
	for k := 0; k < half; k++ {
		theta := -2 * math.Pi * float64(k) / float64(n)
		sin, cos := math.Sincos(theta)
		even := xhat[k]
		odd := complex(cos, sin) * xhat[k+half]
		xhat[k] = even + odd
		xhat[k+half] = even - odd
	}
}
// cfift computes the inverse discrete Fourier transform of n complex
// coefficients by recursive radix-2 splitting.
//
// The coefficients are read from xhat[0], xhat[s], xhat[2*s], ... and the n
// results are written contiguously to x[0:n]. The result is not normalized:
// the caller divides by n to recover the original samples. n must be a power
// of two; if an odd length other than 1 is reached the program prints an
// error and exits with status 1.
func cfift(x, xhat []complex128, s, n int) {
	if n == 1 {
		x[0] = xhat[0]
		return
	}
	if n%2 != 0 {
		// The message now names this function; it previously said "cfft".
		fmt.Printf("Error: cfift called with non-power-of-two argument!\n")
		os.Exit(1)
	}
	n2 := n / 2
	cfift(x, xhat, 2*s, n2)
	cfift(x[n2:], xhat[s:], 2*s, n2)
	for l := 0; l < n2; l++ {
		// Same butterfly as the forward transform with the angle negated.
		theta := 2 * math.Pi * float64(l) / float64(n)
		ts, tc := math.Sincos(theta)
		t1 := x[l]
		t2 := complex(tc, ts) * x[l+n2]
		x[l] = t1 + t2
		x[l+n2] = t1 - t2
	}
}
// rfft computes the forward Fourier transform of n real samples.
//
// The float64 storage of x is reinterpreted as len(x)/2 complex128 values
// (adjacent pairs of floats), a complex transform of length n/2 is taken with
// cfft, and the result is then unpacked in place into the coefficients
// xhat[0] .. xhat[s*n/2], so xhat must hold at least s*n/2+1 entries. The
// only call in this program uses s == 1. If n is odd the program prints an
// error and exits with status 1.
func rfft(xhat []complex128, x []float64, s, n int) {
	if n%2 != 0 {
		// The message now names this function; it previously said "rfift".
		fmt.Printf("Error: rfft called with non-power-of-two argument!\n")
		os.Exit(1)
	}
	n2 := n / 2
	// Build a []complex128 header that aliases the memory of x, so that no
	// copy of the input is made.
	var xc []complex128
	xsh := (*reflect.SliceHeader)(unsafe.Pointer(&x))
	xcsh := (*reflect.SliceHeader)(unsafe.Pointer(&xc))
	xcsh.Data = xsh.Data
	xcsh.Len = xsh.Len / 2
	xcsh.Cap = xsh.Len / 2
	cfft(xhat, xc, s, n2)
	n4 := n2 / 2
	// The first and last coefficients are purely real and come from the sum
	// and difference of the two parts of the packed zero-frequency term.
	t1 := xhat[0]
	xhat[0] = complex(real(t1)+imag(t1), 0)
	xhat[s*n2] = complex(real(t1)-imag(t1), 0)
	// Remaining coefficients are recovered in mirrored pairs (l, n2-l).
	for l := 1; l <= n4; l++ {
		theta := -2 * math.Pi * float64(l) / float64(n)
		ts, tc := math.Sincos(theta)
		ie := complex(-ts, tc)
		q1 := xhat[s*l]
		q2 := cmplx.Conj(xhat[s*(n2-l)])
		t1 := q1 + q2
		t2 := q1 - q2
		xhat[s*l] = (t1 - ie*t2) / 2
		xhat[s*(n2-l)] = cmplx.Conj(t1+ie*t2) / 2
	}
}
// rfift computes the inverse of rfft: it turns the coefficients
// xhat[0] .. xhat[s*n/2] back into n real samples stored in x.
//
// The coefficients are first repacked in place (xhat is overwritten), then a
// complex inverse transform of length n/2 is taken with cfift, writing through
// a complex128 view of the float64 storage of x. The caller divides the
// result by n to recover the original samples. If n is odd the program prints
// an error and exits with status 1.
func rfift(x []float64, xhat []complex128, s, n int) {
	if n%2 != 0 {
		fmt.Printf("Error: rfift called with non-power-of-two argument!\n")
		os.Exit(1)
	}
	half := n / 2

	// Build a []complex128 header that aliases the memory of x, so that the
	// inverse transform writes straight into the output buffer.
	var packed []complex128
	src := (*reflect.SliceHeader)(unsafe.Pointer(&x))
	dst := (*reflect.SliceHeader)(unsafe.Pointer(&packed))
	dst.Data = src.Data
	dst.Len = src.Len / 2
	dst.Cap = src.Len / 2

	// Repack the coefficients in mirrored pairs (k, half-k).
	quarter := half / 2
	for k := 0; k <= quarter; k++ {
		theta := 2 * math.Pi * float64(k) / float64(n)
		sin, cos := math.Sincos(theta)
		ie := complex(-sin, cos)
		lo := s * k
		hi := s * (half - k)
		q1 := xhat[lo]
		q2 := cmplx.Conj(xhat[hi])
		sum := q1 + q2
		diff := q1 - q2
		xhat[lo] = sum + ie*diff
		xhat[hi] = cmplx.Conj(sum - ie*diff)
	}
	cfift(packed, xhat, s, half)
}
// N is the number of samples in every transform of the benchmark.
const N = 4194304

var (
	// Real input vector and the result of its forward+inverse round trip.
	xr, xrs [N]float64

	// Complex input vector, its round-trip result and its transform.
	x, xs, xhat [N]complex128

	// Coefficients of the real transform.
	xrhat [N/2 + 1]complex128

	// Best (smallest) real and complex timings seen so far; 0 means unset.
	trmin, tcmin float64

	// Round-trip error norms of the first run; 0 means unset.
	rnorm, cnorm float64
)
// dotest performs one benchmark run and prints the remaining columns of one
// table row: the real round-trip error norm, the real timing, the complex
// round-trip error norm and the complex timing.
//
// The inputs are regenerated from a fixed seed on every call, so every run
// must yield exactly the same error norms; if a norm differs from the one
// recorded on the first run the program reports a floating point error and
// exits with status 1. The smallest timings are kept in trmin and tcmin.
func dotest() {
	rseed(0, 0, 0xb5ad4eceda1ce2a9)
	for i := range xr {
		// Map the 32-bit random integer to a value in [-1, 1).
		xr[i] = 2*float64(rint32())/(1<<32) - 1
		x[i] = complex(xr[i], 0)
	}

	// Real transform: forward followed by inverse.
	tic()
	rfft(xrhat[:], xr[:], 1, N)
	rfift(xrs[:], xrhat[:], 1, N)
	tr := toc()
	if trmin == 0 || tr < trmin {
		trmin = tr
	}
	for i := range xrs {
		xrs[i] /= N
	}
	var r float64
	for i := range xr {
		dx := xr[i] - xrs[i]
		r += dx * dx
	}
	r = math.Sqrt(r)
	fmt.Printf(" %15.8e", r)
	switch {
	case rnorm == 0:
		rnorm = r
	case rnorm != r:
		fmt.Printf("\nReal floating point error detected!\n")
		os.Exit(1)
	}
	fmt.Printf(" %15.8e", tr)

	// Complex transform: forward followed by inverse.
	tic()
	cfft(xhat[:], x[:], 1, N)
	cfift(xs[:], xhat[:], 1, N)
	tc := toc()
	if tcmin == 0 || tc < tcmin {
		tcmin = tc
	}
	for i := range xs {
		xs[i] /= N
	}
	r = 0
	for i := range x {
		dx := x[i] - xs[i]
		r += real(dx * cmplx.Conj(dx))
	}
	r = math.Sqrt(r)
	fmt.Printf(" %15.8e", r)
	switch {
	case cnorm == 0:
		cnorm = r
	case cnorm != r:
		fmt.Printf("\nComplex floating point error detected!\n")
		os.Exit(1)
	}
	fmt.Printf(" %15.8e\n", tc)
}
// main prints the table header, performs three benchmark runs and then
// reports the best timings, the corresponding Mflop rates and a single-core
// score. The score is the geometric mean of the two rates divided by 57.15,
// the constant used to normalize the result against a Pi 4B.
func main() {
	fmt.Printf("realfft.go -- Perform real to complex Fourier transform\n")
	fmt.Printf("Version=%d; N=%d\n\n", 6, N)
	fmt.Printf("%6s %15s %15s %15s %15s\n",
		"run", "norm(xr-xrs)", "real sec", "norm(x-xs)", "complex sec")
	for run := 1; run <= 3; run++ {
		fmt.Printf("%6d", run)
		dotest()
	}

	// Operation count used for one forward+inverse complex transform pair;
	// the real pair is credited with half of it.
	ops := 2*N*math.Log2(N) + 3*N*math.Log2(N)
	rflops := ops / trmin / 2e6
	cflops := ops / tcmin / 1e6

	fmt.Printf("\nBest real=%.4e sec; Mtflops=%.4e\n",
		trmin, rflops)
	fmt.Printf("Best complex=%.4e sec; Mtflops=%.4e\n",
		tcmin, cflops)
	fmt.Printf("Single-core speed is %.4g times a Pi 4B\n",
		math.Sqrt(rflops*cflops)/57.15)
	os.Exit(0)
}
```

Code: Select all

```
/* rfft.c -- Perform real to complex Fourier transform
Written May 10, 2021 by Eric Olson */
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <complex.h>
#include <math.h>
/* Time of day captured by the most recent call to tic(). */
static struct timeval tic_start;

/* tic -- start the stopwatch by recording the current time of day. */
void tic() {
    gettimeofday(&tic_start, NULL);
}

/* toc -- return the number of seconds elapsed since the last tic(). */
double toc() {
    struct timeval now;
    gettimeofday(&now, NULL);
    double whole = now.tv_sec - tic_start.tv_sec;
    double frac = (now.tv_usec - tic_start.tv_usec) * 1.0e-6;
    return whole + frac;
}
/* State of the pseudo-random generator used to fill the test vectors:
   x is the value squared on every step, w is the running Weyl sequence
   and s is the increment added to w on every step. */
typedef struct {
    uint64_t x, w, s;
} rstate;

/* The single generator state shared by rint32() and rseed(). */
rstate gs = { .x = 0, .w = 0, .s = 0xb5ad4eceda1ce2a9 };

/* rint32 -- advance gs by one step and return the low 32 bits of the new x.
   A step squares x, adds the updated Weyl value w, and then swaps the
   upper and lower 32-bit halves of the result. */
uint32_t rint32() {
    gs.w += gs.s;
    uint64_t sq = gs.x * gs.x + gs.w;
    gs.x = (sq << 32) | (sq >> 32);
    return (uint32_t)gs.x;
}

/* rseed -- replace the generator state.  The lowest bit of s is forced
   to 1 so that the Weyl increment is always odd. */
void rseed(uint64_t x, uint64_t w, uint64_t s) {
    gs.x = x;
    gs.w = w;
    gs.s = s | 1;
}
/* Complex -- double precision complex number used by all the transforms. */
typedef double complex Complex;
/* Real -- double precision real number used for samples, angles and timings. */
typedef double Real;
/* cfft -- forward discrete Fourier transform of n complex samples by
   recursive radix-2 splitting.

   The samples are read from x[0], x[s], x[2*s], ... and the n results are
   written contiguously to xhat[0..n-1].  n must be a power of two; if an
   odd length other than 1 is reached the program prints an error and
   exits with status 1. */
void cfft(Complex *xhat, Complex *x, int s, int n) {
    if (n == 1) {
        xhat[0] = x[0];
        return;
    }
    if (n % 2) {
        printf("Error: cfft called with non-power-of-two argument!\n");
        exit(1);
    }
    int half = n / 2;
    /* Even-indexed samples go to the lower half of xhat, odd-indexed
       samples to the upper half. */
    cfft(xhat, x, 2 * s, half);
    cfft(xhat + half, x + s, 2 * s, half);
    /* Combine the two half-length transforms with the twiddle factors. */
    for (int k = 0; k < half; k++) {
        Real theta = -2*M_PI*k/n;
        Real sn = sin(theta), cs = cos(theta);
        Complex even = xhat[k];
        Complex odd = (cs+1i*sn)*xhat[k + half];
        xhat[k] = even + odd;
        xhat[k + half] = even - odd;
    }
}
/* cfift -- inverse discrete Fourier transform of n complex coefficients by
   recursive radix-2 splitting.

   The coefficients are read from xhat[0], xhat[s], xhat[2*s], ... and the
   n results are written contiguously to x[0..n-1].  The result is not
   normalized: the caller divides by n to recover the original samples.
   n must be a power of two; if an odd length other than 1 is reached the
   program prints an error and exits with status 1. */
void cfift(Complex *x,Complex *xhat,int s,int n){
    if(n==1){
        x[0]=xhat[0];
        return;
    }
    if(n%2){
        /* The message now names this function; it previously said "cfft". */
        printf("Error: cfift called with non-power-of-two argument!\n");
        exit(1);
    }
    int n2=n/2;
    cfift(x,xhat,2*s,n2);
    cfift(x+n2,xhat+s,2*s,n2);
    for(int l=0;l<n2;l++){
        /* Same butterfly as the forward transform with the angle negated. */
        Real theta=2*M_PI*l/n;
        Real ts=sin(theta),tc=cos(theta);
        Complex t1=x[l];
        Complex t2=(tc+1i*ts)*x[l+n2];
        x[l]=t1+t2; x[l+n2]=t1-t2;
    }
}
/* rfft -- forward Fourier transform of n real samples.

   The storage of x is reinterpreted as n/2 complex values (adjacent pairs
   of reals), a complex transform of length n/2 is taken with cfft(), and
   the result is then unpacked in place into the coefficients
   xhat[0] .. xhat[s*n/2], so xhat must hold at least s*n/2+1 entries.
   The only call in this program uses s == 1.  If n is odd the program
   prints an error and exits with status 1. */
void rfft(Complex *xhat,Real *x,int s,int n){
    if(n%2){
        /* The message now names this function; it previously said "rfift". */
        printf("Error: rfft called with non-power-of-two argument!\n");
        exit(1);
    }
    int n2=n/2;
    cfft(xhat,(Complex *)x,s,n2);
    int n4=n2/2;
    /* The first and last coefficients are purely real and come from the
       sum and difference of the two parts of the packed zero-frequency
       term. */
    Complex t1=xhat[0];
    xhat[0]=creal(t1)+cimag(t1);
    xhat[s*n2]=creal(t1)-cimag(t1);
    /* Remaining coefficients are recovered in mirrored pairs (l, n2-l). */
    for(int l=1;l<=n4;l++){
        Real theta=-2*M_PI*l/n;
        Real ts=sin(theta),tc=cos(theta);
        Complex ie=-ts+1i*tc;
        Complex q1=xhat[s*l], q2=conj(xhat[s*(n2-l)]);
        Complex t1=q1+q2, t2=q1-q2;
        xhat[s*l]=(t1-ie*t2)/2;
        xhat[s*(n2-l)]=conj(t1+ie*t2)/2;
    }
}
/* rfift -- inverse of rfft(): turn the coefficients xhat[0] .. xhat[s*n/2]
   back into n real samples stored in x.

   The coefficients are first repacked in place (xhat is overwritten), then
   a complex inverse transform of length n/2 is taken with cfift(), writing
   through a complex view of the storage of x.  The caller divides the
   result by n to recover the original samples.  If n is odd the program
   prints an error and exits with status 1. */
void rfift(Real *x, Complex *xhat, int s, int n) {
    if (n % 2) {
        printf("Error: rfift called with non-power-of-two argument!\n");
        exit(1);
    }
    int half = n / 2;
    int quarter = half / 2;
    /* Repack the coefficients in mirrored pairs (k, half-k). */
    for (int k = 0; k <= quarter; k++) {
        Real theta = 2*M_PI*k/n;
        Real sn = sin(theta), cs = cos(theta);
        Complex ie = -sn+1i*cs;
        int lo = s * k;
        int hi = s * (half - k);
        Complex q1 = xhat[lo];
        Complex q2 = conj(xhat[hi]);
        Complex sum = q1 + q2;
        Complex diff = q1 - q2;
        xhat[lo] = sum+ie*diff;
        xhat[hi] = conj(sum-ie*diff);
    }
    cfift((Complex *)x, xhat, s, half);
}
/* Number of samples in every transform of the benchmark. */
#define N 4194304

/* Real input vector and the result of its forward+inverse round trip. */
Real xr[N];
Real xrs[N];

/* Complex input vector, its round-trip result and its transform. */
Complex x[N];
Complex xs[N];
Complex xhat[N];

/* Coefficients of the real transform. */
Complex xrhat[N/2+1];

/* Best (smallest) real and complex timings seen so far; 0 means unset. */
Real trmin = 0;
Real tcmin = 0;

/* Round-trip error norms of the first run; 0 means unset. */
Real rnorm = 0;
Real cnorm = 0;
/* dotest -- perform one benchmark run and print the remaining columns of
   one table row: the real round-trip error norm, the real timing, the
   complex round-trip error norm and the complex timing.

   The inputs are regenerated from a fixed seed on every call, so every run
   must yield exactly the same error norms; if a norm differs from the one
   recorded on the first run the program reports a floating point error and
   exits with status 1.  The smallest timings are kept in trmin and tcmin. */
void dotest(){
    rseed(0,0,0xb5ad4eceda1ce2a9);
    for(int l=0;l<N;l++){
        /* Map the 32-bit random integer to a value in [-1, 1). */
        xr[l]=2.0*rint32()/((uint64_t)1<<32)-1;
        x[l]=xr[l];
    }
    /* Real transform: forward followed by inverse. */
    tic();
    rfft(xrhat,xr,1,N);
    rfift(xrs,xrhat,1,N);
    double tr=toc();
    if(trmin==0||tr<trmin) trmin=tr;
    for(int l=0;l<N;l++) xrs[l]/=N;
    Real r=0;
    for(int l=0;l<N;l++){
        Real dx=xr[l]-xrs[l];
        r+=dx*dx;
    }
    r=sqrt(r);
    printf(" %15.8e",r);
    if(rnorm==0) rnorm=r;
    else if(rnorm!=r){
        /* Start on a fresh line: the current table row is still open.
           This matches the output of the Go version of the program. */
        printf("\nReal floating point error detected!\n");
        exit(1);
    }
    printf(" %15.8e",tr); fflush(stdout);
    /* Complex transform: forward followed by inverse. */
    tic();
    cfft(xhat,x,1,N);
    cfift(xs,xhat,1,N);
    double tc=toc();
    if(tcmin==0||tc<tcmin) tcmin=tc;
    for(int l=0;l<N;l++) xs[l]/=N;
    r=0;
    for(int l=0;l<N;l++){
        Complex dx=x[l]-xs[l];
        /* Only the real part of dx*conj(dx) is accumulated; take it
           explicitly instead of relying on the implicit conversion. */
        r+=creal(dx*conj(dx));
    }
    r=sqrt(r);
    printf(" %15.8e",r);
    if(cnorm==0) cnorm=r;
    else if(cnorm!=r){
        printf("\nComplex floating point error detected!\n");
        exit(1);
    }
    printf(" %15.8e\n",tc); fflush(stdout);
}
/* main -- print the table header, perform three benchmark runs and then
   report the best timings, the corresponding Mflop rates and a single-core
   score.  The score is the geometric mean of the two rates divided by
   65.81, the constant used to normalize the result against a Pi 4B. */
int main() {
    printf("rfft.c -- Perform real to complex Fourier transform\n");
    printf("Version=%d; N=%d\n\n", 6, N);
    printf("%6s %15s %15s %15s %15s\n",
        "run", "norm(xr-xrs)", "real sec", "norm(x-xs)", "complex sec");
    for (int run = 1; run <= 3; run++) {
        printf("%6d", run);
        fflush(stdout);
        dotest();
    }

    /* Operation count used for one forward+inverse complex transform pair;
       the real pair is credited with half of it. */
    Real ops = 2*N*log2(N)+3*N*log2(N);
    Real rflops = ops/trmin/2e6;
    Real cflops = ops/tcmin/1e6;

    printf("\nBest real=%.4e sec; Mtflops=%.4e\n",
        trmin, rflops);
    printf("Best complex=%.4e sec; Mtflops=%.4e\n",
        tcmin, cflops);
    printf("Single-core speed is %.4g times a Pi 4B\n",
        sqrt(rflops*cflops)/65.81);
    exit(0);
}
```

Edit: Changed rseed to make sure the generator for the Weyl sequence is odd--this doesn't affect anything except reuse of the random number code in other projects--and updated the version to 6.