You need to get out more. GCC in general is horrible at optimization with structures; I truly get sick of holding its stupid hand. It misses them continuously, and I don't care what version you use. It has been raised countless times and never fixed, and we are talking at least 10 years.
Try this one which is standard GPIO code for a Pi.
What causes the problem is that I am carrying the IO base address in memory, so it needs to make a volatile pointer to GPIO->GPSET and GPIO->GPCLR, and usually it's too stupid to work out that they are just offsets off the same base address.
Please don't tell me to make it a constant. It's a memory value because my code auto-detects the base address so it can run on any model Pi ... so no changing the problem to get an optimization. What I am interested in is the code for gpio_output.
Code: Select all
#include <stdbool.h>
#include <stdint.h>
/*
 * Raspberry Pi GPIO hardware registers (BCM2835.PDF manual, section 6).
 *
 * Every member is a 32-bit word; the byte offset of each one from the start
 * of the block is noted next to it. The reservedN members are the reserved
 * words that sit between the register groups.
 */
struct __attribute__((__packed__, aligned(4))) GPIORegisters {
    /* 0x00: GPFSEL0 - GPFSEL5 */
    volatile uint32_t GPFSEL[6];
    uint32_t reserved1;                     /* 0x18 */

    /* 0x1C: GPSET0 - GPSET1 */
    volatile uint32_t GPSET[2];
    uint32_t reserved2;                     /* 0x24 */

    /* 0x28: GPCLR0 - GPCLR1 */
    volatile uint32_t GPCLR[2];
    uint32_t reserved3;                     /* 0x30 */

    /* 0x34: GPLEV0 - GPLEV1 (read only, hence const) */
    const volatile uint32_t GPLEV[2];
    uint32_t reserved4;                     /* 0x3C */

    /* 0x40: GPEDS0 - GPEDS1 */
    volatile uint32_t GPEDS[2];
    uint32_t reserved5;                     /* 0x48 */

    /* 0x4C: GPREN0 - GPREN1 */
    volatile uint32_t GPREN[2];
    uint32_t reserved6;                     /* 0x54 */

    /* 0x58: GPFEN0 - GPFEN1 */
    volatile uint32_t GPFEN[2];
    uint32_t reserved7;                     /* 0x60 */

    /* 0x64: GPHEN0 - GPHEN1 */
    volatile uint32_t GPHEN[2];
    uint32_t reserved8;                     /* 0x6C */

    /* 0x70: GPLEN0 - GPLEN1 */
    volatile uint32_t GPLEN[2];
    uint32_t reserved9;                     /* 0x78 */

    /* 0x7C: GPAREN0 - GPAREN1 */
    volatile uint32_t GPAREN[2];
    uint32_t reserved10;                    /* 0x84 */

    /* 0x88: GPAFEN0 - GPAFEN1 */
    volatile uint32_t GPAFEN[2];
    uint32_t reserved11;                    /* 0x90 */

    /* 0x94: GPPUD */
    volatile uint32_t GPPUD;

    /* 0x98: GPPUDCLK0 - GPPUDCLK1 */
    volatile uint32_t GPPUDCLK[2];
};
/*
 * Peripheral IO base address. Deliberately a variable in memory rather than a
 * compile-time constant so that it can be overwritten at run time;
 * 0x3F000000 is only the initial value.
 */
uint32_t RPi_IO_Base_Addr = 0x3F000000;
/*
 * Pointer to the GPIO register block, located 0x200000 bytes above the IO
 * base. RPi_IO_Base_Addr is re-read on every use of the macro. The 32-bit sum
 * is converted through uintptr_t so the integer-to-pointer cast is the same
 * width as a pointer on 64-bit targets too; the address itself is unchanged.
 */
#define GPIO ((volatile __attribute__((aligned(4))) struct GPIORegisters*) (uintptr_t) (RPi_IO_Base_Addr + 0x200000))
/*
 * Drive a GPIO pin high or low.
 *
 * gpio: pin number, valid range 0..53.
 * on:   true writes the pin's mask bit to GPSET (output high),
 *       false writes it to GPCLR (output low).
 *
 * Returns false, without touching any register, when gpio is out of range;
 * returns true after the register write otherwise.
 */
bool gpio_output(int gpio, bool on) {
    // The BCM2835 has 54 GPIO lines numbered 0..53, so pin 54 must be rejected too.
    if (gpio < 0 || gpio > 53) return false;
    // Build the mask from an unsigned 1: for gpio 31 a plain int 1 would be
    // shifted into the sign bit, which is undefined behaviour in C.
    uint32_t bit = UINT32_C(1) << (gpio % 32);
    if (on) {                                       // ON request
        GPIO->GPSET[gpio / 32] = bit;               // Set bit to make GPIO high output
    } else {                                        // OFF request
        GPIO->GPCLR[gpio / 32] = bit;               // Set bit to make GPIO low output
    }
    return true;
}
Most times on most distros it generates this sort of thing with -O2 ... love the double load of the address, and it's too stupid to compact the different branches.
Code: Select all
58 .fpu neon-vfpv4
59 .type gpio_output, %function
60 gpio_output:
61 @ args = 0, pretend = 0, frame = 0
62 @ frame_needed = 0, uses_anonymous_args = 0
63 @ link register save eliminated.
64 0000 360050E3 cmp r0, #54
65 0004 1600008A bhi .L12
66 0008 1F3000E2 and r3, r0, #31
67 000c 0120A0E3 mov r2, #1
68 0010 000051E3 cmp r1, #0
69 0014 12C3A0E1 lsl ip, r2, r3
70 0018 0800001A bne .L13
71 001c 001000E3 movw r1, #:lower16:RPi_IO_Base_Addr
72 0020 001040E3 movt r1, #:upper16:RPi_IO_Base_Addr
73 0024 C032A0E1 asr r3, r0, #5
74 0028 0200A0E1 mov r0, r2
75 002c 0A3083E2 add r3, r3, #10
76 0030 002091E5 ldr r2, [r1]
77 0034 022682E2 add r2, r2, #2097152
78 0038 03C182E7 str ip, [r2, r3, lsl #2]
79 003c 1EFF2FE1 bx lr
80 .L13:
81 0040 002000E3 movw r2, #:lower16:RPi_IO_Base_Addr
82 0044 002040E3 movt r2, #:upper16:RPi_IO_Base_Addr
83 0048 C032A0E1 asr r3, r0, #5
84 004c 0100A0E1 mov r0, r1
85 0050 002092E5 ldr r2, [r2]
86 0054 033182E0 add r3, r2, r3, lsl #2
87 0058 023683E2 add r3, r3, #2097152
88 005c 1CC083E5 str ip, [r3, #28]
89 0060 1EFF2FE1 bx lr
90 .L12:
91 0064 0000A0E3 mov r0, #0
92 0068 1EFF2FE1 bx lr
93 .size gpio_output, .-gpio_output
Every now and again on a distro you can convince it to at least load the address with a single load, but then it still keeps the two branches.
Code: Select all
gpio_output(int, bool):
cmp r0, #54
bhi .L4
mov r2, #1
and r3, r0, #31
cmp r1, #0
lsl ip, r2, r3
bne .L6
ldr r1, .L7
asr r3, r0, #5
ldr r1, [r1]
add r3, r3, #10
add r1, r1, #2097152
mov r0, r2
str ip, [r1, r3, lsl #2]
bx lr
.L6:
ldr r2, .L7
asr r3, r0, #5
ldr r0, [r2]
add r3, r3, #6
add r0, r0, #2097152
add r3, r0, r3, lsl #2
str ip, [r3, #4]
mov r0, r1
bx lr
.L4:
mov r0, #0
bx lr
.L7:
.word .LANCHOR0
RPi_IO_Base_Addr:
.word 1056964608
Now feed the same code into Clang 4.0 (I don't have an ARM one set up at home, but x86-64 code will show the same). One register load, and the two full branches compacted into a single one.
Code: Select all
gpio_output(int, bool): # @gpio_output(int, bool)
cmp edi, 54
jbe .LBB0_2
xor eax, eax
ret
.LBB0_2:
mov eax, 1
mov ecx, edi
shl eax, cl
mov ecx, 2097152
add ecx, dword ptr [rip + RPi_IO_Base_Addr]
lea rdx, [rcx + 40]
add rcx, 28
test sil, sil
cmove rcx, rdx
shr edi, 5
mov dword ptr [rcx + 4*rdi], eax
mov al, 1
ret
RPi_IO_Base_Addr:
.long 1056964608 # 0x3f000000
So please don't tell me GCC is good at optimization compared to Clang; it's horrible. There is nothing wrong with GCC's code, it's just a million miles from optimized because of the struct, and the struct isn't even misaligned (it gets worse with that).