Why not use GNU assembler syntax? It should look something like this:
@ Values held in the mutex word: lock_mutex stores `locked` to claim the
@ mutex, and unlock_mutex stores `unlocked` to release it.
.equ locked,1
.equ unlocked,0
@-----------------------------------------------------------------------
@ lock_mutex
@ Declare for use from C as extern void lock_mutex(void * mutex);
@
@ Spins until the 32-bit word at [r0] can be changed from `unlocked` to
@ `locked` with an exclusive load/store pair, then issues a data memory
@ barrier and returns.
@
@ In:    r0 = address of the mutex word (the `mutex` argument above)
@ Out:   nothing
@ Clobb: r1, r2, condition flags
@-----------------------------------------------------------------------
        .arm                            @ STREXNE/CMPNE below are ARM-state conditional encodings
        .global lock_mutex
        .type   lock_mutex, %function   @ mark as code so the linker can interwork with Thumb callers
lock_mutex:
        LDR     r1, =locked             @ r1 = value that means "held"
1:
        LDREX   r2, [r0]                @ r2 = current mutex value, exclusive monitor armed
        CMP     r2, r1                  @ Test if mutex is locked or unlocked
        BEQ     2f                      @ Already locked - go wait for it to be released
        STREXNE r2, r1, [r0]            @ Not locked, attempt to lock it (r2 = store status)
        CMPNE   r2, #1                  @ Status 1 means the Store-Exclusive failed
        BEQ     1b                      @ Failed - reload and try again
        @ Lock acquired
        DMB                             @ Required before accessing protected resource
        BX      lr
2:
        @ Take appropriate action while waiting for mutex to become unlocked.
        @ WAIT_FOR_UPDATE is left as a placeholder, so this path busy-waits.
        B       1b                      @ Retry the exclusive load
        .size   lock_mutex, . - lock_mutex
@-----------------------------------------------------------------------
@ unlock_mutex
@ Declare for use from C as extern void unlock_mutex(void * mutex);
@
@ Releases the mutex by storing `unlocked` to the word at [r0]. The
@ barrier is issued before the store so that earlier accesses to the
@ protected resource are ordered ahead of the release.
@
@ In:    r0 = address of the mutex word (the `mutex` argument above)
@ Out:   nothing
@ Clobb: r1
@-----------------------------------------------------------------------
        .global unlock_mutex
        .type   unlock_mutex, %function @ mark as code so the linker can interwork with Thumb callers
unlock_mutex:
        LDR     r1, =unlocked           @ r1 = value that means "free"
        DMB                             @ Required before releasing protected resource
        STR     r1, [r0]                @ Unlock mutex
        @ SIGNAL_UPDATE is left as a placeholder; no waiter is notified here.
        BX      lr
        .size   unlock_mutex, . - unlock_mutex
The disassembly of the resulting object file then looks like this:
$ arm-linux-gnueabihf-objdump -d mutex.o
mutex.o: file format elf32-littlearm
Disassembly of section .text:
00000000 <lock_mutex>:
0: e3a01001 mov r1, #1
4: e1902f9f ldrex r2, [r0]
8: e1520001 cmp r2, r1
c: 0a000004 beq 24 <lock_mutex+0x24>
10: 11802f91 strexne r2, r1, [r0]
14: 13520001 cmpne r2, #1
18: 0afffff9 beq 4 <lock_mutex+0x4>
1c: f57ff05f dmb sy
20: e12fff1e bx lr
24: eafffff6 b 4 <lock_mutex+0x4>
00000028 <unlock_mutex>:
28: e3a01000 mov r1, #0
2c: f57ff05f dmb sy
30: e5801000 str r1, [r0]
34: e12fff1e bx lr
However, what I'm wondering is whether you managed to configure both cores to take part in cache coherency. To my knowledge, you can specify which cores participate in ldrex/strex operations.