I tracked down two problems, and the code now works. The issues were:
- __sync_synchronize() was not generating the mfence instruction on my platform (Apple's GCC 4.2.1). Replacing __sync_synchronize() with an explicit mfence resolves this issue.
- I was doing something wrong with the OpenMP private variables (I am still not sure what). Sometimes both threads entered the lock with the same identity (e.g., both claimed to be thread 0). Recomputing 'i' with 'omp_get_thread_num()' on every iteration seems to do the trick.
Here is the corrected code for completeness:
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <omp.h>
/*
 * cpu_relax(): hint for the body of a busy-wait loop.  The "memory" clobber
 *              also makes it a compiler barrier, so shared variables are
 *              re-read on every spin.
 * mb():        full memory barrier (hardware fence plus compiler barrier).
 *
 * The statements are spelled __asm__ __volatile__ because the plain `asm`
 * keyword is a GNU extension that is not recognised under -ansi or
 * -std=c99/-std=c11.
 *
 * On x86 the fence is an explicit mfence: __sync_synchronize() was observed
 * not to emit one with Apple's GCC 4.2.1.  Other targets cannot assemble the
 * x86 mnemonics, so they fall back to the compiler builtin.
 */
#if defined(__i386__) || defined(__x86_64__)
#define cpu_relax() __asm__ __volatile__("pause" ::: "memory")
#define mb() __asm__ __volatile__("mfence" ::: "memory")
#else
#define cpu_relax() __asm__ __volatile__("" ::: "memory")
#define mb() __sync_synchronize()
#endif
/*
 * State of a Lamport bakery lock restricted to two threads.
 * Both arrays are indexed by thread id (0 or 1).
 */
typedef struct
{
    /* Non-zero while the indexed thread is in the middle of picking a ticket. */
    uint32_t volatile entering[2];
    /* Ticket held by the indexed thread; 0 means it is not competing. */
    uint32_t volatile number[2];
} SimpleBakeryLock_t;
/*
 * Acquire the two-thread bakery lock, spinning until it is obtained.
 *
 * l:  lock state shared by both threads.
 * id: the caller's slot.  It indexes the two-element arrays directly, so it
 *     must be 0 or 1, and the two threads must use different values.
 *
 * NOTE(review): a ticket is always the other thread's ticket plus one, so
 * under uninterrupted contention tickets keep growing; a uint32_t wrap to 0
 * would collide with the "not competing" value.  Confirm this is acceptable
 * for the intended run lengths.
 */
void lock(SimpleBakeryLock_t* l, int id)
{
    int i = id, j = !id;
    uint32_t ni, nj;

    /* Doorway: flag that we are choosing, take a ticket, clear the flag. */
    l->entering[i] = 1;
    mb();
    ni = 1 + l->number[j];
    l->number[i] = ni;
    mb();
    l->entering[i] = 0;
    mb();

    /* Wait until the other thread is no longer choosing its ticket. */
    while (l->entering[j]) {
        cpu_relax();
    }

    /*
     * Wait while the other thread holds a ticket that beats ours: a smaller
     * number, or an equal number with the lower thread id.
     */
    for (;;) {
        nj = l->number[j];
        if (nj == 0 || !(nj < ni || (nj == ni && j < i))) {
            break;
        }
        cpu_relax();
    }

    /*
     * Acquire barrier: volatile reads do not order ordinary accesses, so
     * without this the compiler could hoist critical-section loads above the
     * spin loop when this function is inlined.
     */
    mb();
}
/*
 * Release the bakery lock on behalf of thread `id`.
 *
 * l:  lock state shared by both threads.
 * id: the slot that was passed to the matching lock() call; it indexes the
 *     two-element ticket array directly.
 */
void unlock(SimpleBakeryLock_t* l, int id)
{
    volatile uint32_t *ticket = &l->number[id];

    /* Fence first so critical-section writes cannot leak past the release. */
    mb();
    /* A ticket of 0 tells the other thread we are no longer competing. */
    *ticket = 0;
    mb();
}
/* The single lock instance; main() passes its address to lock() and unlock(). */
SimpleBakeryLock_t x;
/*
 * Stress test for the bakery lock: two OpenMP threads increment a shared
 * counter inside the lock, then the final count is printed next to the
 * number of iterations so a lost update is visible as a mismatch.
 */
int main(void)
{
    const int32_t iterations = 10000000;
    int32_t dummy;
    uint32_t count = 0;
    int t;

    /*
     * Reset the lock with direct stores: its members are volatile-qualified,
     * and memset() would write them through a non-volatile lvalue.
     */
    for (t = 0; t < 2; ++t) {
        x.entering[t] = 0;
        x.number[t] = 0;
    }
    mb();

    /*
     * lock() only has slots for thread ids 0 and 1, so cap the team at two
     * threads here rather than relying on OMP_NUM_THREADS=2 being set; a
     * third thread would index past the end of the lock's arrays.
     */
#pragma omp parallel for schedule(static, 1) num_threads(2)
    for (dummy = 0; dummy < iterations; ++dummy)
    {
        /* Re-query the id every iteration; caching it gave duplicate ids. */
        int i = omp_get_thread_num();
        lock(&x, i);
        count = count + 1;
        unlock(&x, i);
    }

    /* Casts make the arguments match %u exactly (iterations is signed). */
    printf("count: %u (expected: %u)\n", (unsigned)count, (unsigned)iterations);
    return 0;
}