Your examples do not work, because there are race conditions in updating and checking the flags array.
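It seems, though, that what you want is a well-known synchronization primitive called a barrier: a point that no thread may pass until all participating threads have reached it. A barrier can be implemented, for example, using semaphores; see section 3.6 of The Little Book of Semaphores for details on how this works.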
With a barrier your code can be written concisely as:
// Number of worker threads started by main(); also the number of
// participants the shared barrier below is created for.
const int nThreads = 4;
// Number of print-then-wait iterations each worker performs.
const int nIter = 100;
// Serializes writes to cout so lines from different threads do not interleave.
mutex m;
// Barrier shared by all workers. Note that this object has the same name as
// its class, so after this declaration the unqualified name `barrier` refers
// to this object rather than to the type.
barrier barrier(nThreads);
void printer(int id) {
for (int i = 0; i < nIter; i++) {
{
lock_guard<mutex> lock(m); // lock to prevent interleaved console output
cout << "Thread: " << id << " printing " << i << endl;
}
barrier.wait();
}
}
// Entry point: starts nThreads workers running printer(i) for i in
// [0, nThreads), then joins every one of them before exiting.
// The command-line arguments are not used.
int main(int argc, char **argv) {
    std::vector<std::thread> ts;
    // One allocation up front; the vector never has to relocate live threads.
    ts.reserve(nThreads);
    for (int i = 0; i < nThreads; i++) {
        // Construct the thread in place instead of building a temporary
        // std::thread and moving it into the vector.
        ts.emplace_back(printer, i);
    }
    // Join exactly the threads that were stored.
    for (std::thread &worker : ts) {
        worker.join();
    }
    return 0;
}
Below is a simple semaphore implementation (copied from here).
// Counting semaphore built from a mutex and a condition variable.
// `count` holds the number of permits currently available; all access to it
// happens with `mtx` held.
class semaphore {
public:
    // Creates a semaphore holding `count_` permits (none by default).
    semaphore(int count_ = 0) : count(count_) {}

    // Adds one permit and wakes one thread blocked in wait(), if any.
    void notify() {
        std::lock_guard<std::mutex> guard(mtx);
        count += 1;
        cv.notify_one();
    }

    // Blocks while the permit count is zero, then consumes one permit.
    // The predicate is re-checked after every wakeup, so spurious wakeups
    // do not let a thread through without a permit.
    void wait() {
        std::unique_lock<std::mutex> guard(mtx);
        cv.wait(guard, [this] { return count != 0; });
        count -= 1;
    }

private:
    std::mutex mtx;
    std::condition_variable cv;
    int count;
};
Using that semaphore, you can implement a reusable barrier as described in the referenced book:
// Reusable two-phase barrier for a fixed group of `n` threads, built from a
// mutex-protected arrival counter and two semaphores used as turnstiles.
//
// Each participating thread calls wait(). Phase 1 holds every caller until
// all `n` have arrived; phase 2 holds every caller until all `n` have left
// phase 1, which returns the counter to zero so the same object can be used
// for the next round.
class barrier {
public:
    // n: number of threads that must call wait() before any of them is
    // released. With n <= 0 the release condition in phase1() is never met,
    // so wait() would block forever.
    barrier(int n): n(n), count(0) {}

    // Blocks the caller until `n` threads in total have called wait() for
    // the current round.
    void wait() {
        phase1();
        phase2();
    }

private:
    std::mutex m;                       // guards `count`
    semaphore turnstile1, turnstile2;   // gates for phase 1 and phase 2
    int n, count;                       // group size / threads currently arrived

    // Arrival: the last of the `n` threads to arrive posts `n` permits on
    // turnstile1, letting every thread (itself included) through.
    void phase1() {
        {
            // Scoped lock instead of manual lock()/unlock(): the mutex is
            // released even if notify() throws, and always before blocking
            // on the turnstile below.
            std::lock_guard<std::mutex> guard(m);
            ++count;
            if (count == n) {
                for (int i = 0; i < n; i++)
                    turnstile1.notify();
            }
        }
        turnstile1.wait();
    }

    // Departure: the last of the `n` threads to leave phase 1 posts `n`
    // permits on turnstile2, so no thread can re-enter phase 1 for the next
    // round before everyone has passed the current one.
    void phase2() {
        {
            std::lock_guard<std::mutex> guard(m);
            --count;
            if (count == 0) {
                for (int i = 0; i < n; i++)
                    turnstile2.notify();
            }
        }
        turnstile2.wait();
    }
};