Question

As mentioned in the "Shared Memory Array Default Value" question, shared memory is not initialized, i.e. it can contain any value.

#include <stdio.h>

#define BLOCK_SIZE 512

/*
 * Debug kernel: prints every element of a statically sized __shared__ int
 * array that is never written, i.e. the array's uninitialized contents.
 *
 * Only thread 0 of block 0 prints; every other thread returns immediately.
 * The input, output and len parameters are not used.
 */
__global__ void scan(float *input, float *output, int len) {
    __shared__ int data[BLOCK_SIZE];

    // Everyone except the first thread of the first block has nothing to do.
    const bool isReporter = (threadIdx.x == 0) && (blockIdx.x == 0);
    if (!isReporter) {
        return;
    }

    printf("Block Number: %d\n", blockIdx.x);

    int slot = 0;
    while (slot < BLOCK_SIZE) {
        // Intentional read of an element that was never initialized.
        printf("DATA[%d] = %d\n", slot, data[slot]);
        ++slot;
    }
}

/*
 * Launches the scan debug kernel on a grid of 10 blocks of BLOCK_SIZE threads
 * and waits for it to finish, which also flushes the device-side printf output.
 *
 * Returns 0 on success, 1 if the launch or the kernel execution failed.
 */
int main(int argc, char ** argv) {
    dim3 block(BLOCK_SIZE, 1, 1);
    dim3 grid(10, 1, 1);

    // The kernel ignores its arguments; len is an int, so pass 0 rather than
    // the pointer constant NULL.
    scan<<<grid,block>>>(NULL, NULL, 0);

    // A kernel launch returns no status itself; configuration errors are
    // reported through cudaGetLastError().
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Errors raised while the kernel runs surface at the next synchronization.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    return 0;
}

But why does this not seem to hold for the code above, where I consistently get zeroed shared memory?

DATA[0] = 0
DATA[1] = 0
DATA[2] = 0
DATA[3] = 0
DATA[4] = 0
DATA[5] = 0
DATA[6] = 0
...

I tested in both Release and Debug mode, with the flags -O3 -arch=sm_20, -O3 -arch=sm_30, and -arch=sm_30. The result is always the same.

Was it helpful?

Solution

tl;dr: shared memory is not initialized to 0

I think your conjecture that shared memory is initialized to 0 is questionable. Try the following code, which is a slight modification of yours. Here, I call the kernel twice, and the kernel alters the values of the data array after printing them. The first time the kernel is launched, the "uninitialized" values of data will all be 0. The second time the kernel is launched, the "uninitialized" values of data will no longer all be 0.

I think this depends on the fact that shared memory is SRAM, which exhibits data remanence.

#include <stdio.h>

#define BLOCK_SIZE 32

/*
 * Debug kernel: for each element of a statically sized __shared__ int array,
 * prints the element's current (never initialized) value and then stores the
 * element's own index into it.
 *
 * Only thread 0 of block 0 does any work; every other thread returns
 * immediately. The input, output and len parameters are not used.
 */
__global__ void scan(float *input, float *output, int len) {

    __shared__ int data[BLOCK_SIZE];

    // Everyone except the first thread of the first block has nothing to do.
    if (threadIdx.x != 0 || blockIdx.x != 0) {
        return;
    }

    int slot = 0;
    while (slot < BLOCK_SIZE) {
        // Read before write: report what the element held on entry ...
        const int previous = data[slot];
        printf("DATA[%d] = %d\n", slot, previous);

        // ... and only then overwrite it with its index.
        data[slot] = slot;
        ++slot;
    }
}

/*
 * Launches the scan debug kernel twice in a row on a grid of 10 blocks of
 * BLOCK_SIZE threads, waits for both launches to finish (which also flushes
 * the device-side printf output), then waits for a key press before exiting.
 *
 * Returns 0 on success, 1 if a launch or the kernel execution failed.
 */
int main(int argc, char ** argv) {
    dim3 block(BLOCK_SIZE, 1, 1);
    dim3 grid(10, 1, 1);

    for (int launch = 0; launch < 2; ++launch) {
        // The kernel ignores its arguments; len is an int, so pass 0 rather
        // than the pointer constant NULL.
        scan<<<grid,block>>>(NULL, NULL, 0);

        // A kernel launch returns no status itself; configuration errors are
        // reported through cudaGetLastError().
        cudaError_t launchErr = cudaGetLastError();
        if (launchErr != cudaSuccess) {
            fprintf(stderr, "Kernel launch %d failed: %s\n", launch,
                    cudaGetErrorString(launchErr));
            return 1;
        }
    }

    // Errors raised while the kernels run surface at the next synchronization.
    cudaError_t syncErr = cudaDeviceSynchronize();
    if (syncErr != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n",
                cudaGetErrorString(syncErr));
        return 1;
    }

    // Wait for a key press before exiting.
    getchar();
    return 0;
}
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top