I have written some code in Nsight that compiles and can be executed, but the first launch never completes.
The strange thing is that when I run it in debug mode, it works perfectly but it is too slow.
Here is the part of the code that runs before entering the function that accesses the GPU (this is where I think there is an error that I can't find):
#include <cstdio>   // fprintf/stderr for the error reports below (harmless if already included)

// Prints a diagnostic naming the failed CUDA step; returns true when `status` is cudaSuccess.
static bool cudaStepSucceeded(cudaError_t status, const char * step)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "parallelAction: %s failed: %s\n", step, cudaGetErrorString(status));
    return false;
}

/**
 * Copies `range` query descriptors and `cardBase` base descriptors (128 bytes
 * each) to the device, launches FindClosestDescriptor with one thread per
 * query, and copies one int per query back into `dataReturned`.
 *
 * @param dataReturned host buffer receiving `range` ints (one kernel result per query)
 * @param data         host buffer holding `range` * 128 bytes of query descriptors
 * @param descBase     host buffer holding `cardBase` * 128 bytes of base descriptors
 * @param range        number of query descriptors; nothing is done when <= 0
 * @param cardBase     number of base descriptors; nothing is done when <= 0
 * @param streamIdx    accepted for interface compatibility; not used here
 *
 * The function returns void, so failures are reported on stderr. On failure
 * `dataReturned` is left unmodified and all device allocations are released.
 */
void parallelAction (int * dataReturned, char * data, unsigned char * descBase, int range, int cardBase, int streamIdx)
{
    (void) streamIdx;

    if (range <= 0)
        return;   // no queries: nothing to compute
    if (cardBase <= 0)
    {
        fprintf(stderr, "parallelAction: cardBase must be positive (got %d); output left untouched\n", cardBase);
        return;
    }

    const size_t descriptorBytes = 128;

    // Launch geometry: whole blocks, block size a multiple of the 32-thread warp.
    const int blockSize = 192;
    const int nBlocks = range / blockSize + (range % blockSize == 0 ? 0 : 1);
    const size_t launchedThreads = (size_t) nBlocks * blockSize;

    // Widen to size_t BEFORE multiplying so large counts cannot overflow int.
    const size_t inputBytes  = (size_t) range * descriptorBytes;
    const size_t baseBytes   = (size_t) cardBase * descriptorBytes;
    const size_t outputBytes = (size_t) range * sizeof(int);

    // The kernel receives no element count, so every launched thread (including
    // those past `range` in the last block) reads 128 input bytes and writes one
    // int. Size the device buffers for the whole grid to keep those accesses in
    // bounds; only the first `range` results are copied back.
    const size_t paddedInputBytes  = launchedThreads * descriptorBytes;
    const size_t paddedOutputBytes = launchedThreads * sizeof(int);

    unsigned char * data_d = 0;
    unsigned char * descBase_d = 0;
    int * cardBase_d = 0;
    int * dataReturned_d = 0;

    bool ok =
        cudaStepSucceeded(cudaMalloc((void **) &data_d, paddedInputBytes), "cudaMalloc(data_d)") &&
        cudaStepSucceeded(cudaMalloc((void **) &descBase_d, baseBytes), "cudaMalloc(descBase_d)") &&
        cudaStepSucceeded(cudaMalloc((void **) &cardBase_d, sizeof(int)), "cudaMalloc(cardBase_d)") &&
        cudaStepSucceeded(cudaMalloc((void **) &dataReturned_d, paddedOutputBytes), "cudaMalloc(dataReturned_d)");

    // Give the padding threads defined input instead of uninitialized memory.
    if (ok && paddedInputBytes > inputBytes)
        ok = cudaStepSucceeded(cudaMemset(data_d + inputBytes, 0, paddedInputBytes - inputBytes),
                               "cudaMemset(data_d padding)");

    ok = ok &&
        cudaStepSucceeded(cudaMemcpy(data_d, data, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy(data)") &&
        cudaStepSucceeded(cudaMemcpy(descBase_d, descBase, baseBytes, cudaMemcpyHostToDevice), "cudaMemcpy(descBase)") &&
        cudaStepSucceeded(cudaMemcpy(cardBase_d, &cardBase, sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(cardBase)");

    if (ok)
    {
        FindClosestDescriptor<<< nBlocks, blockSize >>>(dataReturned_d, data_d, descBase_d, cardBase_d);

        // A launch does not return a status: bad configurations show up in
        // cudaGetLastError(), execution failures at the next synchronizing call.
        // Synchronize here so a kernel failure is attributed to the kernel and
        // not to the copy that follows.
        ok = cudaStepSucceeded(cudaGetLastError(), "FindClosestDescriptor launch") &&
             cudaStepSucceeded(cudaDeviceSynchronize(), "FindClosestDescriptor execution");
    }

    if (ok)
        cudaStepSucceeded(cudaMemcpy(dataReturned, dataReturned_d, outputBytes, cudaMemcpyDeviceToHost),
                          "cudaMemcpy(dataReturned)");

    // cudaFree on a null pointer performs no operation, so this is safe on every path.
    cudaFree(data_d);
    cudaFree(descBase_d);
    cudaFree(cardBase_d);
    cudaFree(dataReturned_d);
}
And here is the function that runs on the GPU (I don't think the error is here):
/**
 * For one 128-byte query descriptor per thread, finds the index of the base
 * descriptor with the smallest squared L2 distance and stores that index.
 *
 * Layout: 1D grid of 1D blocks; thread `idx = blockDim.x * blockIdx.x + threadIdx.x`
 * handles query `idx`. No shared memory is used.
 *
 * @param dataReturned output, one int per launched thread: index of the closest
 *                     base descriptor (ties keep the lowest index; 0 when
 *                     *cardBase <= 0)
 * @param data         query descriptors, 128 bytes per launched thread
 * @param base         base descriptors, 128 bytes each, *cardBase of them
 * @param cardBase     pointer to the number of base descriptors
 *
 * Precondition: the kernel is not given the number of queries, so it cannot
 * guard the tail of the grid. `data` must hold 128 bytes and `dataReturned`
 * one int for EVERY launched thread (gridDim.x * blockDim.x).
 */
__global__ void FindClosestDescriptor(int * dataReturned, unsigned char * data, unsigned char * base, int *cardBase)
{
    const int kDescriptorLength = 128;

    // 64-bit index arithmetic: idx * 128 and i * 128 overflow int on large inputs.
    const size_t idx = (size_t) blockDim.x * blockIdx.x + threadIdx.x;
    const int itelimit = *cardBase;

    // Private copy of this thread's query descriptor; it is re-read for every
    // base descriptor.
    unsigned char query[kDescriptorLength];
    const unsigned char * queryGlobal = data + idx * kDescriptorLength;
    for (int k = 0; k < kDescriptorLength; k++)
    {
        query[k] = queryGlobal[k];
    }

    // A squared distance is at most 128 * 255 * 255 = 8323200, so the largest
    // int is a safe "no candidate yet" sentinel: base descriptor 0 always
    // replaces it, matching a search seeded with descriptor 0, while an empty
    // base is never dereferenced.
    int minDistance = 0x7fffffff;
    int winner = 0;

    // Test all descriptors in the base, reading each one in place.
    for (int i = 0; i < itelimit; i++)
    {
        const unsigned char * candidate = base + (size_t) i * kDescriptorLength;

        // Squared L2 distance between the query and this candidate.
        int result = 0;
        for (int k = 0; k < kDescriptorLength; k++)
        {
            const int part = query[k] - candidate[k];
            result += part * part;
        }

        // Strict comparison: on a tie the earlier (lower) index wins.
        if (result < minDistance)
        {
            minDistance = result;
            winner = i;
        }
    }

    // Write the result in dataReturned
    dataReturned[idx] = winner;
}
Thank you in advance if you can help me.
EDIT: the last cudaMemcpy call returns the error "the launch timed out and was terminated".