Question

I'm writing a code for the image denoising and came across a strange problem with stripes in the processed images. Basically when I'm calculating X-gradient of image the horizontal stripes appear (or vertical for Y direction) Lena X gradient. The whole algorithm works OK and it looks like I'm getting the correct answer (I'm comparing with program in C) except those annoying stripes Lena result.

The distance between stripes is changing with different block sizes. I'm also having different stripes positions each time I run the program! Here is the part of the program related to the gradient calculation. I have a feeling that I'm doing something very stupid :) Thank you!

 #define BLKXSIZE 16
 #define BLKYSIZE 16
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
 void Diff4th_GPU(float* A, float* B, int N, int M, int Z, float sigma, int iter, float tau, int                           type)
 {
    float *Ad; 

dim3 dimBlock(BLKXSIZE,BLKYSIZE);
dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE));

cudaMalloc((void**)&Ad,N*M*sizeof(float));        


cudaMemcpy(Ad,A,N*M*sizeof(float),cudaMemcpyHostToDevice); 

cudaCheckErrors("cc1");
int n = 1;
while (n <= iter) {
    Diff4th2D<<<dimGrid,dimBlock>>>(Ad, N, M, sigma, iter, tau, type);
    n++;
   cudaDeviceSynchronize();
    cudaCheckErrors("kernel");}

cudaMemcpy(B,Ad,N*M*sizeof(float),cudaMemcpyDeviceToHost);
cudaCheckErrors("cc2");
cudaFree(Ad);
 } 

  __global__ void Diff4th2D(float* A, int N, int M, float sigma, int iter, float tau, int type)
 {

float gradX, gradX_sq, gradY, gradY_sq, gradXX, gradYY, gradXY, sq_sum, xy_2, Lam,    V_norm, V_orth, c, c_sq, lam_t;


int i = blockIdx.x*blockDim.x + threadIdx.x;
int j = blockIdx.y*blockDim.y + threadIdx.y;

 int index = j + i*N;

if ((i < N) && (j < M))
  {
    float gradX = 0, gradY = 0, gradXX = 0, gradYY = 0,  gradXY = 0;

    if ((i>1) && (i<N)) {
        if ((j>1) && (j<M)){
    int indexN = (j)+(i-1)*(N);
    if (indexN > ((N*M)-1)) indexN = (N*M)-1;
    if (indexN < 0) indexN = 0;
    int indexS = (j)+(i+1)*(N);
    if (indexS > ((N*M)-1)) indexS = (N*M)-1;
    if (indexS < 0) indexS = 0;
    int indexW = (j-1)+(i)*(N);
    if (indexW > ((N*M)-1)) indexW = (N*M)-1;
    if (indexW < 0) indexW = 0;
    int indexE = (j+1)+(i)*(N);        
    if (indexE > ((N*M)-1)) indexE = (N*M)-1;
    if (indexE < 0) indexE = 0;

   gradX = 0.5*(A[indexN]-A[indexS]);
   A[index] = gradX;
  }
}
}
}
Was it helpful?

Solution

You have a race condition inside your kernel, as elements of A may or may not be overwritten before they are used.

Use different arrays for input and output.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top