CUDA DCT works only when blockDim.x is 1

https://stackoverflow.com/questions/13794161

06-12-2021
|

Question

I'm just starting learning CUDA and knows only very basic stuff. I'm trying to develop a CUDA program that does 8x8 DCT using matrix multiplication method. An 8x8 DCT coefficient matrix D is computed, and the DCT transform is then D'AD. Each thread computes 1 data point and each block is 8x8. I wrote a sequential DCT and compare the results in an output file.

Here's the problem. When the number of blocks is 1xN, everything works fine. When the number of blocks is MxN, (M is any number greater than 1), the kernel gives wrong result. I think the problem should be my block indexing, but I couldn't find the problem.

Could anyone offer some help? I know it's a very basic program, but I really need it. Any comments are gratefully appreciated! Thanks in advance!

#include <stdio.h>
#include <stdlib.h>
#include "types.h"
#include "cuda.h"

static int DCT_bases[64]= {2896, 2896, 2896, 2896, 2896, 2896, 2896, 2896,
    4017, 3406, 2276, 799, -799, -2276, -3406, -4017,
    3784, 1568, -1568, -3784, -3784, -1568, 1568, 3784,
    3406, -799, -4017, -2276, 2276, 4017, 799, -3406, 
    2896, -2896, -2896, 2896, 2896, -2896, -2896, 2896,
    2276, -4017, 799, 3406, -3406, -799, 4017, -2276, 
    1568, -3784, 3784, -1568, -1568, 3784, -3784, 1568,
    799, -2276, 3406, -4017, 4017, -3406, 2276, -799 };

__device__ __constant__ int dDCT_bases[64];

__global__ void cudaDCT2D(int *src, int width) {

    int i = blockIdx.y * blockDim.y + threadIdx.y;
    int j = blockIdx.x * blockDim.x + threadIdx.x;
    int k;
    int sum = 0;
    int dct_i = threadIdx.y;
    int dct_j = threadIdx.x;

    __shared__ int temp[8][8];

    temp[dct_i][dct_j] = src[i*width+j];
    __syncthreads();
    sum = 0;

    for (k=0; k<8; k++) {
        sum += temp[dct_i][k] * dDCT_bases[dct_j*8+k]; 
    }
    __syncthreads();
    temp[dct_i][dct_j] = sum >> 13;
    __syncthreads();

    sum = 0; 

    for (k = 0; k < 8; k++) {
        sum += dDCT_bases[dct_i*8+k] * temp[k][dct_j];
    }
    __syncthreads();

    src[i*width+j] = sum >> 13;

}

void myDCT2D(int *src, int width, int height) {

    int bi, bj;
    int i, j, k;
    int sum = 0;
    int temp[64];

    for (bi = 0; bi < width / 8; bi++) {
        for (bj = 0; bj < height / 8; bj++) {
            for (i=0; i<8; i++) {
                for (j=0; j<8; j++) {
                    for (k = 0; k < 8; k++) {
                        sum +=  src[i*8+k] * DCT_bases[j*8+k];
                    }
                    temp[i*8+j] = sum >> 13;
                    sum = 0;
                }
            }

            for (i=0; i<8; i++) {
                for (j=0; j<8; j++) {
                    for (k=0; k < 8; k++) {
                        sum += DCT_bases[i*8+k] * temp[k*8+j];
                    }
                    src[i*8+j] = sum >> 13;
                    sum = 0;
                }
            }

            src += 64; 
        }
    }       
}

int main (int argc, char *argv[]) 
{
    int *matrix;
    int *m0;
    int i, j;
    int *d_m;
    int *m1;

    FILE* fp;

    int width = 8;
    int height = 8;

    if (argc > 1) { 
        width = atoi(argv[1]);
        height = atoi(argv[2]);
    }


    if (width % 8 || height % 8) {
        printf("Width and Height has to be multiple of 8!\n"); 
        getchar();
        return 0;
    }

    matrix = (int *) malloc(sizeof(int) * width * height); 
    m0 = (int *) malloc(sizeof(int) * width * height);
    m1 = (int *) malloc(sizeof(int) * width * height);
    fp = fopen("cuda_test.txt", "w");

    for (i=0; i< height; i++) {
        for (j = 0; j < width; j++) {
            matrix[i*width+j] = rand()% 256;
            m0[i*width+j] = matrix[i*width+j];
            m1[i*width+j] = matrix[i*width+j];
            fprintf(fp,"%d ", m0[i*width+j]);
        }
        fprintf(fp,"\n");
    }
    fprintf(fp, "\n");

    cudaMalloc((void**) &d_m, sizeof(int) * width * height);
    cudaMemcpy(d_m, m0, sizeof(int) * width * height, cudaMemcpyHostToDevice);
    cudaMemcpyToSymbol(dDCT_bases, DCT_bases, sizeof(DCT_bases));

    //    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    dim3 dimGrid(width / 8, height / 8);    
    dim3 dimBlock(8,8);

    cudaDCT2D<<<dimGrid,dimBlock>>> (d_m, width);

    cudaMemcpy(m0, d_m, sizeof(int) * width * height, cudaMemcpyDeviceToHost);

    for (i=0; i< height; i++) {
        for (j = 0; j < width; j++) {
            fprintf(fp,"%d ", m0[i*width+j]);
        }
        fprintf(fp,"\n");
    }

    fprintf(fp, "\n"); 
    myDCT2D(m1, width, height);

    for (i=0; i< height; i++) {
        for (j = 0; j < width; j++) {
            fprintf(fp,"%d ", m1[i*width+j]);
        }
        fprintf(fp,"\n");
    }

    fprintf(fp,"\n");
    free(matrix);
    free(m0);
    free(m1);
    cudaFree(d_m);

    return 0;
}

Solution

I find the answer myself. In fact there's nothing wrong with the cuda program, but i'm interpreting the matrix in different way. In CUDA, I use a 2-D block structure, so a 16x16 matrix would be interpreted by cuda in this way: [ M1_8x8 M2_8x8 M3_8x8 M4_8x8] But in my C test program, I'm assuming that the first 8x8 numbers are in the first matrix, so it becomes this: [M1 16x4 M2 16x4 M3 16x4 M4 16x4]

So the matrix is different! That's why the results are not the same! I think it will only happen for starters like me.... :(

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow