I'm just starting learning CUDA and knows only very basic stuff. I'm trying to develop a CUDA program that does 8x8 DCT using matrix multiplication method. An 8x8 DCT coefficient matrix D is computed, and the DCT transform is then D'AD. Each thread computes 1 data point and each block is 8x8. I wrote a sequential DCT and compare the results in an output file.
Here's the problem. When the number of blocks is 1xN, everything works fine. When the number of blocks is MxN, (M is any number greater than 1), the kernel gives wrong result. I think the problem should be my block indexing, but I couldn't find the problem.
Could anyone offer some help? I know it's a very basic program, but I really need it.
Any comments are gratefully appreciated!
Thanks in advance!
#include <stdio.h>
#include <stdlib.h>
#include "types.h"
#include "cuda.h"
static int DCT_bases[64]= {2896, 2896, 2896, 2896, 2896, 2896, 2896, 2896,
4017, 3406, 2276, 799, -799, -2276, -3406, -4017,
3784, 1568, -1568, -3784, -3784, -1568, 1568, 3784,
3406, -799, -4017, -2276, 2276, 4017, 799, -3406,
2896, -2896, -2896, 2896, 2896, -2896, -2896, 2896,
2276, -4017, 799, 3406, -3406, -799, 4017, -2276,
1568, -3784, 3784, -1568, -1568, 3784, -3784, 1568,
799, -2276, 3406, -4017, 4017, -3406, 2276, -799 };
__device__ __constant__ int dDCT_bases[64];
__global__ void cudaDCT2D(int *src, int width) {
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
int k;
int sum = 0;
int dct_i = threadIdx.y;
int dct_j = threadIdx.x;
__shared__ int temp[8][8];
temp[dct_i][dct_j] = src[i*width+j];
__syncthreads();
sum = 0;
for (k=0; k<8; k++) {
sum += temp[dct_i][k] * dDCT_bases[dct_j*8+k];
}
__syncthreads();
temp[dct_i][dct_j] = sum >> 13;
__syncthreads();
sum = 0;
for (k = 0; k < 8; k++) {
sum += dDCT_bases[dct_i*8+k] * temp[k][dct_j];
}
__syncthreads();
src[i*width+j] = sum >> 13;
}
void myDCT2D(int *src, int width, int height) {
int bi, bj;
int i, j, k;
int sum = 0;
int temp[64];
for (bi = 0; bi < width / 8; bi++) {
for (bj = 0; bj < height / 8; bj++) {
for (i=0; i<8; i++) {
for (j=0; j<8; j++) {
for (k = 0; k < 8; k++) {
sum += src[i*8+k] * DCT_bases[j*8+k];
}
temp[i*8+j] = sum >> 13;
sum = 0;
}
}
for (i=0; i<8; i++) {
for (j=0; j<8; j++) {
for (k=0; k < 8; k++) {
sum += DCT_bases[i*8+k] * temp[k*8+j];
}
src[i*8+j] = sum >> 13;
sum = 0;
}
}
src += 64;
}
}
}
int main (int argc, char *argv[])
{
int *matrix;
int *m0;
int i, j;
int *d_m;
int *m1;
FILE* fp;
int width = 8;
int height = 8;
if (argc > 1) {
width = atoi(argv[1]);
height = atoi(argv[2]);
}
if (width % 8 || height % 8) {
printf("Width and Height has to be multiple of 8!\n");
getchar();
return 0;
}
matrix = (int *) malloc(sizeof(int) * width * height);
m0 = (int *) malloc(sizeof(int) * width * height);
m1 = (int *) malloc(sizeof(int) * width * height);
fp = fopen("cuda_test.txt", "w");
for (i=0; i< height; i++) {
for (j = 0; j < width; j++) {
matrix[i*width+j] = rand()% 256;
m0[i*width+j] = matrix[i*width+j];
m1[i*width+j] = matrix[i*width+j];
fprintf(fp,"%d ", m0[i*width+j]);
}
fprintf(fp,"\n");
}
fprintf(fp, "\n");
cudaMalloc((void**) &d_m, sizeof(int) * width * height);
cudaMemcpy(d_m, m0, sizeof(int) * width * height, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(dDCT_bases, DCT_bases, sizeof(DCT_bases));
// printf("%s\n", cudaGetErrorString(cudaGetLastError()));
dim3 dimGrid(width / 8, height / 8);
dim3 dimBlock(8,8);
cudaDCT2D<<<dimGrid,dimBlock>>> (d_m, width);
cudaMemcpy(m0, d_m, sizeof(int) * width * height, cudaMemcpyDeviceToHost);
for (i=0; i< height; i++) {
for (j = 0; j < width; j++) {
fprintf(fp,"%d ", m0[i*width+j]);
}
fprintf(fp,"\n");
}
fprintf(fp, "\n");
myDCT2D(m1, width, height);
for (i=0; i< height; i++) {
for (j = 0; j < width; j++) {
fprintf(fp,"%d ", m1[i*width+j]);
}
fprintf(fp,"\n");
}
fprintf(fp,"\n");
free(matrix);
free(m0);
free(m1);
cudaFree(d_m);
return 0;
}