I'm not going to try to sort out your complex matrix allocation scheme. The purpose of my suggestion was to let you replace it with simple one-line allocations.
Furthermore, I don't think you fully grasped the example I gave. It was a 3D example, so its typedefs had two subscripts. A 2D version needs a typedef with only a single subscript (for example, `typedef IMAGE_TYPE IMarray[hsize];`).
Really, none of this has anything to do with CUDA. It comes down to understanding C arrays and pointers.
Those were the major changes I made to get your code working:
#include <stdio.h>
#include <stdlib.h>
#define hsize 256
#define vsize 256
#define IMAGE_TYPE unsigned char
/*
 * Copies matrixin to matrixout element by element.
 *
 * Launch layout: 1D grid of 1D blocks. blockIdx.x selects the row and
 * threadIdx.x selects the column, so each thread copies exactly one element.
 * A full copy therefore needs vsize blocks of hsize threads.
 *
 * matrixin  - device pointer to rows of hsize elements (read)
 * matrixout - device pointer to rows of hsize elements (written)
 *
 * No shared memory is used.
 * NOTE(review): the row guard assumes both arrays hold vsize rows, which is
 * how main() allocates them - confirm if the kernel is reused elsewhere.
 */
__global__ void kernel(IMAGE_TYPE matrixin[][hsize], IMAGE_TYPE matrixout[][hsize]) {
    const int col = threadIdx.x;
    const int row = blockIdx.x;
    // Guard so that a launch with more than hsize threads per block or more
    // than vsize blocks cannot read or write outside the image.
    if (row < vsize && col < hsize) {
        matrixout[row][col] = matrixin[row][col];
    }
}
/*
 * Prints an error message followed by a newline to stderr.
 *
 * s - message text; taken as const so string literals can be passed
 *     directly. A NULL pointer is printed as "(null)" instead of being
 *     handed to %s, which would be undefined behaviour.
 *
 * Always returns 1, so callers can write `return fatal("...");` to
 * report a failure and propagate a non-zero exit status in one step.
 */
int fatal(const char* s) {
    fprintf(stderr, "%s\n", s != NULL ? s : "(null)");
    return 1;
}
// Builds "<what>: <detail>" and reports it through fatal().
// The message is assembled in a local mutable buffer before being handed over.
// Returns whatever fatal() returns (a non-zero status).
static int reportFailure(const char* what, const char* detail) {
    char msg[256];
    snprintf(msg, sizeof(msg), "%s: %s", what, detail);
    return fatal(msg);
}

// Returns 0 when err is cudaSuccess; otherwise reports the failing step
// together with the CUDA error string and returns a non-zero status.
static int checkCuda(cudaError_t err, const char* what) {
    if (err == cudaSuccess) {
        return 0;
    }
    return reportFailure(what, cudaGetErrorString(err));
}

/*
 * Fills a vsize x hsize host image with 'a', copies it to the device, runs
 * kernel() to duplicate it into a second device buffer, copies the result
 * back and prints its top-left corner (at most 10 x 10 elements).
 *
 * Every allocation, copy and the kernel launch is checked. Once a step fails
 * the remaining steps are skipped, the buffers allocated so far are released,
 * and the non-zero status from fatal() is returned. Returns 0 on success.
 */
int main() {
    // One image row; a pointer to IMarray can be indexed as image[row][col]
    // while the storage stays a single contiguous allocation.
    typedef IMAGE_TYPE IMarray[hsize];
    const size_t imageBytes = (size_t)vsize * hsize * sizeof(IMAGE_TYPE);

    IMarray *hin_image = NULL, *hout_image = NULL;
    IMarray *din_image = NULL, *dout_image = NULL;
    int status = 0;

    //allocate host memory
    hin_image = (IMarray *)malloc(imageBytes);
    hout_image = (IMarray *)malloc(imageBytes);
    if (hin_image == NULL || hout_image == NULL) {
        status = reportFailure("malloc", "could not allocate host images");
    }

    if (status == 0) {
        for (int i = 0; i < vsize; i++)
            for (int j = 0; j < hsize; j++)
                hin_image[i][j] = 'a';
    }

    //allocate device memory
    if (status == 0)
        status = checkCuda(cudaMalloc((void**)&din_image, imageBytes), "cudaMalloc(din_image)");
    if (status == 0)
        status = checkCuda(cudaMalloc((void**)&dout_image, imageBytes), "cudaMalloc(dout_image)");
    if (status == 0)
        status = checkCuda(cudaMemset(dout_image, 0, imageBytes), "cudaMemset(dout_image)");
    if (status == 0)
        status = checkCuda(cudaMemcpy(din_image, hin_image, imageBytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(host to device)");

    if (status == 0) {
        // One block per row, one thread per column.
        dim3 threads(hsize, 1, 1);
        dim3 blocks(vsize, 1, 1);
        kernel<<<blocks, threads>>>(din_image, dout_image);
        // A launch does not return an error itself; bad configurations
        // (e.g. too many threads per block) show up here.
        status = checkCuda(cudaGetLastError(), "kernel launch");
    }

    // This blocking copy also reports errors raised while the kernel was running.
    if (status == 0)
        status = checkCuda(cudaMemcpy(hout_image, dout_image, imageBytes, cudaMemcpyDeviceToHost),
                           "cudaMemcpy(device to host)");

    if (status == 0) {
        // Clamp the preview so smaller images are not read out of bounds.
        const int previewRows = vsize < 10 ? vsize : 10;
        const int previewCols = hsize < 10 ? hsize : 10;
        for (int i = 0; i < previewRows; i++) {
            printf("\n");
            for (int j = 0; j < previewCols; j++)
                printf("%c\t", hout_image[i][j]);
        }
        printf("\n");
    }

    // cudaFree and free both accept null pointers, so this is safe on every path.
    cudaFree(din_image);
    cudaFree(dout_image);
    free(hin_image);
    free(hout_image);
    return status;
}