I have tried to write programm for multiply CSR matrix and vector on C/CUDA 5.5 in MS Visual Studio 2012, but faced with ptx error.
Listing of my code:
__global__ void multKernelSymm(double* s, double* u, double* val, int* rowPtr, int* colInd)
{
int l = 0, jl, i;
int idx = blockDim.x*blockIdx.x;
l = rowPtr[idx] - 1;
for ( i = 0; i < (rowPtr[idx + 1] - rowPtr[idx]); i++){
jl = colInd[l] - 1;
s[idx] = s[idx] + val[l] * u[jl];
l+=1;
}
l = 0;
l = rowPtr[idx] - 1;
for (int i = 0; i < (rowPtr[idx + 1] - rowPtr[idx]); i++){
jl = colInd[l] - 1;
if(jl > idx)
atomicAdd(&s[jl], val[l] * u[idx]);
//s[jl] = s[jl] + val[l] * u[idx];
l+=1;
}
}
__device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull =
(unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
I was able to get your code to compile by making 3 changes:
As @PaulR indicated, remove the double keyword when calling your custom atomicAdd function.
The first parameter of atomicAdd should be a pointer to the address you want to update (the address you want to add the value to.) Instead you are passing the value at that address. So your function call should look like this:
atomicAdd(&(s[jl]), val[l] * u[idx]);
Finally, move the declaration and definition of your custom atomicAdd function to before your kernel definition.
With those changes I could get your code to compile.