Given the additional comment that there are thousands of rows rather than 3, we can write a transform functor in which each invocation sums an entire row. With thousands of rows, this should keep the machine pretty busy:
#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>
// Number of rows in the flattened matrix (one output sum is produced per row).
#define ROW 20
// Number of elements per row; also the length of the keys (weights) array.
#define COL 10
// Device-global pointer to the flattened ROW*COL value matrix. main() sets it
// with cudaMemcpyToSymbol; test_functor reads through it.
__device__ int *vals;
// Device-global pointer to the COL per-column keys that each row element is
// multiplied by. main() sets it with cudaMemcpyToSymbol as well.
__device__ int *keys;
/**
 * Binary functor for thrust::transform that computes the key-weighted sum of
 * one row of the matrix addressed by the device-global pointer `vals`.
 *
 * The matrix is read as row-major: element i of row y is vals[y * a + i], and
 * it is multiplied by keys[i] before being accumulated.
 *
 * The device-global pointers `vals` and `keys` must have been set (for example
 * with cudaMemcpyToSymbol) before the functor is invoked on the device.
 */
struct test_functor
{
    // Number of elements per row (row length).
    const int a;

    test_functor(int _a) : a(_a) {}

    /**
     * @param x  Unused; present only because the functor is applied through
     *           the two-input form of thrust::transform.
     * @param y  Index of the row to sum.
     * @return   Sum over i in [0, a) of vals[y * a + i] * keys[i].
     *
     * The arguments are taken by value and the operator is const so the
     * functor also works with iterators that yield temporaries and can be
     * invoked on a const copy of the functor.
     */
    __device__
    int operator()(int x, int y) const {
        (void)x;  // first input is intentionally ignored
        // Compute the row offset in 64 bits so y * a cannot overflow int for
        // large matrices.
        const long long base = static_cast<long long>(y) * a;
        int temp = 0;
        for (int i = 0; i < a; i++)
            temp += vals[base + i] * keys[i];
        return temp;
    }
};
/**
 * Demo driver: builds a ROW x COL matrix holding 0, 1, 2, ... in row-major
 * order and a key vector of ones whose first entry is zero, then uses
 * thrust::transform with test_functor to compute one weighted sum per row
 * and prints the results.
 *
 * Returns 0 on success, 1 if publishing the device pointers fails.
 */
int main(){
    // Host-side inputs.
    thrust::host_vector<int> h_vals(ROW*COL);
    thrust::host_vector<int> h_keys(COL);
    thrust::sequence(h_vals.begin(), h_vals.end());
    thrust::fill(h_keys.begin(), h_keys.end(), 1);
    h_keys[0] = 0;  // first column is excluded from every row sum

    // Device copies of the inputs and the zero-initialised output buffer.
    thrust::device_vector<int> d_vals = h_vals;
    thrust::device_vector<int> d_keys = h_keys;
    thrust::device_vector<int> d_sums(ROW, 0);

    // Publish the raw device pointers through the __device__ globals that
    // test_functor reads. Check the return codes: if either copy fails the
    // functor would dereference an unset pointer on the device.
    int *s_vals = thrust::raw_pointer_cast(&d_vals[0]);
    int *s_keys = thrust::raw_pointer_cast(&d_keys[0]);
    cudaError_t err = cudaMemcpyToSymbol(vals, &s_vals, sizeof(int *));
    if (err == cudaSuccess)
        err = cudaMemcpyToSymbol(keys, &s_keys, sizeof(int *));
    if (err != cudaSuccess) {
        std::cerr << "cudaMemcpyToSymbol failed: "
                  << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // Row indices 0..ROW-1; each one selects the row a functor call sums.
    thrust::device_vector<int> d_idx(ROW);
    thrust::sequence(d_idx.begin(), d_idx.end());

    // d_sums is passed as the first input only to satisfy the binary
    // transform signature; the functor ignores it and uses the row index.
    thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(),
                      d_sums.begin(), test_functor(COL));

    // Copy the results back to the host and print them.
    thrust::host_vector<int> h_sums = d_sums;
    std::cout << "Results :" << std::endl;
    for (unsigned i = 0; i<ROW; i++)
        std::cout<<"h_sums["<<i<<"] = " << h_sums[i] << std::endl;
    return 0;
}
This approach has the drawback that, in general, accesses to the vals array will not be coalesced. However, for a few thousand rows the cache may offer significant relief. We can fix this problem by re-ordering the data so that the flattened array is stored in column-major form, and changing the indexing in the functor's loop to look like this:
for (int i=0; i<a; i++)
temp += vals[(i*ROW)+y]*keys[i];
If preferred, you can pass ROW as an additional parameter to the functor.