Sorting arrays of structures in CUDA

Question 1

As Robert Crovella has pointed out in his comment, tents of minutes most likely means that you are doing something wrong. I'm providing an example below in which I compare the performance of sorting an Array of Structures (AoS) and a Structure of Arrays (SoA) using thrust::sort and thrust::sort_by_key. I'm running on a laptop GeForce GT 540M and compiling with CUDA 5.5, so you have a more powerful card than mine. For 100000 elements the execution time is of the order of seconds in both cases. As I pointed out in my comment, the first case is more demanding in terms of computation time (1675ms) than the second (668.9ms).

#include <thrust\device_vector.h>
#include <thrust\sort.h>

struct MyStruct1
{
    int key;
    int value1;
    int value2;
};

struct MyStruct2
{
    int N;
    int* key;
    int* value1;
    int* value2;

    MyStruct2(int N_) {
        N = N_;
        cudaMalloc((void**)&key,N*sizeof(int));
        cudaMalloc((void**)&value1,N*sizeof(int));
        cudaMalloc((void**)&value2,N*sizeof(int));
    }

};

__host__ __device__ bool operator<(const MyStruct1 &lhs, const MyStruct1 &rhs) { return (lhs.key < rhs.key); };

void main(void)
{
    const int N = 10000;

    float time;
    cudaEvent_t start, stop;

    /*******************************/
    /* SORTING ARRAY OF STRUCTURES */
    /*******************************/
    thrust::host_vector<MyStruct1> h_struct1(N);
    for (int i = 0; i<N; i++)
    {
        MyStruct1 s;
        s.key       = rand()*255;
        s.value1    = rand()*255;
        s.value2    = rand()*255;
        h_struct1[i]    = s;
    }
    thrust::device_vector<MyStruct1> d_struct(h_struct1);

cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);

thrust::sort(d_struct.begin(), d_struct.end());

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting array of structs - elapsed time:  %3.1f ms \n", time);  

h_struct1 = d_struct;

//for (int i = 0; i<N; i++)
//{
//  MyStruct1 s = h_struct1[i];
//  printf("key %i value1 %i value2 %i\n",s.key,s.value1,s.value2);
//}
//printf("\n\n");

/*******************************/
/* SORTING STRUCTURES OF ARRAYS*/
/*******************************/

MyStruct2 d_struct2(N);
thrust::host_vector<int> h_temp_key(N);
thrust::host_vector<int> h_temp_value1(N);
thrust::host_vector<int> h_temp_value2(N);

//for (int i = 0; i<N; i++)
//{
//  h_temp_key[i]       = rand()*255;
//  h_temp_value1[i]    = rand()*255;
//  h_temp_value2[i]    = rand()*255;
//  printf("Original data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//}
//printf("\n\n");

cudaMemcpy(d_struct2.key,h_temp_key.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value1,h_temp_value1.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value2,h_temp_value2.data(),N*sizeof(int),cudaMemcpyHostToDevice);

// wrap raw pointers with device pointers 
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);

thrust::device_ptr<int> dev_ptr_key     = thrust::device_pointer_cast(d_struct2.key);
thrust::device_ptr<int> dev_ptr_value1  = thrust::device_pointer_cast(d_struct2.value1);
thrust::device_ptr<int> dev_ptr_value2  = thrust::device_pointer_cast(d_struct2.value2);

thrust::device_vector<int> d_indices(N);
thrust::sequence(d_indices.begin(), d_indices.end(), 0, 1);

// first sort the keys and indices by the keys
thrust::sort_by_key(dev_ptr_key, dev_ptr_key + N, d_indices.begin());

// Now reorder the ID arrays using the sorted indices
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value1, dev_ptr_value1);
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value2, dev_ptr_value2);

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting struct of arrays - elapsed time:  %3.1f ms \n", time);  

cudaMemcpy(h_temp_key.data(),d_struct2.key,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value1.data(),d_struct2.value1,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value2.data(),d_struct2.value2,N*sizeof(int),cudaMemcpyDeviceToHost);

//for (int i = 0; i<N; i++) printf("Ordered data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//printf("\n\n");

getchar();

}

For the sake of simplicity, I have skipped adding proper CUDA error check in the sense of What is the canonical way to check for errors using the CUDA runtime API?.

Question 2

Just in case anyone's still looking, if you decide to use Thrust libraries, you can make a zip iterator of tuples from the structure of arrays that you want to sort, and pass this iterator to thrust::sort_by_key() like so:

void sort() {
   const int N = 6;
   int  *keys = new  int[N]; // = { 1,   4,   2,   8,   5,   7};
   char *vals = new char[N]; // = {'a', 'b', 'c', 'd', 'e', 'f'};
   int  *addr = new  int[N]; // = { 1,   2,   3,   4,   5,   6};
   keys[0]=1; keys[1]=4; keys[2]=2; keys[3]=8; keys[4]=5; keys[5]=7;
   vals[0]='a'; vals[1]='b'; vals[2]='c'; vals[3]='d'; vals[4]='e'; vals[5]='f';
   addr[0]=1; addr[1]=2; addr[2]=3; addr[3]=4; addr[4]=5; addr[5]=6;

   int *d_keys, *d_addr;
   char *d_vals;
   CUDA_SAFE(cudaMalloc((void **)&d_keys, N * sizeof(int)));
   CUDA_SAFE(cudaMalloc((void **)&d_addr, N * sizeof(int)));
   CUDA_SAFE(cudaMalloc((void **)&d_vals, N * sizeof(char)));
   CUDA_SAFE(cudaMemcpy(d_keys, keys, N * sizeof(int), cudaMemcpyHostToDevice));
   CUDA_SAFE(cudaMemcpy(d_vals, vals, N * sizeof(char), cudaMemcpyHostToDevice));
   CUDA_SAFE(cudaMemcpy(d_addr, addr, N * sizeof(int), cudaMemcpyHostToDevice));

   auto it = thrust::make_zip_iterator(thrust::make_tuple(d_vals, d_addr));

   thrust::sort_by_key(thrust::device, d_keys, d_keys+N, it);

   CUDA_SAFE(cudaMemcpy(keys, d_keys, N * sizeof(int), cudaMemcpyDeviceToHost));
   CUDA_SAFE(cudaMemcpy(vals, d_vals, N * sizeof(char), cudaMemcpyDeviceToHost));
   CUDA_SAFE(cudaMemcpy(addr, d_addr, N * sizeof(int), cudaMemcpyDeviceToHost));

   printf("Keys: "); for (int i = 0; i < N; i++) printf("%d ", keys[i]); printf("\n");
   printf("Vals: "); for (int i = 0; i < N; i++) printf("%c ", vals[i]); printf("\n");
   printf("Addr: "); for (int i = 0; i < N; i++) printf("%d ", addr[i]); printf("\n");
}

I have excluded the headers and checks for brevity.