Question

I have a laptop with an NVIDIA GT750M 4Gb (compute capability 3.0) graphics card. I need to sort an array of structures on CUDA (about 3 × 10^7 elements). But I cannot figure out how, since I do not have enough experience in CUDA. When using thrust::sort I get strange results (it takes a few tens of minutes, while std::sort takes 1 minute).

struct MyStruct
{
    float key;
    float a;
    float b;
    int c;
    int d;
    int e;
    int f;
    bool flag;
}
bool minCompare(const MyStruct lhs, const MyStruct rhs)
{
    return lhs.key < rhs.key;
}
Was it helpful?

Solution

As Robert Crovella has pointed out in his comment, tents of minutes most likely means that you are doing something wrong. I'm providing an example below in which I compare the performance of sorting an Array of Structures (AoS) and a Structure of Arrays (SoA) using thrust::sort and thrust::sort_by_key. I'm running on a laptop GeForce GT 540M and compiling with CUDA 5.5, so you have a more powerful card than mine. For 100000 elements the execution time is of the order of seconds in both cases. As I pointed out in my comment, the first case is more demanding in terms of computation time (1675ms) than the second (668.9ms).

#include <thrust\device_vector.h>
#include <thrust\sort.h>

struct MyStruct1
{
    int key;
    int value1;
    int value2;
};

struct MyStruct2
{
    int N;
    int* key;
    int* value1;
    int* value2;

    MyStruct2(int N_) {
        N = N_;
        cudaMalloc((void**)&key,N*sizeof(int));
        cudaMalloc((void**)&value1,N*sizeof(int));
        cudaMalloc((void**)&value2,N*sizeof(int));
    }

};

__host__ __device__ bool operator<(const MyStruct1 &lhs, const MyStruct1 &rhs) { return (lhs.key < rhs.key); };

void main(void)
{
    const int N = 10000;

    float time;
    cudaEvent_t start, stop;

    /*******************************/
    /* SORTING ARRAY OF STRUCTURES */
    /*******************************/
    thrust::host_vector<MyStruct1> h_struct1(N);
    for (int i = 0; i<N; i++)
    {
        MyStruct1 s;
        s.key       = rand()*255;
        s.value1    = rand()*255;
        s.value2    = rand()*255;
        h_struct1[i]    = s;
    }
    thrust::device_vector<MyStruct1> d_struct(h_struct1);

cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);

thrust::sort(d_struct.begin(), d_struct.end());

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting array of structs - elapsed time:  %3.1f ms \n", time);  

h_struct1 = d_struct;

//for (int i = 0; i<N; i++)
//{
//  MyStruct1 s = h_struct1[i];
//  printf("key %i value1 %i value2 %i\n",s.key,s.value1,s.value2);
//}
//printf("\n\n");

/*******************************/
/* SORTING STRUCTURES OF ARRAYS*/
/*******************************/

MyStruct2 d_struct2(N);
thrust::host_vector<int> h_temp_key(N);
thrust::host_vector<int> h_temp_value1(N);
thrust::host_vector<int> h_temp_value2(N);

//for (int i = 0; i<N; i++)
//{
//  h_temp_key[i]       = rand()*255;
//  h_temp_value1[i]    = rand()*255;
//  h_temp_value2[i]    = rand()*255;
//  printf("Original data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//}
//printf("\n\n");

cudaMemcpy(d_struct2.key,h_temp_key.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value1,h_temp_value1.data(),N*sizeof(int),cudaMemcpyHostToDevice);
cudaMemcpy(d_struct2.value2,h_temp_value2.data(),N*sizeof(int),cudaMemcpyHostToDevice);

// wrap raw pointers with device pointers 
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);

thrust::device_ptr<int> dev_ptr_key     = thrust::device_pointer_cast(d_struct2.key);
thrust::device_ptr<int> dev_ptr_value1  = thrust::device_pointer_cast(d_struct2.value1);
thrust::device_ptr<int> dev_ptr_value2  = thrust::device_pointer_cast(d_struct2.value2);

thrust::device_vector<int> d_indices(N);
thrust::sequence(d_indices.begin(), d_indices.end(), 0, 1);

// first sort the keys and indices by the keys
thrust::sort_by_key(dev_ptr_key, dev_ptr_key + N, d_indices.begin());

// Now reorder the ID arrays using the sorted indices
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value1, dev_ptr_value1);
thrust::gather(d_indices.begin(), d_indices.end(), dev_ptr_value2, dev_ptr_value2);

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Sorting struct of arrays - elapsed time:  %3.1f ms \n", time);  

cudaMemcpy(h_temp_key.data(),d_struct2.key,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value1.data(),d_struct2.value1,N*sizeof(int),cudaMemcpyDeviceToHost);
cudaMemcpy(h_temp_value2.data(),d_struct2.value2,N*sizeof(int),cudaMemcpyDeviceToHost);

//for (int i = 0; i<N; i++) printf("Ordered data - key %i value1 %i value2 %i\n",h_temp_key[i],h_temp_value1[i],h_temp_value2[i]);
//printf("\n\n");

getchar();

}

For the sake of simplicity, I have skipped adding proper CUDA error check in the sense of What is the canonical way to check for errors using the CUDA runtime API?.

OTHER TIPS

Just in case anyone's still looking, if you decide to use Thrust libraries, you can make a zip iterator of tuples from the structure of arrays that you want to sort, and pass this iterator to thrust::sort_by_key() like so:

void sort() {
   const int N = 6;
   int  *keys = new  int[N]; // = { 1,   4,   2,   8,   5,   7};
   char *vals = new char[N]; // = {'a', 'b', 'c', 'd', 'e', 'f'};
   int  *addr = new  int[N]; // = { 1,   2,   3,   4,   5,   6};
   keys[0]=1; keys[1]=4; keys[2]=2; keys[3]=8; keys[4]=5; keys[5]=7;
   vals[0]='a'; vals[1]='b'; vals[2]='c'; vals[3]='d'; vals[4]='e'; vals[5]='f';
   addr[0]=1; addr[1]=2; addr[2]=3; addr[3]=4; addr[4]=5; addr[5]=6;

   int *d_keys, *d_addr;
   char *d_vals;
   CUDA_SAFE(cudaMalloc((void **)&d_keys, N * sizeof(int)));
   CUDA_SAFE(cudaMalloc((void **)&d_addr, N * sizeof(int)));
   CUDA_SAFE(cudaMalloc((void **)&d_vals, N * sizeof(char)));
   CUDA_SAFE(cudaMemcpy(d_keys, keys, N * sizeof(int), cudaMemcpyHostToDevice));
   CUDA_SAFE(cudaMemcpy(d_vals, vals, N * sizeof(char), cudaMemcpyHostToDevice));
   CUDA_SAFE(cudaMemcpy(d_addr, addr, N * sizeof(int), cudaMemcpyHostToDevice));

   auto it = thrust::make_zip_iterator(thrust::make_tuple(d_vals, d_addr));

   thrust::sort_by_key(thrust::device, d_keys, d_keys+N, it);

   CUDA_SAFE(cudaMemcpy(keys, d_keys, N * sizeof(int), cudaMemcpyDeviceToHost));
   CUDA_SAFE(cudaMemcpy(vals, d_vals, N * sizeof(char), cudaMemcpyDeviceToHost));
   CUDA_SAFE(cudaMemcpy(addr, d_addr, N * sizeof(int), cudaMemcpyDeviceToHost));

   printf("Keys: "); for (int i = 0; i < N; i++) printf("%d ", keys[i]); printf("\n");
   printf("Vals: "); for (int i = 0; i < N; i++) printf("%c ", vals[i]); printf("\n");
   printf("Addr: "); for (int i = 0; i < N; i++) printf("%d ", addr[i]); printf("\n");
}

I have excluded the headers and checks for brevity.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top