CUB select if with returned indexes

Question 1

After some fiddling and asking around, I was able to get a simple code along the lines of what you suggest working:

$ cat t348.cu
#include <cub/cub.cuh>
#include <stdio.h>
#define DSIZE 6

struct GreaterThan
{

    __host__ __device__ __forceinline__
    bool operator()(const cub::ItemOffsetPair<int, ptrdiff_t> &a) const {
        return (a.value > DSIZE/2);
    }
};

int main(){

  int num_items = DSIZE;
  int *d_in;
  cub::ItemOffsetPair<int,ptrdiff_t> * d_out;
  int *d_num_selected;
  int *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;

  cudaMalloc((void **)&d_in, num_items*sizeof(int));
  cudaMalloc((void **)&d_num_selected, sizeof(int));
  cudaMalloc((void **)&d_out, num_items*sizeof(cub::ItemOffsetPair<int,ptrdiff_t>));

  int h_in[DSIZE] = {5, 4, 3, 2, 1, 0};
  cudaMemcpy(d_in, h_in, num_items*sizeof(int), cudaMemcpyHostToDevice);

  cub::ArgIndexInputIterator<int *> input_itr(d_in);


  cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, input_itr, d_out, d_num_selected, num_items, GreaterThan());

  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, input_itr, d_out, d_num_selected, num_items, GreaterThan());
  int h_num_selected = 0;
  cudaMemcpy(&h_num_selected, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost);
  cub::ItemOffsetPair<int, ptrdiff_t> h_out[h_num_selected];
  cudaMemcpy(h_out, d_out, h_num_selected*sizeof(cub::ItemOffsetPair<int, ptrdiff_t>), cudaMemcpyDeviceToHost);
  for (int i =0 ; i < h_num_selected; i++)
    printf("index: %d, offset: %d, value: %d\n", i, h_out[i].offset, h_out[i].value);

  return 0;
}
$ nvcc -arch=sm_20 -o t348 t348.cu
$ ./t348
index: 0, offset: 0, value: 5
index: 1, offset: 1, value: 4
$

RHEL 6.2, cub v1.2.2, CUDA 5.5

Question 2

I have recently been running into performance issues when using the Thrust library. These come from thrust allocating memory in the base of a large nested loop structure. This is obviously unwanted, with ideal execution using a pre-allocated slab of global memory.

Thrust lets you customize how temporary memory is allocated during algorithm execution.

See the custom_temporary_allocation example to see how to create a cache for your pre-allocated slab.