Question

I have two vectors (oldvector and newvector). I need to calculate the value of the residual which is defined by the following pseudocode:

residual = 0;
forall i : residual += (oldvector[i] - newvector[i])^2

Currently, I am calculating this with two CUDA Thrust operations which are essentially doing:

forall i : oldvector[i] = oldvector[i] - newvector[i]

followed by a thrust::transform_reduce with a square as unary operator, which is doing:

residual = 0;
forall i : residual += oldvector[i]^2;
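
In Thrust terms, the two steps are roughly equivalent to the following sketch (given the device vectors oldvector and newvector and the relevant Thrust headers; thrust::minus and thrust::square stand in for whatever functors my actual code uses):

// Step 1: elementwise difference, overwriting oldvector
thrust::transform(oldvector.begin(), oldvector.end(), newvector.begin(),
                  oldvector.begin(), thrust::minus<float>());
// Step 2: square each element and sum
float residual = thrust::transform_reduce(oldvector.begin(), oldvector.end(),
                                          thrust::square<float>(),
                                          0.0f, thrust::plus<float>());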

The problem with this is obviously the intermediate store to global memory before transform_reduce. Is there a more efficient approach to this problem which would fuse these two steps? Apart from writing my own CUDA kernel, is there any other option?

One approach I thought of was to write a thrust::reduce with zip iterators. The problem with this is that the return type of the operator has to be the same type as its input. As I understand it, this means the reduction operator would have to return a tuple, which would imply an extra addition.

If I do write a reduction CUDA kernel, have there been any improvements over the reduction kernel example that shipped with CUDA 1.1?

Solution

thrust::inner_product will do it in a single function call. Your original idea can also be made to work (zipping the two vectors together and using thrust::transform_reduce). This code demonstrates both methods:

#include <iostream>

#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>

#define N 2

struct zdiffsq{
  template <typename Tuple>
  __host__ __device__ float operator()(Tuple a)
  {
    float result = thrust::get<0>(a) - thrust::get<1>(a);
    return result*result;
  }
};

struct diffsq{
  __host__ __device__ float operator()(float a, float b)
  {
    return (b-a)*(b-a);
  }
};

int main(){

  thrust::device_vector<float> oldvector(N);
  thrust::device_vector<float> newvector(N);
  oldvector[0] = 1.0f;  oldvector[1] = 2.0f;
  newvector[0] = 2.0f;  newvector[1] = 5.0f;

  float result = thrust::inner_product(oldvector.begin(), oldvector.end(),
                                       newvector.begin(), 0.0f,
                                       thrust::plus<float>(), diffsq());
  std::cout << "Result: " << result << std::endl;

  float result2 = thrust::transform_reduce(
      thrust::make_zip_iterator(thrust::make_tuple(oldvector.begin(), newvector.begin())),
      thrust::make_zip_iterator(thrust::make_tuple(oldvector.end(), newvector.end())),
      zdiffsq(), 0.0f, thrust::plus<float>());
  std::cout << "Result2: " << result2 << std::endl;
}

You can also investigate eliminating the functor definition used with the inner product example by using Thrust placeholders.
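
For instance, the diffsq functor can be replaced by a placeholder expression passed directly to thrust::inner_product. A minimal sketch, using the same vectors as the code above (thrust::placeholders is provided by <thrust/functional.h>):

using namespace thrust::placeholders;

float result3 = thrust::inner_product(oldvector.begin(), oldvector.end(),
                                      newvector.begin(), 0.0f,
                                      thrust::plus<float>(),
                                      (_1 - _2) * (_1 - _2));
// For the sample data this also yields 10: (1-2)^2 + (2-5)^2
std::cout << "Result3: " << result3 << std::endl;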

Even if you want to write your own CUDA code, the standard recommendation now for oft-used algorithms like parallel reductions and sorting is to use CUB.

And yes, the CUDA parallel reduction sample and the accompanying presentation are still a good basic introduction to a fast parallel reduction.

OTHER TIPS

Robert Crovella has already answered this question and has also suggested using CUB.

Unlike Thrust, CUB leaves performance-critical parameters, such as the choice of the specific reduction algorithm and the degree of concurrency, unbound and selectable by the user. These parameters can be tuned to maximize performance for a particular architecture and application. They can be specified at compile time, thus avoiding runtime performance penalties.
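
For instance, the block-wide reduction algorithm is selected through a template parameter when the primitive is specialized; a short sketch (assuming #include <cub/cub.cuh>; the enumerators shown are the alternatives CUB provides):

// Choose the reduction algorithm at compile time; alternatives include
// cub::BLOCK_REDUCE_RAKING and cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY.
typedef cub::BlockReduce<float, 256, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;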

Below is a fully worked example of how to use CUB for the residual calculation.

#include <cub/cub.cuh>
#include <cuda.h>

#include "Utilities.cuh"

#include <iostream>

#define BLOCKSIZE           256

const int N = 4096;

/******************************/
/* TRANSFORM REDUCTION KERNEL */
/******************************/
__global__ void TransformSumKernel(const float * __restrict__ indata1, const float * __restrict__ indata2, float * __restrict__ outdata) {

    unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;

    // --- Specialize BlockReduce for type float and for the block size used at launch.
    typedef cub::BlockReduce<float, BLOCKSIZE> BlockReduceT;

    // --- Shared memory needed by the block-wide reduction.
    __shared__ typename BlockReduceT::TempStorage  temp_storage;

    // --- Each thread computes its squared difference; out-of-range threads contribute
    //     zero, so that every thread of the block participates in the collective Sum().
    float val = 0.f;
    if (tid < N) val = (indata1[tid] - indata2[tid]) * (indata1[tid] - indata2[tid]);

    float result = BlockReduceT(temp_storage).Sum(val);

    // --- Thread 0 holds the block-wide sum; accumulate it into the global result.
    if (threadIdx.x == 0) atomicAdd(outdata, result);
}

/********/
/* MAIN */
/********/
int main() {

    // --- Allocate host-side memory for the input data and the result
    float *h_data1      = (float *)malloc(N * sizeof(float));
    float *h_data2      = (float *)malloc(N * sizeof(float));
    float *h_result     = (float *)malloc(sizeof(float));

    float *d_data1;     gpuErrchk(cudaMalloc(&d_data1, N * sizeof(float)));
    float *d_data2;     gpuErrchk(cudaMalloc(&d_data2, N * sizeof(float)));
    float *d_result;    gpuErrchk(cudaMalloc(&d_result, sizeof(float)));

    for (int i = 0; i < N; i++) {
        h_data1[i] = 1.f;
        h_data2[i] = 3.f;
    }

    gpuErrchk(cudaMemcpy(d_data1, h_data1, N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_data2, h_data2, N * sizeof(float), cudaMemcpyHostToDevice));

    // --- The kernel accumulates with atomicAdd, so the result must start from zero.
    gpuErrchk(cudaMemset(d_result, 0, sizeof(float)));

    TransformSumKernel<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_data1, d_data2, d_result);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost));

    std::cout << "output: ";
    std::cout << h_result[0];
    std::cout << std::endl;

    gpuErrchk(cudaFree(d_data1));
    gpuErrchk(cudaFree(d_data2));
    gpuErrchk(cudaFree(d_result));

    return 0;
}
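
As an alternative to the hand-written kernel above, the same fused transform-reduction can be expressed with CUB's device-wide primitives. The following is a minimal sketch, not taken from the original answer; the DiffSq functor and the two-phase temporary-storage query are illustrative:

#include <cub/cub.cuh>

// Maps an index to the squared difference of the two input arrays.
struct DiffSq {
    const float *a, *b;
    __host__ __device__ float operator()(const int i) const {
        float d = a[i] - b[i];
        return d * d;
    }
};

void residualWithDeviceReduce(const float *d_data1, const float *d_data2, float *d_result, const int n) {

    cub::CountingInputIterator<int> counting(0);
    cub::TransformInputIterator<float, DiffSq, cub::CountingInputIterator<int> >
        transformed(counting, DiffSq{d_data1, d_data2});

    // --- First call: query the amount of temporary storage required.
    void   *d_temp_storage    = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, transformed, d_result, n);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // --- Second call: perform the reduction; d_result receives the sum.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, transformed, d_result, n);
    cudaFree(d_temp_storage);
}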
Licensed under: CC-BY-SA with attribution