thrust::inner_product will do it in a single function call. Your original idea can be made to work also (zipping together the two vectors and using thrust::transform_reduce
) This code demonstrates both methods:
#include <iostream>
#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#define N 2
struct zdiffsq{
template <typename Tuple>
__host__ __device__ float operator()(Tuple a)
{
float result = thrust::get<0>(a) - thrust::get<1>(a);
return result*result;
}
};
struct diffsq{
__host__ __device__ float operator()(float a, float b)
{
return (b-a)*(b-a);
}
};
int main(){
thrust::device_vector<float> oldvector(N);
thrust::device_vector<float> newvector(N);
oldvector[0] = 1.0f; oldvector[1] = 2.0f;
newvector[0] = 2.0f; newvector[1] = 5.0f;
float result = thrust::inner_product(oldvector.begin(), oldvector.end(), newvector.begin(), 0.0f, thrust::plus<float>(), diffsq());
std::cout << "Result: " << result << std::endl;
float result2 = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(oldvector.begin(), newvector.begin())), thrust::make_zip_iterator(thrust::make_tuple(oldvector.end(), newvector.end())), zdiffsq(), 0.0f, thrust::plus<float>());
std::cout << "Result2: " << result2 << std::endl;
}
You can also investigate eliminating the functor definition used with the inner product example, by using thrust placeholders.
Even if you want to write your own CUDA code, the standard recommendation now for oft-used algorithms like parallel reductions and sorting, is to use cub.
And yes, the CUDA parallel reduction sample and accompanying presentation is still a good basic intro to a fast parallel reduction.