Simpson's method to integrate real valued functions with CUDA

Question 1

The basic problem here is that you don't understand your own algorithm.

Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.

If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.

Question 2

I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:

Generate the Simpson's quadrature weights;
Generate the function sampling points;
Generate the function values;
Calculate the elementwise product between the quadrature weights and the function values;
Sum the above products.

Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.

Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.

Step #3 is an application of thrust::transform.

Steps #4 and #5 can be accomplished together by thrust::inner_product.

This approach can be exploited also for use when other quadrature integration rules are of interest.

Here is the code

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>

#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// for printing
#include <thrust/copy.h>
#include <ostream>

#define STRIDE 2
#define N 100

#define pi_f  3.14159265358979f                 // Greek pi in single precision

struct sin_functor
{
    __host__ __device__
    float operator()(float x) const
    {
        return sin(2.f*pi_f*x);
    }
};

template <typename Iterator>
class strided_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
    : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
        Iterator first;
        Iterator last;
        difference_type stride;
};

int main(void)
{
    // --- Generate the integration coefficients
    thrust::host_vector<float> h_coefficients(STRIDE);
    h_coefficients[0] = 4.f;
    h_coefficients[1] = 2.f;

    thrust::device_vector<float> d_coefficients(N);

    typedef thrust::device_vector<float>::iterator Iterator;
    strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
    strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);

    thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
    thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);

    d_coefficients[0]       = 1.f;
    d_coefficients[N-1]     = 1.f;

    // print the generated d_coefficients
    std::cout << "d_coefficients: ";
    thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " "));  std::cout << std::endl;

    // --- Generate sampling points
    float a     = 0.f;
    float b     = .5f;

    float Dx    = (b-a)/(float)(N-1);

    thrust::device_vector<float> d_x(N);

    thrust::transform(thrust::make_counting_iterator(a/Dx),
        thrust::make_counting_iterator((b+1.f)/Dx),
        thrust::make_constant_iterator(Dx),
        d_x.begin(),
        thrust::multiplies<float>());

    // --- Calculate function values
    thrust::device_vector<float> d_y(N);

    thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());

    // --- Calculate integral
    float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);

    printf("The integral is = %f\n", integral);

    getchar();

    return 0;
}