Question

I'm trying to code integration by Simpson's method in CUDA.

This is the formula for Simpson's rule

enter image description here

where x_k = a + k*h.

Here's my code

    __device__ void initThreadBounds(int *n_start, int *n_end, int n, 
                                        int totalBlocks, int blockWidth)
    {
        int threadId = blockWidth * blockIdx.x + threadIdx.x;
        int nextThreadId = threadId + 1;

        int threads = blockWidth * totalBlocks;

        *n_start = (threadId * n)/ threads;
        *n_end =  (nextThreadId * n)/ threads;
    }

    __device__ float reg_func (float x)
    {
        return x;
    }

    typedef float (*p_func) (float);

    __device__ p_func integrale_f = reg_func;

    __device__ void integralSimpsonMethod(int totalBlocks, int totalThreads, 
                    double a, double b, int n, float p_function(float), float* result)
    {
        *result = 0;

        float h = (b - a)/n; 
        //*result = p_function(a)+p_function(a + h * n);
        //parallel
        int idx_start;
        int idx_end;
        initThreadBounds(&idx_start, &idx_end, n-1, totalBlocks, totalThreads);
        //parallel_ends
        for (int i = idx_start; i < idx_end; i+=2) {
            *result +=  ( p_function(a + h*(i-1)) + 
                          4 * p_function(a + h*(i)) + 
                          p_function(a + h*(i+1)) ) * h/3;

        }   
    } 


    __global__ void integralSimpson(int totalBlocks, int totalThreads,  float* result)
    {
        float res = 0;

        integralSimpsonMethod(totalBlocks, totalThreads, 0, 10, 1000, integrale_f, &res);
        result[(blockIdx.x*totalThreads + threadIdx.x)] = res;

        //printf ("Simpson method\n");
    }


    __host__ void inttest()
    {

        const int blocksNum = 32;
        const int threadNum = 32;

        float   *device_resultf; 
        float   host_resultf[threadNum*blocksNum]={0};


        cudaMalloc((void**) &device_resultf, sizeof(float)*threadNum*blocksNum);
            integralSimpson<<<blocksNum, threadNum>>>(blocksNum, threadNum, device_resultf);
        cudaThreadSynchronize();

        cudaMemcpy(host_resultf, device_resultf, sizeof(float) *threadNum*blocksNum, 
                      cudaMemcpyDeviceToHost);

        float sum = 0;
        for (int i = 0; i != blocksNum*threadNum; ++i) {
            sum += host_resultf[i];
            //  printf ("result in %i cell = %f \n", i, host_resultf[i]);
        }
        printf ("sum = %f \n", sum);
        cudaFree(device_resultf);
    }

int main(int argc, char* argv[])
{


   inttest();


    int i;
    scanf ("%d",&i);

}

The problem is: it works wrong when n is lower than 100000. For an integral from 0 to 10, the result is ~99, but when n = 100000 or larger it works fine and the result is ~50.

What's wrong, guys?

Was it helpful?

Solution

The basic problem here is that you don't understand your own algorithm.

Your integralSimpsonMethod() function is designed such that each thread is sampling at least 3 quadrature points per sub-interval in the integral domain. Therefore, if you choose n so that it is less than four times the number of threads in the kernel call, it is inevitable that each sub interval will overlap and the resulting integral will be incorrect. You need to make sure that the code checks and scales the thread count or n so that they don't produce overlap when the integral is computed.

If you are doing this for anything other than self-edification, then I recommend you look up the composite version of Simpson's rule. This is much better suited to parallel implementation and will be considerably more performant if implemented correctly.

OTHER TIPS

I would propose an approach to Simpson's integration by using CUDA Thrust. You basically need five steps:

  1. Generate the Simpson's quadrature weights;
  2. Generate the function sampling points;
  3. Generate the function values;
  4. Calculate the elementwise product between the quadrature weights and the function values;
  5. Sum the above products.

Step #1 requires creating an array with elements repeated many times, namely, 1 4 2 4 2 4 ... 1 for the Simpson's case. This can be accomplished by borrowing Robert Crovella's approach in cuda thrust library repeat vector multiple times.

Step #2 can be accomplished by using couting_iterators and borrowing talonmies approach in Purpose and usage of counting_iterators in CUDA Thrust library.

Step #3 is an application of thrust::transform.

Steps #4 and #5 can be accomplished together by thrust::inner_product.

This approach can be exploited also for use when other quadrature integration rules are of interest.

Here is the code

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/inner_product.h>
#include <thrust/functional.h>

#include <thrust/fill.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// for printing
#include <thrust/copy.h>
#include <ostream>

#define STRIDE 2
#define N 100

#define pi_f  3.14159265358979f                 // Greek pi in single precision

struct sin_functor
{
    __host__ __device__
    float operator()(float x) const
    {
        return sin(2.f*pi_f*x);
    }
};

template <typename Iterator>
class strided_range
{
    public:

    typedef typename thrust::iterator_difference<Iterator>::type difference_type;

    struct stride_functor : public thrust::unary_function<difference_type,difference_type>
    {
        difference_type stride;

        stride_functor(difference_type stride)
            : stride(stride) {}

        __host__ __device__
        difference_type operator()(const difference_type& i) const
        {
            return stride * i;
        }
    };

    typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
    typedef typename thrust::transform_iterator<stride_functor, CountingIterator> TransformIterator;
    typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

    // type of the strided_range iterator
    typedef PermutationIterator iterator;

    // construct strided_range for the range [first,last)
    strided_range(Iterator first, Iterator last, difference_type stride)
    : first(first), last(last), stride(stride) {}

    iterator begin(void) const
    {
        return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
    }

    iterator end(void) const
    {
        return begin() + ((last - first) + (stride - 1)) / stride;
    }

    protected:
        Iterator first;
        Iterator last;
        difference_type stride;
};

int main(void)
{
    // --- Generate the integration coefficients
    thrust::host_vector<float> h_coefficients(STRIDE);
    h_coefficients[0] = 4.f;
    h_coefficients[1] = 2.f;

    thrust::device_vector<float> d_coefficients(N);

    typedef thrust::device_vector<float>::iterator Iterator;
    strided_range<Iterator> pos1(d_coefficients.begin()+1, d_coefficients.end()-2, STRIDE);
    strided_range<Iterator> pos2(d_coefficients.begin()+2, d_coefficients.end()-1, STRIDE);

    thrust::fill(pos1.begin(), pos1.end(), h_coefficients[0]);
    thrust::fill(pos2.begin(), pos2.end(), h_coefficients[1]);

    d_coefficients[0]       = 1.f;
    d_coefficients[N-1]     = 1.f;

    // print the generated d_coefficients
    std::cout << "d_coefficients: ";
    thrust::copy(d_coefficients.begin(), d_coefficients.end(), std::ostream_iterator<float>(std::cout, " "));  std::cout << std::endl;

    // --- Generate sampling points
    float a     = 0.f;
    float b     = .5f;

    float Dx    = (b-a)/(float)(N-1);

    thrust::device_vector<float> d_x(N);

    thrust::transform(thrust::make_counting_iterator(a/Dx),
        thrust::make_counting_iterator((b+1.f)/Dx),
        thrust::make_constant_iterator(Dx),
        d_x.begin(),
        thrust::multiplies<float>());

    // --- Calculate function values
    thrust::device_vector<float> d_y(N);

    thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), sin_functor());

    // --- Calculate integral
    float integral = (Dx/3.f) * thrust::inner_product(d_y.begin(), d_y.begin() + N, d_coefficients.begin(), 0.0f);

    printf("The integral is = %f\n", integral);

    getchar();

    return 0;
}
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top