functor with nested calls to CUDA::thrust functors operating as zip_iterator

StackOverflow https://stackoverflow.com/questions/21288323

  •  01-10-2022
  •  | 
  •  

Вопрос

I found some difficulties trying to implement ODEs solver routines running on GPUs using CUDA::Thurst iterators to solve a bunch of coupled first order equations in the GPU. I want to work around the approach in the former question enabling the user to write the systems of equations as human like as possible using arbitrary functors working on tuples of vectors. In details, here is a small piece of code:

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>

#include <iostream>
#include <math.h>


__host__ __device__ float f1(float x)
{
  return sinf(x);
}

__host__ __device__ float f2(float x)
{
  return cosf(x);
}

__host__ __device__ float Vx(float x)
{
  return sinf(x);
}

struct q_dot
{
  float x;
  float delta;
  q_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float p = thrust::get<1>(t) + delta;
    return  p/MASS;
  }
};


struct p_dot
{
  float x;
  float delta;
  p_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float q = thrust::get<0>(t) +   delta;
    return  -Vx(q);
  }
};




struct euler_functor
{
  unsigned fn;
  float h;
  float er;
  float x0;

  euler_functor(unsigned _fn,float _x0,float _h, float _er) : fn(_fn),h(_h),er(_er),x0(_x0) {};
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t) const  {
    // if (fn == 1) y = h*f1(y);
    //else if (fn == 2) y = h*f2(y); This can be handled in this way?

    q =  h*p_dot(x0,h/2)(t);
    p =  h*p_dot(x0,h/2)(t);
    float er_p,er_q;
    er_p=0.5*h*p_dot(x0,h/2)(t);
    er_q=0.5*h*q_dot(x0,h/2)(t);
    er = er_p;

  }
};


int main(void)
{
  float t=0;
  float t_step=0.1;
  float error;


  const unsigned N = 8;
  // allocate three device_vectors with 10 elements
  thrust::device_vector<float> Q(N),P(N);
  // initilaize to some values
  thrust::sequence(Q.begin(), Q.end(),  0.0f, (float)(6.283/(float)N));
  // initilaize to some values
  thrust::sequence(P.begin(), P.end(),  0.0f, (float)(10.283/(float)N));

  // apply euler for each element of Q and P
  //thrust::for_each(X.begin(),X.end(),euler_functor(1,t,t_step,error)); this becomes:
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(Q.begin(),P.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(  Q.end(),  P.end())),euler_functor(1,t,t_step,error));
  // print the values
  for(int i = 0; i < N; i++) std::cout<< Q[i]<<"  "<<P[i]<< std::endl;
}

But when I compile the former code I get a lot of errors. Again, I am not sure if this is the best way to do it. How can I make it work? Where are my error in it? Is there a better approach? as It is the er variable that carries information about the numerical error is returning always zero. Howto get this information? It can be use to implement some adaptive trick.

Это было полезно?

Решение

There were a number of problems with your code. I'm not sure I will capture all of them here, but:

  1. MASS was undefined.
  2. Your p_dot and q_dot functors needed additional __device__ decorations
  3. Your usage of the p and q variables inside the euler functor did not make any sense; they are not defined anywhere, nor is this the correct way to return values into P and Q vectors, if that was your intent.
  4. We don't return data through the variable passed to the functor at instantiation. Therefore to return the er variable at each timestep, I create a separate vector (ERP and ERQ) to do so.

Here is a modified code which has the above issues and various other issues fixed. It seems to return sensible results, although I've not checked the arithmetic in detail.

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/fill.h>
#include <thrust/replace.h>
#include <thrust/functional.h>
#include <thrust/for_each.h>
#include <thrust/iterator/zip_iterator.h>

#include <iostream>
#include <math.h>

#define MASS 1.0f

__host__ __device__ float f1(float x)
{
  return sinf(x);
}

__host__ __device__ float f2(float x)
{
  return cosf(x);
}

__host__ __device__ float Vx(float x)
{
  return sinf(x);
}

struct q_dot
{
  float x;
  float delta;
  __host__ __device__
  q_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float p = thrust::get<1>(t) + delta;
    return  p/MASS;
  }
};


struct p_dot
{
  float x;
  float delta;
  __host__ __device__
  p_dot(float _x,float _delta): x(_x),delta(_delta){};
  template <typename Tuple>
  __host__ __device__
  float operator()(Tuple t)
  {
    float q = thrust::get<0>(t) +   delta;
    return  -Vx(q);
  }
};



struct euler_functor
{
  unsigned fn;
  float h;
  float x0;

  euler_functor(unsigned _fn,float _x0,float _h) : fn(_fn),h(_h),x0(_x0) {};
  template <typename Tuple>
  __host__ __device__
  void operator()(const Tuple &t) {
    // if (fn == 1) y = h*f1(y);
    //else if (fn == 2) y = h*f2(y); 
    float t0, t1, t2, t3;
    t0 =  h*p_dot(x0,h/2.0f)(t);
    t1 =  h*q_dot(x0,h/2.0f)(t);
    t2=0.5*h*p_dot(x0,h/2.0f)(t);
    t3=0.5*h*q_dot(x0,h/2.0f)(t);
    thrust::get<0>(t) = t0;
    thrust::get<1>(t) = t1;
    thrust::get<2>(t) = t2;
    thrust::get<3>(t) = t3;

  }
};


int main(void)
{
  float t=0;
  float t_step=0.1;


  const unsigned N = 8;
  // allocate three device_vectors with 10 elements
  thrust::device_vector<float> Q(N),P(N), ERP(N), ERQ(N);
  // initilaize to some values
  thrust::sequence(Q.begin(), Q.end(),  0.0f, (float)(6.283/(float)N));
  // initilaize to some values
  thrust::sequence(P.begin(), P.end(),  0.0f, (float)(10.283/(float)N));
  for(int i = 0; i < N; i++) std::cout<< Q[i]<<" "<<P[i]<< " "<< ERP[i] << " " << ERQ[i] << std::endl;
  std::cout<< "*****" << std::endl;
  // apply euler for each element of Q and P
  //thrust::for_each(X.begin(),X.end(),euler_functor(1,t,t_step,error)); this becomes:
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(Q.begin(),P.begin(),ERP.begin(), ERQ.begin())),thrust::make_zip_iterator(thrust::make_tuple(Q.end(),P.end(),ERP.end(), ERQ.end())),euler_functor(1,t,t_step));
  // print the values
  for(int i = 0; i < N; i++) std::cout<< Q[i]<<" "<<P[i]<< " "<< ERP[i] << " " << ERQ[i] << std::endl;
}
Лицензировано под: CC-BY-SA с атрибуция
Не связан с StackOverflow
scroll top