This looks like a reduction problem. I think you can use thrust::transform with zip iterators, followed by thrust::reduce_by_key. A sketch of this solution is:
// generate indices
std::vector< int > hindices;
for( size_t i=0 ; i<N_gene ; ++i )
for( size_t j=0 ; j<n_ka_d[i] ; ++j )
hindices.push_back( i );
thrust::device_vector< int > indices = hindices;
// generate tmp
// trafo1 implements get_coeff0( get< 0 >( t ) ) * get< 1 >( t);
thrust::device_vector< double > tmp( N );
thrust::transform(
thrust::make_zip_iterator(
thrust::make_tuple( ka_vec_d.begin() , ka_val_d.begin() ) ) ,
thrust::make_zip_iterator(
thrust::make_tuple( ka_vec_d.end() , ka_val_d.end() ) ) ,
tmp.begin() , trafo1 );
// do the reduction for each ac[i]
thrust::device_vector< int > indices_out( N );
thrust::reduce_by_key( indices.begin() , indices.end() , tmp.begin() ,
ac.begin() , indices_out.begin() );
// do the pow transformation
thrust::transform( ac.begin() , ac.end() , ac.begin() , pow_trafo );
I think this can also be optimized with transform_iterators to reduce the number of calls to thrust::transform and thrust::reduce_by_key.