Standard deviation using CUDA

https://stackoverflow.com/questions/12380966

01-07-2021
|

Domanda

I am writing a code to find the mean and standard deviation of 6 vectors with 8000 elements each. I was wondering if I could do it using CUDA and speed up the operation. I could think of how to find the mean using CUDA but I am unable to understand how to calculate standard deviation using CUDA. Can anyone help me here please?

Soluzione

I have solved this problem in CUDA for data mining. I havent used any libraries. But, it gave me good results. The problem is to find the standard deviation and mean of 128*1 million samples. This is what i did.

My device has the shared memory of 16KB. And, i am using floats. So, the shared memory can accommodate 4,000 elements. The max thread per block for my device is 512. So, i can have 8 blocks. If i divide 16KB into 8 blocks, i'll get 2,000KB (i.e., 1 float for 1 thread). Generally, this wont match. If you have a better device you need to do this math again.
To find the standard deviation, each block has 512 elements. You can find the square(element-mean) using a single thread.
The next challenge is adding this and finding the sum of these elements. Try the same method you employed for finding the mean. For 512 elements. Copy the result to global memory.
Iterate. Find the square root of the result.

PS: Plan accordingly so that the global memory calls are minimum. Mean and Standard deviation calls the data frequently from the memory.

Altri suggerimenti

Here is a Thrust example that calculates a number of summary statistics in a single pass, including mean and std. deviation.

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <cmath>
#include <limits>

// This example computes several statistical properties of a data
// series in a single reduction.  The algorithm is described in detail here:
// http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
//
// Thanks to Joseph Rhoads for contributing this example


// structure used to accumulate the moments and other 
// statistical properties encountered so far.
template <typename T>
struct summary_stats_data
{
    T n;
    T min;
    T max;
    T mean;
    T M2;
    T M3;
    T M4;

    // initialize to the identity element
    void initialize()
    {
      n = mean = M2 = M3 = M4 = 0;
      min = std::numeric_limits<T>::max();
      max = std::numeric_limits<T>::min();
    }

    T variance()   { return M2 / (n - 1); }
    T variance_n() { return M2 / n; }
    T skewness()   { return std::sqrt(n) * M3 / std::pow(M2, (T) 1.5); }
    T kurtosis()   { return n * M4 / (M2 * M2); }
};

// stats_unary_op is a functor that takes in a value x and
// returns a variace_data whose mean value is initialized to x.
template <typename T>
struct summary_stats_unary_op
{
    __host__ __device__
    summary_stats_data<T> operator()(const T& x) const
    {
         summary_stats_data<T> result;
         result.n    = 1;
         result.min  = x;
         result.max  = x;
         result.mean = x;
         result.M2   = 0;
         result.M3   = 0;
         result.M4   = 0;

         return result;
    }
};

// summary_stats_binary_op is a functor that accepts two summary_stats_data 
// structs and returns a new summary_stats_data which are an
// approximation to the summary_stats for 
// all values that have been agregated so far
template <typename T>
struct summary_stats_binary_op 
    : public thrust::binary_function<const summary_stats_data<T>&, 
                                     const summary_stats_data<T>&,
                                           summary_stats_data<T> >
{
    __host__ __device__
    summary_stats_data<T> operator()(const summary_stats_data<T>& x, const summary_stats_data <T>& y) const
    {
        summary_stats_data<T> result;

        // precompute some common subexpressions
        T n  = x.n + y.n;
        T n2 = n  * n;
        T n3 = n2 * n;

        T delta  = y.mean - x.mean;
        T delta2 = delta  * delta;
        T delta3 = delta2 * delta;
        T delta4 = delta3 * delta;

        //Basic number of samples (n), min, and max
        result.n   = n;
        result.min = thrust::min(x.min, y.min);
        result.max = thrust::max(x.max, y.max);

        result.mean = x.mean + delta * y.n / n;

        result.M2  = x.M2 + y.M2;
        result.M2 += delta2 * x.n * y.n / n;

        result.M3  = x.M3 + y.M3;
        result.M3 += delta3 * x.n * y.n * (x.n - y.n) / n2; 
        result.M3 += (T) 3.0 * delta * (x.n * y.M2 - y.n * x.M2) / n;

        result.M4  = x.M4 + y.M4;
        result.M4 += delta4 * x.n * y.n * (x.n * x.n - x.n * y.n + y.n * y.n) / n3;
        result.M4 += (T) 6.0 * delta2 * (x.n * x.n * y.M2 + y.n * y.n * x.M2) / n2;
        result.M4 += (T) 4.0 * delta * (x.n * y.M3 - y.n * x.M3) / n;

        return result;
    }
};

template <typename Iterator>
void print_range(const std::string& name, Iterator first, Iterator last)
{
    typedef typename std::iterator_traits<Iterator>::value_type T;

    std::cout << name << ": ";
    thrust::copy(first, last, std::ostream_iterator<T>(std::cout, " "));  
    std::cout << "\n";
}


int main(void)
{
    typedef float T;

    // initialize host array
    T h_x[] = {4, 7, 13, 16};

    // transfer to device
    thrust::device_vector<T> d_x(h_x, h_x + sizeof(h_x) / sizeof(T));

    // setup arguments
    summary_stats_unary_op<T>  unary_op;
    summary_stats_binary_op<T> binary_op;
    summary_stats_data<T>      init;

    init.initialize();

    // compute summary statistics
    summary_stats_data<T> result = thrust::transform_reduce(d_x.begin(), d_x.end(), unary_op, init, binary_op);

    std::cout <<"******Summary Statistics Example*****"<<std::endl;
    print_range("The data", d_x.begin(), d_x.end());

    std::cout <<"Count              : "<< result.n << std::endl;
    std::cout <<"Minimum            : "<< result.min <<std::endl;
    std::cout <<"Maximum            : "<< result.max <<std::endl;
    std::cout <<"Mean               : "<< result.mean << std::endl;
    std::cout <<"Variance           : "<< result.variance() << std::endl;
    std::cout <<"Standard Deviation : "<< std::sqrt(result.variance_n()) << std::endl;
    std::cout <<"Skewness           : "<< result.skewness() << std::endl;
    std::cout <<"Kurtosis           : "<< result.kurtosis() << std::endl;

    return 0;
}

This is outside my area of expertise, but there are single-pass iterative algorithms for computing standard deviation which may be convertible into a reduction. In particular, I am thinking of Welford's Algorithm, as described in Knuth, TAOCP, vol. 2. One drawback is that it requires a division at every step, but this will likely balance well with necessary memory accesses. A useable online reference for the algorithm appears to be:

http://www.johndcook.com/standard_deviation.html

A late answer, but I have solved this problem using thrust::transform_reduce in my code (tested with 100k floats on a GTX 1070):

#include <thrust/transform_reduce.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>

#include <functional>
#include <cmath>

/*
 * @struct varianceshifteop
 * @brief a unary function that shifts input data
 * by their mean and computes the squares of them
 */
struct varianceshifteop
    : std::unary_function<float, float>
{
    varianceshifteop(float m)
        : mean(m)
    { /* no-op */ }

    const float mean;

    __device__ float operator()(float data) const
    {
        return ::pow(data - mean, 2.0f);
    }
};

int main(int argc, char** argv)
{
    thrust::device_vector<float> data{ ... };

    // sum elements and divide by the number of elements
    float mean = thrust::reduce(
        data.cbegin(),
        data.cend(),
        0.0f,
        thrust::plus<float>()) / data.size();

    // shift elements by mean, square, and add them
    float variance = thrust::transform_reduce(
            data.cbegin(),
            data.cend(),
            varianceshifteop(mean),
            0.0f,
            thrust::plus<float>()) / (data.size() - 1);

    // standard dev is just a sqrt away
    float stdv = std::sqrtf(variance);

    return 0;
}

Autorizzato sotto: CC-BY-SA insieme a attribuzione

Non affiliato a StackOverflow