finding quartiles

https://stackoverflow.com/questions/11964552

26-06-2021
|

Вопрос

I've written a program where the user can enter any number of values into a vector and it's supposed to return the quartiles, but I keep getting a "vector subscript out of range" error :

#include "stdafx.h"
#include <iostream>
#include <string>
#include <algorithm>
#include <iomanip>
#include <ios>
#include <vector>

int main () {
    using namespace std;

    cout << "Enter a list of numbers: ";

    vector<double> quantile;
    double x;
    //invariant: homework contains all the homework grades so far
    while (cin >> x)
        quantile.push_back(x);

    //check that the student entered some homework grades
    //typedef vector<double>::size_type vec_sz;
    int size = quantile.size();

    if (size == 0) {
        cout << endl << "You must enter your numbers . "
                        "Please try again." << endl;
        return 1;
    }

    sort(quantile.begin(), quantile.end());

    int mid = size/2;
    double median;
    median = size % 2 == 0 ? (quantile[mid] + quantile[mid-1])/2 : quantile[mid];

    vector<double> first;
    vector<double> third;

    for (int i = 0; i!=mid; ++i)
    {
        first[i] = quantile[i];

    }

        for (int i = mid; i!= size; ++i)
    {
        third[i] = quantile[i];
    }
        double fst;
        double trd;

        int side_length = 0;

        if (size % 2 == 0) 
        {
            side_length = size/2;
        }
        else {
            side_length = (size-1)/2;
        }

        fst = (size/2) % 2 == 0 ? (first[side_length/2]/2 + first[(side_length-1)/2])/2 : first[side_length/2];
        trd = (size/2) % 2 == 0 ? (third[side_length/2]/2 + third[(side_length-1)/2])/2 : third[side_length/2];

    streamsize prec = cout.precision();
    cout << "The quartiles are" <<  setprecision(3) << "1st"
        << fst << "2nd" << median << "3rd" << trd << setprecision(prec) << endl;

    return 0;   

}

Решение

Instead of doing std::sort(quantile.begin(), quantile.end()) a somewhat cheaper way would be

auto const Q1 = quantile.size() / 4;
auto const Q2 = quantile.size() / 2;
auto const Q3 = Q1 + Q2;

std::nth_element(quantile.begin(),          quantile.begin() + Q1, quantile.end());
std::nth_element(quantile.begin() + Q1 + 1, quantile.begin() + Q2, quantile.end());
std::nth_element(quantile.begin() + Q2 + 1, quantile.begin() + Q3, quantile.end());

This would not sort the complete array, but only do a "between groups" sort of the 4 quartile. This saves on the "within groups" sort that a full std::sort would do.

If your quantile array is not large, it's a small optimization. But the scaling behavior of std::nth_element is O(N) however, rather than O(N log N) of a std::sort.

Другие советы

Here is Quantile function which is MATLAB's equivalent with linear interpolation:

#include <algorithm>
#include <cmath>
#include <vector>

template<typename T>
static inline double Lerp(T v0, T v1, T t)
{
    return (1 - t)*v0 + t*v1;
}

template<typename T>
static inline std::vector<T> Quantile(const std::vector<T>& inData, const std::vector<T>& probs)
{
    if (inData.empty())
    {
        return std::vector<T>();
    }

    if (1 == inData.size())
    {
        return std::vector<T>(1, inData[0]);
    }

    std::vector<T> data = inData;
    std::sort(data.begin(), data.end());
    std::vector<T> quantiles;

    for (size_t i = 0; i < probs.size(); ++i)
    {
        T poi = Lerp<T>(-0.5, data.size() - 0.5, probs[i]);

        size_t left = std::max(int64_t(std::floor(poi)), int64_t(0));
        size_t right = std::min(int64_t(std::ceil(poi)), int64_t(data.size() - 1));

        T datLeft = data.at(left);
        T datRight = data.at(right);

        T quantile = Lerp<T>(datLeft, datRight, poi - left);

        quantiles.push_back(quantile);
    }

    return quantiles;
}

Find quartiles:

std::vector<double> in = { 1,2,3,4,5,6,7,8,9,10,11 };
auto quartiles = Quantile<double>(in, { 0.25, 0.5, 0.75 });

You need to preallocate first and third vectors before you set the contents.

vector<double> first(mid);
vector<double> third(size-mid);

or use push_back instead of assignments to first[i] and third[i]

This C++ template function calculates quartile for you. It assumes x to be sorted.

#include <assert.h>

template <typename T1, typename T2> typename T1::value_type quant(const T1 &x, T2 q)
{
    assert(q >= 0.0 && q <= 1.0);
    
    const auto n  = x.size();
    const auto id = (n - 1) * q;
    const auto lo = floor(id);
    const auto hi = ceil(id);
    const auto qs = x[lo];
    const auto h  = (id - lo);
    
    return (1.0 - h) * qs + h * x[hi];
}

To use it do:

std::vector<float> x{1,1,2,2,3,4,5,6};
std::cout << quant(x, 0.25) << std::endl;
std::cout << quant(x, 0.50) << std::endl;
std::cout << quant(x, 0.75) << std::endl;

If only one element in vector, this instruction is out of range:

quantile[mid-1]

"i" starting from mid so third[0] is out of range

for (int i = mid; i!= size; ++i)
{
    third[i] = quantile[i];
}

Here is one error:

vector<double> first;
vector<double> third;

for (int i = 0; i!=mid; ++i)
{
    first[i] = quantile[i];
}

The vector first doesn't have any contents, but you try to access the contents. Same problem with third and its loop. Do you mean to use push_back instead?

Implementation for weighted quantiles

This implements a weight functionality for the quantile function with linear interpolation in between the gridpoints.

#include <vector>
#include <numeric>
#include <algorithm>
#include <iostream>
#include <assert.h>

// https://stackoverflow.com/a/12399290/7128154
template <typename T>
std::vector<size_t> sorted_index(const std::vector<T> &v) {

  std::vector<size_t> idx(v.size());
  iota(idx.begin(), idx.end(), 0);

  stable_sort(idx.begin(), idx.end(),
       [&v](size_t i1, size_t i2) {return v[i1] < v[i2];});

  return idx;
}
// https://stackoverflow.com/a/1267878/7128154
template< typename order_iterator, typename value_iterator >
void reorder( order_iterator order_begin, order_iterator order_end, value_iterator v )  {
    typedef typename std::iterator_traits< value_iterator >::value_type value_t;
    typedef typename std::iterator_traits< order_iterator >::value_type index_t;
    typedef typename std::iterator_traits< order_iterator >::difference_type diff_t;

    diff_t remaining = order_end - 1 - order_begin;
    for ( index_t s = index_t(), d; remaining > 0; ++ s ) {
        for ( d = order_begin[s]; d > s; d = order_begin[d] ) ;
        if ( d == s ) {
            -- remaining;
            value_t temp = v[s];
            while ( d = order_begin[d], d != s ) {
                swap( temp, v[d] );
                -- remaining;
            }
            v[s] = temp;
        }
    }
}

// https://stackoverflow.com/a/1267878/7128154
template< typename order_iterator, typename value_iterator >
void reorder_destructive( order_iterator order_begin, order_iterator order_end, value_iterator v )  {
    typedef typename std::iterator_traits< value_iterator >::value_type value_t;
    typedef typename std::iterator_traits< order_iterator >::value_type index_t;
    typedef typename std::iterator_traits< order_iterator >::difference_type diff_t;

    diff_t remaining = order_end - 1 - order_begin;
    for ( index_t s = index_t(); remaining > 0; ++ s ) {
        index_t d = order_begin[s];
        if ( d == (diff_t) -1 ) continue;
        -- remaining;
        value_t temp = v[s];
        for ( index_t d2; d != s; d = d2 ) {
            std::swap( temp, v[d] );
            std::swap( order_begin[d], d2 = (diff_t) -1 );
            -- remaining;
        }
        v[s] = temp;
    }
}



// https://stackoverflow.com/a/29677616/7128154
// https://stackoverflow.com/a/37708864/7128154
template <typename T>
double quantile(double q, std::vector<T> values, std::vector<double> weights = std::vector<double>())
{
    assert( 0. <= q && q <= 1. && "expecting quantile in range [0; 1]");
    if (weights.empty())
    {
        weights = std::vector<double>(values.size(), 1.);
    }
    else
    {
        assert (values.size() == weights.size()  && "values and weights missfit in quantiles");
        std::vector<size_t> inds = sorted_index(values);
        reorder_destructive(inds.begin(), inds.end(), weights.begin());
    }

    stable_sort(values.begin(), values.end());
    // values and weights are sorted now

    std::vector<double> quantiles (weights.size());
    quantiles[0] = weights[0];
    for (int ii = 1; ii < quantiles.size(); ii++)
    {
        quantiles[ii] = quantiles[ii-1] + weights[ii];
    }
    double norm = std::accumulate(weights.begin(), weights.end(), 0.0);
    int ind = 0;
    double qCurrent = 0;
    for (; ind < quantiles.size(); ind++)
    {
        qCurrent = (quantiles[ind] - weights[ind] / 2. ) / norm;
        quantiles[ind] = qCurrent;
        if (qCurrent > q)
        {
            if (ind == 0) {return values[0];}
            double rat = (q - quantiles[ind-1]) / (quantiles[ind] - quantiles[ind-1]);
            return values[ind-1] + (values[ind] - values[ind-1]) * rat;
        }
    }
    return values[values.size()-1];

}

template <typename T>
double quantile(double q, std::vector<T> values, std::vector<int> weights)
{
    std::vector<double> weights_double (weights.begin(), weights.end());
    return quantile(q, values, weights_double);
}

int main()
{
    std::vector<int> vals {5, 15, 25, 35, 45, 55, 65, 75, 85, 95};
    std::cout << "quantile(0, vals)=" << quantile(0, vals) << std::endl;
    std::cout << "quantile(.73, vals)=" << quantile(.73, vals) << std::endl;

    std::vector<int> vals2 {1, 2, 3};
    std::vector<double> ws2 {1, 2, 3};
    std::cout << "quantile(.13, vals2, ws2)=" << quantile(.13, vals2, ws2) << std::endl;
}

Output

quantile(0, vals)=5
quantile(.73, vals)=73
quantile(.13, vals2, ws2)=1.18667

About weighted quantiles

While in the unweighted case, the input values form an equidistant distribution

values: [1, 2, 3] -> positions: [1/6, 3/6, 5/6]

in the weighted case, the distance is modified.

values: [1, 2, 3], weights: [1, 2, 1] -> positions: [1/8, 4/8, 7/8]

This is not equivalent to a repetition of values for the unweighted case

values: [1, 2, 2, 3] -> positions: [1/8, 3/8, 5/8, 7/8],

as a plateau is formed in between the repeated values. This means:

quantile(q=3/8, values=[1, 2, 2, 3]) = 2

but

quantile(q=3/8, values=[1, 2, 3], weights=[1, 2, 1]) = 1.67

Лицензировано под: CC-BY-SA с атрибуция

Не связан с StackOverflow