Rcpp equivalent for rowsum [closed]

Question 1

Here's my attempt at doing this using Rcpp (first time using the package, so do point out my inefficiencies):

library(inline)
library(Rcpp)

rowsum_helper = cxxfunction(signature(x = "numeric", y = "integer"), '
  NumericVector var(x);
  IntegerVector factor(y);

  std::vector<double> sum(*std::max_element(factor.begin(), factor.end()) + 1,
                          std::numeric_limits<double>::quiet_NaN());
  for (int i = 0, size = var.size(); i < size; ++i) {
    if (sum[factor[i]] != sum[factor[i]]) sum[factor[i]] = var[i];
    else sum[factor[i]] += var[i];
  }

  return NumericVector(sum.begin(), sum.end());
', plugin = "Rcpp")

rowsum_fast = function(x, y) {
  res = rowsum_helper(x, y)
  elements = which(!is.nan(res))
  list(elements - 1, res[elements])
}

It's pretty fast for Martin's example data, but will only work if the factor consists of non-negative integers and will consume memory on the order of the largest integer in the factor vector (one obvious improvement to the above is to subtract min from max to decrease memory usage - which can be done in either the R function or the C++ one).

n = 1e7; x = runif(n); f = sample(n/2, n, T)

system.time(rowsum(x,f))
#    user  system elapsed 
#   14.241  0.170  14.412

system.time({tabulate(f); sum(x)})
#    user  system elapsed 
#   0.216   0.027   0.252

system.time(rowsum_fast(x,f))
#    user  system elapsed 
#   0.313   0.045   0.358

Also note that a lot of the slowdown (as compared to tabulate) happens in the R code, so if you move that to C++ instead, you should see more improvement:

system.time(rowsum_helper(x,f))
#    user  system elapsed 
#   0.210   0.018   0.228

Here's a generalization that will handle almost any y, but will be a little bit slower (I'd actually prefer doing this in Rcpp, but don't know how to handle arbitrary R types there):

rowsum_fast = function(x, y) {
  if (is.numeric(y)) {
    y.min = min(y)
    y = y - y.min
    res = rowsum_helper(x, y)
  } else {
    y = as.factor(y)
    res = rowsum_helper(x, as.numeric(y))
  }

  elements = which(!is.nan(res))

  if (is.factor(y)) {
    list(levels(y)[elements-1], res[elements])
  } else {
    list(elements - 1 + y.min, res[elements])
  }
}

Question 2

Not an answer, but maybe helpful for framing the issue. Seems like worst-case performance is to sum many short groups, and this seems to scale linearly with the size of the vector

> n = 100000; x = runif(n); f = sample(n/2, n, TRUE)
> system.time(rowsum(x, f))
   user  system elapsed 
  0.228   0.000   0.229 
> n = 1000000; x = runif(n); f = sample(n/2, n, TRUE)
> system.time(rowsum(x, f)) 
   user  system elapsed 
  1.468   0.040   1.514 
> n = 10000000; x = runif(n); f = sample(n/2, n, TRUE)
> system.time(rowsum(x, f))
   user  system elapsed 
 17.369   0.748  18.166

There seem to be two short-cuts available, avoiding re-ordering

> n = 10000000; x = runif(n); f = sample(n/2, n, TRUE)
> system.time(rowsum(x, f, reorder=FALSE))
   user  system elapsed 
 16.501   0.476  17.025

and avoiding an internal coercion to character

> n = 10000000; x = runif(n); f = as.character(sample(n/2, n, TRUE)); 
> system.time(rowsum(x, f, reorder=FALSE))
   user  system elapsed 
  8.652   0.268   8.949

And then the basic operations that would seem to be involved -- figuring out the unique values of the grouping factor (to pre-allocate a result vector) and doing the sum

> n = 10000000; x = runif(n); f = sample(n/2, n, TRUE)
> system.time({ t = tabulate(f); sum(x) })
   user  system elapsed 
  0.640   0.000   0.643

so yes, it seems like there's quite a bit of scope for a faster single-purpose implementation. This seems like a natural for data.table, and not too hard to implement in C. Here's a mixed solution, using R to do the tabulation and the 'classic' C interface to do the sum

library(inline)

rowsum1.1 <- function(x, f) {
    t <- tabulate(f)
    crowsum1(x, f, t)
}

crowsum1 = cfunction(c(x_in="numeric", f_in="integer", t_in = "integer"), "
    SEXP res_out;
    double *x = REAL(x_in), *res;
    int len = Rf_length(x_in), *f = INTEGER(f_in);

    res_out = PROTECT(Rf_allocVector(REALSXP, Rf_length(t_in)));
    res = REAL(res_out);
    memset(res, 0, Rf_length(t_in) * sizeof(double));
    for (int i = 0; i < len; ++i)
        res[f[i] - 1] += x[i];
    UNPROTECT(1);
    return res_out;
")

with

> system.time(r1.1 <- rowsum1.1(x, f))
   user  system elapsed 
  1.276   0.092   1.373

To actually return a result that is identical to rowsum, this needs to be shaped as a matrix with appropriate dim names

rowsum1 <- function(x, f) {
    t <- tabulate(f)
    r <- crowsum1(x, f, t)
    keep <- which(t != 0)
    matrix(r[keep], ncol=1, dimnames=list(keep, NULL))
}

> system.time(r1 <- rowsum1(x, f))
   user  system elapsed 
  9.312   0.300   9.641

so for all that work we're only 2x faster (and much less general -- x must be numeric, f must be integer; no NA values). Yes, there are inefficiencies, e.g., allocating space levels that have no counts (though this avoids an expensive coercion to character vector for names).

Question 3

To complement Martin's code, here is some Rcpp based version.

int increment_maybe(int value, double vec_i){
    return vec_i == 0 ? value : ( value +1 ) ;  
}

// [[Rcpp::export]]
NumericVector cpprowsum2(NumericVector x, IntegerVector f){
    std::vector<double> vec(10) ;
    vec.reserve(1000); 
    int n=x.size(); 
    for( int i=0; i<n; i++){
        int index=f[i]; 
        while( index >= vec.size() ){
            vec.resize( vec.size() * 2 ) ;    
        }
        vec[ index ] += x[i] ;
    }
    // count the number of non zeros
    int s = std::accumulate( vec.begin(), vec.end(), 0, increment_maybe) ; 
    NumericVector result(s) ;
    CharacterVector names(s) ;

    std::vector<double>::iterator it = vec.begin() ;
    for( int i=0, j=0 ; j<s; j++ ,++it, ++i ){
        // move until the next non zero value
        while( ! *it ){ i++ ; ++it ;}
        result[j] = *it ;
        names[j]  = i ;
    }
    result.attr( "dim" ) = IntegerVector::create(s, 1) ;
    result.attr( "dimnames" ) = List::create(names, R_NilValue) ; 
    return result ;
}

The C++ code deals with everything, including formatting into the matrix format given by rowsum, and shows (slightly) better performance (at least on the example).

# from Martin's answer
> system.time(r1 <- rowsum1(x, f))
   user  system elapsed
  0.014   0.001   0.015

> system.time(r3 <- cpprowsum2(x, f))
   user  system elapsed
  0.011   0.001   0.013

> identical(r1, r3)
[1] TRUE

Question 4

In a comment and 'answer' that @Ben has deleted, it turns out that f is ordered and increasing.

n = 1e7; x = runif(n);
f <- cumsum(c(1L, sample(c(TRUE, FALSE), n - 1, TRUE)))

So

rowsum3 <- function(x, f)
{
    y <- cumsum(x)
    end <- c(f[-length(f)] != f[-1], TRUE)
    diff(c(0, y[end]))
}

is a common R solution (if one is not too concerned about precision), and

crowsum3 <- cfunction(c(x_in="numeric", f_in="integer"), "
    int j = 0, *f = INTEGER(f_in), len = Rf_length(f_in), 
        len_out = len == 0 ? 0 : f[len - 1];
    SEXP res = Rf_allocVector(REALSXP, len_out);
    double *x = REAL(x_in), *r = REAL(res);
    memset(r, 0, len_out * sizeof(double));
    for (int i = 0; i < len; ++i) {
        if (i != 0 && f[i] != f[i-1]) ++j;
        r[j] += x[i];
    }
    return res;
")

could be a C solution. These have timings

> system.time(r3 <- rowsum3(x, f))
   user  system elapsed 
  1.116   0.120   1.238 
> system.time(c3 <- crowsum3(x, f))
   user  system elapsed 
  0.080   0.000   0.081

and the loss of precision in the R implementation is apparent

> all.equal(r3, c3)
[1] TRUE
> identical(r3, c3)
[1] FALSE

rowsum_helper has

> system.time(r2 <- rowsum_helper(x, f))
   user  system elapsed 
  0.464   0.004   0.470

but also assumes 0-based indexing so

> head(rowsum_helper(x, f))
[1]       NaN 0.9166577 0.4380485 0.7777094 2.0866507 0.7300764
> head(crowsum3(x, f))
[1] 0.9166577 0.4380485 0.7777094 2.0866507 0.7300764 0.7195091