R - Faster Way to Calculate Rolling Statistics Over a Variable Interval

Question 1

Rcpp is a good approach if speed is your primary concern. I'll use the rolling mean statistic to explain by example.

Benchmarks: Rcpp versus R

x = sort(runif(25000,0,4*pi))
y = sin(x) + rnorm(length(x),0.5,0.5)
system.time( rollmean_r(x,y,xout=x,width=1.1) )   # ~60 seconds
system.time( rollmean_cpp(x,y,xout=x,width=1.1) ) # ~0.0007 seconds

Code for Rcpp and R function

cppFunction('
  NumericVector rollmean_cpp( NumericVector x, NumericVector y, 
                              NumericVector xout, double width) {
    double total=0;
    unsigned int n=x.size(), nout=xout.size(), i, ledge=0, redge=0;
    NumericVector out(nout);

    for( i=0; i<nout; i++ ) {
      while( x[ redge ] - xout[i] <= width && redge<n ) 
        total += y[redge++];
      while( xout[i] - x[ ledge ] > width && ledge<n ) 
        total -= y[ledge++];
      if( ledge==redge ) { out[i]=NAN; total=0; continue; }
      out[i] = total / (redge-ledge);
    }
    return out;
  }')

rollmean_r = function(x,y,xout,width) {
  out = numeric(length(xout))
  for( i in seq_along(xout) ) {
    window = x >= (xout[i]-width) & x <= (xout[i]+width)
    out[i] = .Internal(mean( y[window] ))
  }
  return(out)
}

Now for an explantion of rollmean_cpp. x and y are the data. xout is a vector of points at which the rolling statistic is requested. width is the width*2 of the rolling window. Note that the indeces for the ends of sliding window are stored in ledge and redge. These are essentially pointers to the respective elements in x and y. These indeces could be very beneficial for calling other C++ functions (e.g., median and the like) that take a vector and starting and ending indeces as input.

For those who want a "verbose" version of rollmean_cpp for debugging (lengthy):

cppFunction('
  NumericVector rollmean_cpp( NumericVector x, NumericVector y, 
                              NumericVector xout, double width) {

    double total=0, oldtotal=0;
    unsigned int n=x.size(), nout=xout.size(), i, ledge=0, redge=0;
    NumericVector out(nout);


    for( i=0; i<nout; i++ ) {
      Rcout << "Finding window "<< i << " for x=" << xout[i] << "..." << std::endl;
      total = 0;

      // numbers to push into window
      while( x[ redge ] - xout[i] <= width && redge<n ) {
        Rcout << "Adding (x,y) = (" << x[redge] << "," << y[redge] << ")" ;
        Rcout << "; edges=[" << ledge << "," << redge << "]" << std::endl;
        total += y[redge++];
      }

      // numbers to pop off window
      while( xout[i] - x[ ledge ] > width && ledge<n ) {
        Rcout << "Removing (x,y) = (" << x[ledge] << "," << y[ledge] << ")";
        Rcout << "; edges=[" << ledge+1 << "," << redge-1 << "]" << std::endl;
        total -= y[ledge++];
      }
      if(ledge==n) Rcout << " OVER ";
      if( ledge==redge ) {
       Rcout<<" NO DATA IN INTERVAL " << std::endl << std::endl;
       oldtotal=total=0; out[i]=NAN; continue;}

      Rcout << "For interval [" << xout[i]-width << "," <<
               xout[i]+width << "], all points in interval [" << x[ledge] <<
               ", " << x[redge-1] << "]" << std::endl ;
      Rcout << std::endl;

      out[i] = ( oldtotal + total ) / (redge-ledge);
      oldtotal=total+oldtotal;
    }
    return out;
  }')

x = c(1,2,3,6,90,91)
y = c(9,8,7,5.2,2,1)
xout = c(1,2,2,3,6,6.1,13,90,100)
a = rollmean_cpp(x,y,xout=xout,2)
# Finding window 0 for x=1...
# Adding (x,y) = (1,9); edges=[0,0]
# Adding (x,y) = (2,8); edges=[0,1]
# Adding (x,y) = (3,7); edges=[0,2]
# For interval [-1,3], all points in interval [1, 3]
# 
# Finding window 1 for x=2...
# For interval [0,4], all points in interval [1, 3]
# 
# Finding window 2 for x=2...
# For interval [0,4], all points in interval [1, 3]
# 
# Finding window 3 for x=3...
# For interval [1,5], all points in interval [1, 3]
# 
# Finding window 4 for x=6...
# Adding (x,y) = (6,5.2); edges=[0,3]
# Removing (x,y) = (1,9); edges=[1,3]
# Removing (x,y) = (2,8); edges=[2,3]
# Removing (x,y) = (3,7); edges=[3,3]
# For interval [4,8], all points in interval [6, 6]
# 
# Finding window 5 for x=6.1...
# For interval [4.1,8.1], all points in interval [6, 6]
# 
# Finding window 6 for x=13...
# Removing (x,y) = (6,5.2); edges=[4,3]
# NO DATA IN INTERVAL 
# 
# Finding window 7 for x=90...
# Adding (x,y) = (90,2); edges=[4,4]
# Adding (x,y) = (91,1); edges=[4,5]
# For interval [88,92], all points in interval [90, 91]
# 
# Finding window 8 for x=100...
# Removing (x,y) = (90,2); edges=[5,5]
# Removing (x,y) = (91,1); edges=[6,5]
# OVER  NO DATA IN INTERVAL 

print(a)
# [1] 8.0 8.0 8.0 8.0 5.2 5.2 NaN 1.5 NaN

Question 2

Let's see... you are doing a loop( very slow in R), making unnecessary copies of data in creating subset, and using rbind to accumulate you data set. If you avoid those, things will speed up considerably. Try this...

Summary_Stats <- function(Day, dataframe, interval){
    c1 <- dataframe$Date > Day - interval/2 & 
        dataframe$Date < Day + interval/2
    c(
        as.numeric(Day),
        mean(dataframe$Price[c1]),
        median(dataframe$Price[c1]),
        sum(c1),
        quantile(dataframe$Price[c1], 0.25),
        quantile(dataframe$Price[c1], 0.75)
      )
}
Summary_Stats(df$Date[2],dataframe=df, interval=20)
firstDay <- min(df$Date)
lastDay  <- max(df$Date)
system.time({
    x <- sapply(firstDay:lastDay, Summary_Stats, dataframe=df, interval=20)
    x <- as.data.frame(t(x))
    names(x) <- c("Date","Average","Median","Count","P25","P75")
    x$Date <- as.Date(x$Date)
})
dim(x)
head(x)

Question 3

In reply to my question to "Kevin" above, I think I figured something out below.

This function takes ticks data (time observations come in at random intervals are and indicated by a time stamp) and calculates the the mean over an interval.

library(Rcpp)

cppFunction('
  NumericVector rollmean_c2( NumericVector x, NumericVector y, double width,
                              double Min, double Max) {

double total = 0, redge,center;
unsigned int n = (Max - Min) + 1,
                  i, j=0, k, ledge=0, redgeIndex;
NumericVector out(n);


for (i = 0; i < n; i++){
  center = Min + i + 0.5;
  redge = center - width / 2;
  redgeIndex = 0;
  total = 0;

  while (x[redgeIndex] < redge){
    redgeIndex++;
  }
  j = redgeIndex;

  while (x[j] < redge + width){
    total += y[j++];

  }

  out[i] = total / (j - redgeIndex);
}
return out;

  }')

# Set up example data
x = seq(0,4*pi,length.out=2500)
y = sin(x) + rnorm(length(x),0.5,0.5)
plot(x,y,pch=20,col="black",
     main="Sliding window mean; width=1",
     sub="rollmean_c in red      rollmean_r overlaid in white.")


c.out = rollmean_c2(x,y,width=1,Min = min(x), Max = max(x)) 
lines(0.5:12.5,c.out,col="red",lwd=3)

enter image description here

Question 4

Think of all of the points connected as a chain. Think of this chain as a graph, where each data point is a node. Then, for each node, we want to find all other nodes that are distance w or less away. To do this, I first generate a matrix that gives pairwise distances. The nth row gives the distance for nodes n nodes apart.

# First, some data
x = sort(runif(25000,0,4*pi))
y = sin(x) + rnorm(length(x),0,0.5)

# calculate the rows of the matrix one by one
# until the distance between the two closest nodes is greater than w
# This algorithm is actually faster than `dist` because it usually stops
# much sooner
dl = list()
dl[[1]] = diff(x)
i = 1
while( min(dl[[i]]) <= w ) {
  pdl = dl[[i]]
  dl[[i+1]] = pdl[-length(pdl)] + dl[[1]][-(1:i)]
  i = i+1
}

# turn the list of the rows into matrices
rarray = do.call( rbind, lapply(dl,inf.pad,length(x)) )
larray = do.call( rbind, lapply(dl,inf.pad,length(x),"right") )

# extra function
inf.pad = function(x,size,side="left") {
  if(side=="left") {
    x = c( x, rep(Inf, size-length(x) ) )
  } else {
    x = c( rep(Inf, size-length(x) ), x )
  }
  x
}

I then use the matrices to determine the edge of each window. For this example, I set w=2.

# How many data points to look left or right at each data point
lookr = colSums( rarray <= w )
lookl = colSums( larray <= w )

# convert these "look" variables to indeces of the input vector
ri = 1:length(x) + lookr
li = 1:length(x) - lookl

With the windows defined, it's pretty simple to use the *apply functions to get the final answer.

rolling.mean = vapply( mapply(':',li,ri), function(i) .Internal(mean(y[i])), 1 )

All of the above code took about 50 seconds on my computer. This is a little faster than the rollmean_r function in my other answer. However, the especially nice thing here is that the indeces are provided. You could then use whatever R function you like with the *apply functions. For example,

rolling.mean = vapply( mapply(':',li,ri), 
                                        function(i) .Internal(mean(y[i])), 1 )

takes about 5 seconds. And,

rolling.median = vapply( mapply(':',li,ri), 
                                        function(i) median(y[i]), 1 )

takes about 14 seconds. If you wanted to, you could use the Rcpp function in my other answer to get the indeces.