Question

I'd like to take an increasing sequence of numbers (e.g. a series of times)

set.seed(41);  d <- seq(1:100) + runif(100, 0, 1)

and if the difference between two sequential numbers is below a threshold, merge them into a single point by taking the mean value of the two and, then continue going through until the next time combining is necessary. I resorted to functions I usually avoid: while and ifelse to write a quick-and-dirty function, and it works but isn't fast. Can you solve this task 1) more efficiently and 2) without invoking a for or while loop. Is there some built-in function, perhaps with even more functionality, that is well-suited for such a task?

combine_points <- function(x, th=0.5)
{ 
    i = 1                           # start i at 1
    while(min(diff(x)) < th)        # initiate while loop
    {
    ifelse(x[i+1] - x[i] < th,      # logical condition
           x[i] <- x[i+1] <- 
           mean(c(x[i+1], x[i])),      # assignment if TRUE
           (x[i] <- x[i]))          # assignment if FALSE
    x <- sort(unique(x))            # get rid of the duplicated entry created when
                                    # the ifelse statement was TRUE
    # increment i or reset i to 1 if it gets too large
    ifelse(i == length(x), i <- 1, i <- i+1 )
    }
    return(x)
}

newd <- combine_points(d)  
th <- 0.5
which(diff(newd) < th)

integer(0)

Update to benchmarks of solutions so far.

I benchmarked with a larger sample vector, and the Rcpp solution suggested by @Roland is slower than my first while loop when the vector gets long. I made an improvement to the initial while loop, and made an Rcpp version of it, too. The benchmark results are below. Note that @flodel answer is not directly comparable because it is a fundamentally different approach to combining, but it is definitely very fast.

set.seed(41);  d <- seq(1:4000) + runif(4000, 0, 1)
library(microbenchmark)
microbenchmark(
     combine_points.Frank(d,th=0.5),
     combine_points.Frank2(d,th=0.5),
     combine_points_Roland(d,th=0.5),
     combine_points_Roland2(d,th=0.5))
Unit: milliseconds
                                expr       min        lq    median        uq        max neval
   combine_points.Frank(d, th = 0.5) 2115.6391 2154.5038 2174.5889 2193.8444  7884.1638   100
  combine_points.Frank2(d, th = 0.5) 1298.2923 1323.2214 1341.5357 1357.4260 15538.0872   100
  combine_points_Roland(d, th = 0.5) 2497.9106 2506.5960 2512.3591 2519.0036  2573.2854   100
 combine_points_Roland2(d, th = 0.5)  494.8406  497.3613  498.2347  499.8777   544.9743   100

This is a considerable improvement over my first attempt, and the following is an Rcpp version, which is the fastest, so far:

combine_points.Frank2 <- function(x, th=0.5)
{ 
    i = 1
    while(min(diff(x)) < th)
    {
    if(x[i+1] - x[i] >= th){ 
           i <- i + 1}
    else {
           x[i] <- x[i+1] <- 
           mean(c(x[i+1], x[i]));x <- unique(x); i <- i }  
    }
    return(x)
}

Rcpp version

cppFunction('
 NumericVector combine_points_Roland2(NumericVector x, double th) {
  int i=0;
  while(min(diff(x)) < th)   
  {
    if ((x[i+1] - x[i]) >= th)
    {
      i = i + 1;
    }
    else{
      x[i] = (x[i+1] + x[i])/2;
      x[i+1] = x[i];
      x = sort_unique(x);
      i = i;
    }
   }
   return x;
}
')
Was it helpful?

Solution

Here is something faster. It avoids resizing/copying x in the loop.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
 NumericVector combine_points_Roland3(NumericVector x, double th) {
  int i=0, j;
  int n(x.size());
  while(i < n-1)   
  {
    if ((x[i+1] - x[i]) >= th)
    {
      i = i + 1;
    }
    else{
      x[i] = (x[i+1] + x[i])/2;
      n = n-1;
      for (j=i+1; j<n; j++) 
      {
        x[j]=x[j+1];
      }      
    }
  }
  NumericVector y(n);
  for (i = 0; i < n; i++) {
    y[i] = x[i];
  }
   return y;
}

An R implementation of the same algorithm:

combine_points_Roland3R <- function(x, th) {
  i <- 1
  n <- length(x)

  while(i < n) {
    if ((x[i+1] - x[i]) >= th) {
      i <- i + 1;
    } else {
      x[i] <- (x[i+1] + x[i])/2
      n <- n-1
      x[(i+1):n] <- x[(i+2):(n+1)]
    }
  }
  x[1:n]
}

set.seed(41);  d <- seq(1:4000) + runif(4000, 0, 1)
x2 <- combine_points_Roland2(d, 0.5)
x3 <- combine_points_Roland3(d, 0.5)
all.equal(x2, x3)
#TRUE
x4 <- combine_points_Roland3R(d, 0.5)
all.equal(x2, x4)
#TRUE

Benchmarks:

library(microbenchmark)
microbenchmark(combine_points_Roland2(d, 0.5),
               combine_points_Roland3(d, 0.5), 
               combine_points_Roland3R(d, 0.5))

# Unit: microseconds
#                            expr       min         lq      median          uq        max neval
#  combine_points_Roland2(d, 0.5) 126458.64 131414.592 132355.4285 133422.2235 147306.728   100
#  combine_points_Roland3(d, 0.5)    121.34    128.269    140.8955    143.3595    393.582   100
# combine_points_Roland3R(d, 0.5)  17564.24  18626.878  19155.6565  20910.2935  68707.888   100

OTHER TIPS

See if this does what you want:

combine_points <- function(x, th=0.5) {
  group <- cumsum(c(FALSE, diff(x) > th))
  unname(sapply(split(x, group), mean))
}

combine_points(c(-1, 0.1, 0.2, 0.3, 1, 1.5, 2.0, 2.5, 3.0, 10), 0.5)
# [1] -1.0  0.2  2.0 10.0

Here is a translation of your function into Rcpp. It uses sugar functions, which are very convenient, but often there are faster alternatives (RcppEigen or RcppArmadillo are good for that). And of course the algorithm could be improved.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector combine_points1(NumericVector x, double th) {
  int i=0;
  while(min(diff(x)) < th)   
  {
    if ((x[i+1] - x[i]) < th)
    {
      x[i] = (x[i+1] + x[i])/2;
      x[i+1] = x[i];
    } 
    x = sort_unique(x);
    if(i <= x.size())
    {
      i = i+1;
    }
    else {
      i=1;
    }
  }
   return x;
}

I recommend using RStudio for writing Rcpp functions and sourcing them.

all.equal(combine_points1(d, 0.5),
          combine_points(d, 0.5))
#[1] TRUE

library(compiler)
combine_points_comp <- cmpfun(combine_points) 

library(microbenchmark)
microbenchmark(combine_points1(d, 0.5),
               combine_points_comp(d, 0.5),
               combine_points(d, 0.5))

# Unit: microseconds
#                        expr      min        lq    median        uq       max neval
#     combine_points1(d, 0.5)  652.772  664.6815  683.1315   714.653  1030.171   100
# combine_points_comp(d, 0.5) 8344.839 8692.0880 9010.1470 10627.049 14117.553   100
#      combine_points(d, 0.5) 8996.768 9371.0805 9687.0235 10560.226 12800.831   100

A speed-up by a factor of 14 without real effort.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top