An alternative way (without using `.SD`) would be:
# Step 1: add `seq` by reference -- the 0-based index of the chunk of three
# consecutive rows that each row falls into within its newID group.
dt.ex[, seq := (seq_len(.N) - 1) %/% 3, by = newID]
# Step 2: summarise every (newID, seq) chunk.
dt.ex[, list(pos = mean(pos), count = sum(count)), by = list(newID, seq)]
Benchmarking on (relatively) bigger data:
# Fix the RNG seed so the benchmark data can be regenerated exactly.
set.seed(45)
# Build one candidate group label: five lowercase letters drawn with
# replacement and pasted together.
get_grps <- function() {
  paste(sample(letters, size = 5, replace = TRUE), collapse = "")
}
# Draw 1e4 candidate labels and drop any duplicates.
grps <- unique(replicate(1e4, get_grps()))
# 6e6 rows: a random group label, an integer pos drawn from -1000:1000 and a
# uniform random count.
dt.in <- data.table(
  newID = sample(grps, size = 6e6, replace = TRUE),
  pos = sample(-1000:1000, size = 6e6, replace = TRUE),
  count = runif(6e6)
)
# Key the table on newID (setkey sorts it by reference).
setkey(dt.in, newID)
# library() stops with an error if the package is not installed; require()
# would only warn and return FALSE, deferring the failure to the later
# microbenchmark() call.
library(microbenchmark)
eddi <- function(dt) {
  # Summarise one newID's rows in consecutive chunks of three. The `by`
  # expression is kept exactly as written: data.table auto-names the
  # grouping column from an unnamed `by` expression.
  summarise_chunks <- function(rows) {
    rows[, list(pos = mean(pos), count = sum(count)),
         by = seq(0, .N - 1) %/% 3]
  }
  # .SD is the subset of rows belonging to the current newID group.
  dt[, summarise_chunks(.SD), by = newID]
}
arun <- function(dt) {
  # Tag every row with the 0-based index of its chunk of three within newID.
  # `:=` adds the `seq` column to `dt` by reference, so the caller's table is
  # modified -- pass a copy() if that is not wanted.
  dt[, seq := (seq_len(.N) - 1) %/% 3, by = newID]
  # One output row per (newID, seq) chunk: mean of pos and sum of count.
  dt[, list(pos = mean(pos), count = sum(count)), by = list(newID, seq)]
}
# Time both approaches twice. Each expression works on a fresh copy of dt.in
# (arun() adds a column by reference); the results are kept in o1 and o2.
microbenchmark(
  o1 <- eddi(copy(dt.in)),
  o2 <- arun(copy(dt.in)),
  times = 2
)
Unit: seconds
expr min lq median uq max neval
o1 <- eddi(copy(dt.in)) 25.23282 25.23282 26.16009 27.08736 27.08736 2
o2 <- arun(copy(dt.in)) 13.59597 13.59597 14.41190 15.22783 15.22783 2