Domanda

Has anyone developed an elegant, fast way to perform a rolling sum by date? For example, if I wanted to create a rolling 180-day total for the following dataset by Cust_ID, is there a way to do it faster (like something in data.table). I have been using the following example to currently calculate the rolling sum, but I am afraid it is far to inefficient.

library("zoo")
library("plyr")
library("lubridate")

##Make some sample variables
set.seed(1)
Trans_Dates <- as.Date(c(31,33,65,96,150,187,210,212,240,273,293,320,
                         32,34,66,97,151,188,211,213,241,274,294,321,
                         33,35,67,98,152,189,212,214,242,275,295,322),origin="2010-01-01")
Cust_ID <- c(rep(1,12),rep(2,12),rep(3,12))
Target <- rpois(36,3)

##Combine into one dataset
Example.Data <- data.frame(Trans_Dates,Cust_ID,Target)

##Create extra variable with 180 day rolling sum
Example.Data2 <- ddply(Example.Data, .(Cust_ID), 
  function(datc) adply(datc, 1, 
   function(x) data.frame(Target_Running_Total =
    sum(subset(datc, Trans_Dates>(as.Date(x$Trans_Dates)-180) & Trans_Dates<=x$Trans_Dates)$Target))))

#Print new data
Example.Data2 
È stato utile?

Soluzione 2

I think I stumbled upon an answer that is fairly efficient..

set.seed(1)
Trans_Dates <- as.Date(c(31,33,65,96,150,187,210,212,240,273,293,320,
                         32,34,66,97,151,188,211,213,241,274,294,321,
                         33,35,67,98,152,189,212,214,242,275,295,322),origin="2010-01-01")
Cust_ID <- c(rep(1,12),rep(2,12),rep(3,12))
Target <- rpois(36,3)

##Make simulated data into a data.table
library(data.table)
data <- data.table(Cust_ID,Trans_Dates,Target)

##Assign each customer an number that ranks them
data[,Cust_No:=.GRP,by=c("Cust_ID")]

##Create "list" of comparison dates
Ref <- data[,list(Compare_Value=list(I(Target)),Compare_Date=list(I(Trans_Dates))), by=c("Cust_No")]

##Compare two lists and see of the compare date is within N days
data$Roll.Val <- mapply(FUN = function(RD, NUM) {
  d <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
  sum((d <= 0 & d >= -180)*Ref$Compare_Value[[NUM]])
}, RD = data$Trans_Dates,NUM=data$Cust_No)

##Print out data
data <- data[,list(Cust_ID,Trans_Dates,Target,Roll.Val)][order(Cust_ID,Trans_Dates)]
data

Altri suggerimenti

Assuming that your panel is more-or-less balanced, then I suspect that expand.grid and ave will be pretty fast (you'll have to benchmark with your data to be sure). I use expand.grid to fill in the missing days so that I can naively take a rolling sum with cumsum then subtract all but the most recent 180 with head.

-As a question for you (and more skilled R users), why does my identical call always fail?-

I build on your same data.

full <- expand.grid(seq(from=min(Example.Data$Trans_Dates), to=max(Example.Data$Trans_Dates), by=1), unique(Example.Data$Cust_ID))
Example.Data3 <- merge(Example.Data, full, by.x=c("Trans_Dates", "Cust_ID"), by.y=c("Var1", "Var2"), all=TRUE)
Example.Data3 <- Example.Data3[with(Example.Data3, order(Cust_ID, Trans_Dates)), ]
Example.Data3$Target.New <- ifelse(is.na(Example.Data3$Target), 0, Example.Data3$Target)
Example.Data3$Target_Running_Total <- ave(Example.Data3$Target.New, Example.Data3$Cust_ID, FUN=function(x) cumsum(x) - c(rep(0, 180), head(cumsum(x), -180)))
Example.Data3$Target.New <- NULL
Example.Data3 <- Example.Data3[complete.cases(Example.Data3), ]
row.names(Example.Data3) <- seq(nrow(Example.Data3))
Example.Data3

identical(Example.Data2$Target_Running_Total, Example.Data3$Target_Running_Total)
sum(Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total)
(Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total) 

Which yields the following.

> (Example.Data2$Target_Running_Total - Example.Data3$Target_Running_Total) 
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
library(data.table)

set.seed(1)

data <- data.table(Cust_ID = c(rep(1, 12), rep(2, 12), rep(3, 12)),
                   Trans_Dates = as.Date(c(31, 33, 65, 96, 150, 187, 210,
                                           212, 240, 273, 293, 320, 32, 34,
                                           66, 97, 151, 188, 211, 213, 241,
                                           274, 294, 321, 33, 35, 67, 98,
                                           152, 189, 212, 214, 242, 275,
                                           295, 322),
                                         origin = "2010-01-01"),
                   Target = rpois(36, 3))

data[, RollingSum := {
         d <- data$Trans_Dates - Trans_Dates
         sum(data$Target[Cust_ID == data$Cust_ID & d <= 0 & d >= -180])
       },
     by = list(Trans_Dates, Cust_ID)]
Autorizzato sotto: CC-BY-SA insieme a attribuzione
Non affiliato a StackOverflow
scroll top