Question

I have a dataframe like

  ID_CASE   Month   
CS00000026A 201301  
CS00000026A 201302  
CS00000026A 201303  
CS00000026A 201304  
CS00000026A 201305  
CS00000026A 201306  
CS00000026A 201307  
CS00000026A 201308  
CS00000026A 201309  
CS00000026A 201310  
CS00000191C 201302  
CS00000191C 201303  
CS00000191C 201304  
CS00000191C 201305  
CS00000191C 201306  
CS00000191C 201307  
CS00000191C 201308  
CS00000191C 201309  
CS00000191C 201310  

I want the final data frame to have three additional columns, like

  ID_CASE   Month   Lag_1   Lag_2   Lag_3
CS00000026A 201301  NA      NA      NA
CS00000026A 201302  201301  NA      NA
CS00000026A 201303  201302  201301  NA
CS00000026A 201304  201303  201302  201301
CS00000026A 201305  201304  201303  201302
CS00000026A 201306  201305  201304  201303
CS00000026A 201307  201306  201305  201304
CS00000026A 201308  201307  201306  201305
CS00000026A 201309  201308  201307  201306
CS00000026A 201310  201309  201308  201307
CS00000191C 201302  NA       NA     NA
CS00000191C 201303  201302   NA     NA
CS00000191C 201304  201303  201302      NA
CS00000191C 201305  201304  201303  201302
CS00000191C 201306  201305  201304  201303
CS00000191C 201307  201306  201305  201304
CS00000191C 201308  201307  201306  201305
CS00000191C 201309  201308  201307  201306
CS00000191C 201310  201309  201308  201307

where

  • Lag_1 is lagged by 1 Month
  • Lag_2 is lagged by 2 Months
  • Lag_3 is lagged by 3 Months.

I have used the following code to at least get Lag_1

# Asker's attempt, kept exactly as posted; it does not produce the wanted Lag_1.
# NOTE(review): inside transform(), `Lag_1 <- ...` is passed as an unnamed
# argument rather than naming a new column (that needs `Lag_1 = ...`), and
# nrow(df) looks like the row count of the whole data frame, not of the current
# ID_CASE group, so Month[-nrow(df)] would not drop each group's last row
# -- confirm.
df <- ddply(df,.(ID_CASE),transform,
                  Lag_1 <- c(NA,Month[-nrow(df)])) 

But this does not give me the desired output for Lag_1.

I have also tried looking at the solutions in Lag in R dataframe

And how can this be done if I have a date object instead of an int column 'Month' as in the current example?

Any help on this will be appreciated.

Was it helpful?

Solution 2

From data.table v1.9.6 you can use shift():

# Add lag_1, lag_2 and lag_3 by reference, computed separately per ID_CASE.
# shift() accepts a vector of lags and returns one lagged copy of Month for
# each of them; rows without an earlier value are filled with NA.
# library() is used instead of require() so a missing package stops the script
# immediately rather than returning FALSE and failing later at setDT().
library(data.table)
setDT(df)[
  ,
  paste("lag", 1:3, sep = "_") := shift(Month, n = 1:3, type = "lag"),
  by = ID_CASE
]

OTHER TIPS

Try data.table

library(data.table)

# Add Lag_1..Lag_3 by reference, one ID_CASE group at a time.
# Each lag is built by prepending k NAs to Month and keeping the first .N
# values, so every new column has exactly .N entries per group. This also
# holds for groups with fewer than k rows, which simply get all-NA lags.
# (Prepending k NAs to Month[-.N] yields .N - 1 + k values, which only matches
# the group size for k = 1.)
setDT(df)[, `:=`(
  Lag_1 = head(c(NA, Month), .N),
  Lag_2 = head(c(rep(NA, 2), Month), .N),
  Lag_3 = head(c(rep(NA, 3), Month), .N)
), by = ID_CASE]
df
#         ID_CASE  Month  Lag_1  Lag_2  Lag_3
#  1: CS00000026A 201301     NA     NA     NA
#  2: CS00000026A 201302 201301     NA     NA
#  3: CS00000026A 201303 201302 201301     NA
#  4: CS00000026A 201304 201303 201302 201301
#  5: CS00000026A 201305 201304 201303 201302
#  6: CS00000026A 201306 201305 201304 201303
#  7: CS00000026A 201307 201306 201305 201304
#  8: CS00000026A 201308 201307 201306 201305
#  9: CS00000026A 201309 201308 201307 201306
# 10: CS00000026A 201310 201309 201308 201307
# 11: CS00000191C 201302     NA     NA     NA
# 12: CS00000191C 201303 201302     NA     NA
# 13: CS00000191C 201304 201303 201302     NA
# 14: CS00000191C 201305 201304 201303 201302
# 15: CS00000191C 201306 201305 201304 201303
# 16: CS00000191C 201307 201306 201305 201304
# 17: CS00000191C 201308 201307 201306 201305
# 18: CS00000191C 201309 201308 201307 201306
# 19: CS00000191C 201310 201309 201308 201307

You may use lag.zoo, where k can be a vector of lags.

library(plyr)
library(zoo)

# Per ID_CASE: wrap Month in a zoo series and ask lag() for offsets 0 to -3 in
# one call. Offset 0 is Month itself; -1, -2 and -3 hold the values from one,
# two and three rows earlier.
ddply(df, .(ID_CASE), function(grp) {
  month_series <- zoo(grp$Month)
  lag(month_series, k = -(0:3))
})

#        ID_CASE   lag0  lag-1  lag-2  lag-3
# 1  CS00000026A 201301     NA     NA     NA
# 2  CS00000026A 201302 201301     NA     NA
# 3  CS00000026A 201303 201302 201301     NA
# 4  CS00000026A 201304 201303 201302 201301
# 5  CS00000026A 201305 201304 201303 201302
# 6  CS00000026A 201306 201305 201304 201303
# 7  CS00000026A 201307 201306 201305 201304
# 8  CS00000026A 201308 201307 201306 201305
# 9  CS00000026A 201309 201308 201307 201306
# 10 CS00000026A 201310 201309 201308 201307
# 11 CS00000191C 201302     NA     NA     NA
# 12 CS00000191C 201303 201302     NA     NA
# 13 CS00000191C 201304 201303 201302     NA
# 14 CS00000191C 201305 201304 201303 201302
# 15 CS00000191C 201306 201305 201304 201303
# 16 CS00000191C 201307 201306 201305 201304
# 17 CS00000191C 201308 201307 201306 201305
# 18 CS00000191C 201309 201308 201307 201306
# 19 CS00000191C 201310 201309 201308 201307

Edit following comment.

If there are groups with only one date, the code above will generate an error. A small example:

# Minimal example: ID_CASE 1 has three rows, ID_CASE 2 has a single row.
df <- data.frame(
  ID_CASE = rep(c(1, 2), times = c(3, 1)),
  Month = 1:4
)
df
#   ID_CASE Month
# 1       1     1
# 2       1     2
# 3       1     3
# 4       2     4

# Same per-group zoo lag as before, run on the small example. This call is
# expected to fail: it reproduces the error quoted below.
ddply(df, .(ID_CASE), function(grp) {
  month_series <- zoo(grp$Month)
  lag(month_series, k = -(0:3))
})

# Error in list_to_dataframe(res, attr(.data, "split_labels"), .id, id_as_factor) : 
#   Results do not have equal lengths

This happens because groups with only one registration are coerced to a univariate time series. To avoid this coercion, use `[` subsetting with `drop = FALSE`:

# Per ID_CASE: keep Month as a one-column data frame (drop = FALSE) before
# handing it to zoo(), then request offsets 0 to -3 from lag() in one call.
ddply(df, .(ID_CASE), function(grp) {
  month_series <- zoo(grp[, "Month", drop = FALSE])
  lag(month_series, k = -(0:3))
})

#   ID_CASE Month.lag0 Month.lag-1 Month.lag-2 Month.lag-3
# 1       1          1          NA          NA          NA
# 2       1          2           1          NA          NA
# 3       1          3           2           1          NA
# 4       2          4          NA          NA          NA

using dplyr:

library(dplyr)

# Lag Month by 1, 2 and 3 rows within each ID_CASE.
# `%>%` replaces `%.%`, which dplyr deprecated and has since removed.
# dplyr::lag() is namespaced so that stats::lag() (or a zoo method, if zoo is
# attached) cannot be picked up by mistake. ungroup() drops the grouping so
# later steps on the result are not silently computed per ID_CASE.
df %>%
  group_by(ID_CASE) %>%
  mutate(
    lag_1 = dplyr::lag(Month, n = 1L),
    lag_2 = dplyr::lag(Month, n = 2L),
    lag_3 = dplyr::lag(Month, n = 3L)
  ) %>%
  ungroup()
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top