You can use rle():
# Number the runs of equal `year` within each `id`.
# The run counter is built from the rle() result itself, so both rep()
# arguments always have the same length; counting unique(year) instead fails
# with "invalid 'times' argument" when a year recurs non-adjacently in a group.
# Note: this numbers runs, so it equals the birth-order index only when rows
# are sorted by year within id.
dat[, index2 := {
  runs <- rle(year)
  rep(seq_along(runs$lengths), runs$lengths)
}, by = id]
I hope that helps.
Question
I am trying to generate the variable index based on years within groups of observations. It is an index of order of birth.
Here is an example in R:
# Example data: `id` identifies the group, `year` is the value to be indexed
# within each group, and `index` is the desired result (tied years share an
# index value).
id <- rep(c(1, 2, 3), times = c(5, 3, 2))
year <- c(
  1982, 1991, 1994, 1994, 1997, # id 1
  1989, 1989, 1989,             # id 2
  1945, 1970                    # id 3
)
index <- c(
  1, 2, 3, 3, 4, # id 1
  1, 1, 1,       # id 2
  1, 2           # id 3
)
dat <- data.table(id, year, index)
Any suggestions? Thanks
Solution
You can use rle():
# Number the runs of equal `year` within each `id`.
# The run counter is built from the rle() result itself, so both rep()
# arguments always have the same length; counting unique(year) instead fails
# with "invalid 'times' argument" when a year recurs non-adjacently in a group.
# Note: this numbers runs, so it equals the birth-order index only when rows
# are sorted by year within id.
dat[, index2 := {
  runs <- rle(year)
  rep(seq_along(runs$lengths), runs$lengths)
}, by = id]
I hope that helps.
OTHER TIPS
What it looks like you are calculating is a type of rank called a dense rank. The difference between the various ranking types is fairly straightforward...
#' Dense rank of a vector
#'
#' Ranks the distinct values of `x` and maps those ranks back onto every
#' element, so tied values share a rank and no rank numbers are skipped.
#'
#' @param x A vector accepted by `unique()`, `rank()` and `match()`.
#' @return An integer vector the same length as `x`.
dense_rank <- function(x) {
  distinct_vals <- unique(x)
  # Distinct values cannot tie; ties.method = "first" is kept because it makes
  # rank() return integers.
  rank_of_distinct <- rank(distinct_vals, ties.method = "first")
  rank_of_distinct[match(x, distinct_vals)]
}
# Sample vector with ties, used to compare the ranking methods side by side.
x <- c(1, 2, 2, 2, 3, 5, 6, 6, 6, 6)

# One row per ranking method. rank() defaults to ties.method = "average",
# which is why the "normal" and "avg" rows agree.
rbind(
  normal = rank(x),
  avg    = rank(x, ties.method = "average"),
  random = rank(x, ties.method = "random"),
  min    = rank(x, ties.method = "min"),
  max    = rank(x, ties.method = "max"),
  first  = rank(x, ties.method = "first"),
  dense  = dense_rank(x)
)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## normal 1 3 3 3 5 6 8.5 8.5 8.5 8.5
## avg 1 3 3 3 5 6 8.5 8.5 8.5 8.5
## random 1 4 2 3 5 6 10.0 8.0 9.0 7.0
## min 1 2 2 2 5 6 7.0 7.0 7.0 7.0
## max 1 4 4 4 5 6 10.0 10.0 10.0 10.0
## first 1 2 3 4 5 6 7.0 8.0 9.0 10.0
## dense 1 2 2 2 3 4 5.0 5.0 5.0 5.0
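Notice that the dense rank groups ties exactly as the min rank does, but leaves no gaps in the numbering after a tie. The same result can also be produced with run length encoding (rle), as shown in an earlier answer, provided the values are already sorted.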
# Number each run of equal values in `x`. rle() is evaluated once, and
# seq_along() stays correct for zero-length input, where 1:length() would
# yield c(1, 0).
with(rle(x), rep(seq_along(values), lengths))
## [1] 1 2 2 2 3 4 5 5 5 5
If your data sets are small or the ranking is rarely used then there are a variety of ways to accomplish the goal. Here are timings on a few of those methods.
library(data.table, quietly = TRUE)
suppressMessages( library(dplyr, quietly = TRUE) )
library(rbenchmark)
library( microbenchmark )
# Benchmark fixture: the same ten rows as in the question. `index` holds the
# expected dense rank per row and is kept outside `dat` as the reference.
id <- rep(c(1, 2, 3), times = c(5, 3, 2))
year <- c(
  1982, 1991, 1994, 1994, 1997, # id 1
  1989, 1989, 1989,             # id 2
  1945, 1970                    # id 3
)
index <- c(
  1, 2, 3, 3, 4, # id 1
  1, 1, 1,       # id 2
  1, 2           # id 3
)
dat <- data.table(id, year)
#' Dense rank of `year` within `id`, via an ordered factor
#'
#' @param dt A data.table with columns `id` and `year`; its `Index` column is
#'   added or overwritten by reference.
#' @return `dt`, as returned by the `:=` call.
ordered_dr <- function(dt) {
  # rank()'s second positional argument is `na.last`, not `ties.method`, so the
  # former rank(year, "first") never selected a ties method and errored as soon
  # as a year was NA. The default average ranks are what this approach needs:
  # tied years share one rank value, and ordered() converts the distinct rank
  # values into consecutive integer codes.
  dt[, Index := as.integer(ordered(rank(year))), by = id]
}
#' Dense rank of `year` within `id`, via sort.list()
#'
#' @param dt A data.table with columns `id` and `year`; its `Index` column is
#'   added or overwritten by reference.
#' @return `dt`, as returned by the `:=` call.
list_dr <- function(dt) {
  dt[, Index := {
    distinct_years <- unique(year)
    # sort.list() returns an ordering permutation, not ranks. It is used here
    # to sort the distinct years; the position of each year in that sorted
    # vector is its dense rank. Indexing sort.list(year) directly was only
    # correct when the rows were already sorted by year.
    match(year, distinct_years[sort.list(distinct_years)])
  }, by = id]
}
#' Dense rank of `year` within `id`, via dplyr
#'
#' @param dt A data.table with columns `id` and `year`; its `Index` column is
#'   added or overwritten by reference.
#' @return `dt`, as returned by the `:=` call.
dplyr_dr <- function(dt) {
  # Namespace-qualified on purpose: a user-defined dense_rank() in the global
  # environment masks the dplyr function, so the bare name would not exercise
  # dplyr at all.
  dt[, Index := dplyr::dense_rank(year), by = id]
}
#' Dense rank of `year` within `id`, via run length encoding
#'
#' Numbers the runs of equal `year` values, so the result is a dense rank only
#' when rows are sorted by year within each id.
#'
#' @param dt A data.table with columns `id` and `year`; its `Index` column is
#'   added or overwritten by reference.
#' @return `dt`, as returned by the `:=` call.
rle_drA <- function(dt) {
  # seq_along() replaces 1:length(), which yields c(1, 0) for zero-length
  # input. rle() is still evaluated twice per group, as in the variant being
  # timed; cache the result if speed matters.
  dt[, Index := rep(seq_along(rle(year)$values), rle(year)$lengths), by = id]
}
#' Dense rank of `year` within `id`, via rank() on the distinct values
#'
#' @param dt A data.table with columns `id` and `year`; its `Index` column is
#'   added or overwritten by reference.
#' @return `dt`, as returned by the `:=` call.
rank_dr <- function(dt) {
  dt[, Index := {
    u <- unique(year)
    # Rank the distinct years, then map each row to the rank of its year.
    rank(u, ties.method = "first")[match(year, u)]
  }, by = id]
}
#' Dense rank of `year` within `id`, via first-occurrence position
#'
#' Only valid when the data is as clean and ordered as the sample given:
#' each year is numbered by where it first appears in its group, with no
#' sorting step.
#'
#' @param dt A data.table with columns `id` and `year`; its `Index` column is
#'   added or overwritten by reference.
#' @return `dt`, as returned by the `:=` call.
match_dr <- function(dt) {
  dt[, Index := match(year, unique(year)), by = id]
}
# Check every method against the reference `index`: one column per method.
# Each helper overwrites dat's Index column by reference, so Index is
# extracted immediately after each call, before the next helper replaces it.
data.table( Ref=index,
Ordered=ordered_dr(dat)[,Index],
List=list_dr(dat)[,Index],
Dplyr=dplyr_dr(dat)[,Index],
Rle=rle_drA(dat)[,Index],
Rank=rank_dr(dat)[,Index],
Match=match_dr(dat)[,Index] )
## Ref Ordered List Dplyr Rle Rank Match
## 1: 1 1 1 1 1 1 1
## 2: 2 2 2 2 2 2 2
## 3: 3 3 3 3 3 3 3
## 4: 3 3 3 3 3 3 3
## 5: 4 4 4 4 4 4 4
## 6: 1 1 1 1 1 1 1
## 7: 1 1 1 1 1 1 1
## 8: 1 1 1 1 1 1 1
## 9: 1 1 1 1 1 1 1
## 10: 2 2 2 2 2 2 2
# Time each dense-rank helper on the sample data, 500 evaluations apiece.
microbenchmark(
  ordered_dr(dat),
  list_dr(dat),
  dplyr_dr(dat),
  rle_drA(dat),
  rank_dr(dat),
  match_dr(dat),
  times = 500
)
## Unit: microseconds
## expr min lq median uq max neval
## ordered_dr(dat) 890.8 922.1 946.2 973.0 30831 500
## list_dr(dat) 755.3 794.6 814.5 837.9 2271 500
## dplyr_dr(dat) 800.0 830.4 853.9 877.1 2884 500
## rle_drA(dat) 895.4 934.6 961.6 997.1 2442 500
## rank_dr(dat) 914.7 954.7 977.1 1012.2 2039 500
## match_dr(dat) 634.7 656.8 673.1 694.1 1829 500
# Same comparison with rbenchmark, reporting only the timing of each helper
# relative to the fastest, sorted from fastest to slowest.
benchmark(
  ordered_dr(dat),
  list_dr(dat),
  dplyr_dr(dat),
  rle_drA(dat),
  rank_dr(dat),
  match_dr(dat),
  columns = c("test", "relative"),
  order = "relative"
)
## test relative
## 6 match_dr(dat) 1.000
## 2 list_dr(dat) 1.200
## 3 dplyr_dr(dat) 1.243
## 1 ordered_dr(dat) 1.386
## 4 rle_drA(dat) 1.429
## 5 rank_dr(dat) 1.443
Now, if your datasets are large, then calculating the rle (especially twice for a single column) may not be the answer.
Your question seems unclear, but if you want to get the 'n'-th value of 'year' within groups of 'id' values where 'n' is the value of 'index':
> dat[ ,nth.yr := year[index], by=id]
> dat
id year index nth.yr
1: 1 1982 1 1982
2: 1 1991 2 1991
3: 1 1994 3 1994
4: 1 1994 3 1994
5: 1 1997 4 1994
6: 2 1989 1 1989
7: 2 1989 1 1989
8: 2 1989 1 1989
9: 3 1945 1 1945
10: 3 1970 2 1970