Question

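I am trying to generate the variable `index` from `year` within groups of observations (grouped by `id`). It is an index of birth order: within each group the earliest year gets 1, the next distinct year gets 2, and tied years share the same value.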

Here is an example in R:

# Example data: `id` is the grouping key, `year` the birth year, and `index`
# the desired within-group birth-order index (tied years share a value).
library(data.table) # data.table() is not base R, so load the package first
id <- c(rep(1, 5), rep(2, 3), 3, 3)
year <- c(1982, 1991, 1994, 1994, 1997, 1989, 1989, 1989, 1945, 1970)
index <- c(1, 2, 3, 3, 4, 1, 1, 1, 1, 2)
dat <- data.table(id, year, index)

Any suggestions? Thanks

Was it helpful?

Solution

You can use rle():

# Dense rank of `year` within each `id`, added by reference as `index2`.
# rle() on the *sorted* years yields one run per distinct year in ascending
# order, so a year's position among the run values is its birth-order index.
# Sorting first keeps the result correct when rows are not ordered by year
# within a group; pairing rle(year)$lengths with length(unique(year)) fails
# as soon as a year recurs non-adjacently.
dat[, index2 := {
  runs <- rle(sort(year))
  match(year, runs$values)
}, by = id]

I hope that helps.

OTHER TIPS

What it looks like you are calculating is a type of rank called a dense rank. The differences between the various ranking types are fairly straightforward:

# Dense rank: equal values share a rank and ranks are consecutive (no gaps).
# Ranks are computed on the distinct values only and then mapped back onto
# every element of `x`.
dense_rank <- function(x) {
  distinct_vals <- unique(x)
  distinct_ranks <- rank(distinct_vals, ties.method = "first")
  distinct_ranks[match(x, distinct_vals)]
}

# Compare every ties.method of rank() with the dense rank on one small vector.
# Each list element becomes one named row of the printed matrix.
x <- c(1, 2, 2, 2, 3, 5, 6, 6, 6, 6)
rank_rows <- list(
  normal = rank(x),
  avg    = rank(x, ties.method = "average"),
  random = rank(x, ties.method = "random"),
  min    = rank(x, ties.method = "min"),
  max    = rank(x, ties.method = "max"),
  first  = rank(x, ties.method = "first"),
  dense  = dense_rank(x)
)
do.call(rbind, rank_rows)

##        [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## normal    1    3    3    3    5    6  8.5  8.5  8.5   8.5
## avg       1    3    3    3    5    6  8.5  8.5  8.5   8.5
## random    1    4    2    3    5    6 10.0  8.0  9.0   7.0
## min       1    2    2    2    5    6  7.0  7.0  7.0   7.0
## max       1    4    4    4    5    6 10.0 10.0 10.0  10.0
## first     1    2    3    4    5    6  7.0  8.0  9.0  10.0
## dense     1    2    2    2    3    4  5.0  5.0  5.0   5.0

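Notice that the dense rank has the same tie pattern as the min rank (tied values share a single rank), but it leaves no gaps: after a tie, the next distinct value gets the next integer. For sorted input, the same sequence can also be produced with run length encoding (rle), as shown in an earlier answer.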

# Dense rank of an already-sorted `x` from its run-length encoding: number the
# runs 1, 2, 3, ... and repeat each number for the length of its run.
# rle() is computed once, and seq_along() (unlike 1:length()) yields an empty
# sequence for empty input instead of c(1, 0).
runs <- rle(x)
rep(seq_along(runs$values), runs$lengths)

##  [1] 1 2 2 2 3 4 5 5 5 5

If your data sets are small or the ranking is rarely used then there are a variety of ways to accomplish the goal. Here are timings on a few of those methods.

library(data.table, quietly = TRUE)
suppressMessages( library(dplyr, quietly = TRUE) )
library(rbenchmark)
library( microbenchmark )

# Rebuild the question's sample: group key, birth year and the expected index.
# `index` is kept outside the table and used later as the reference column.
id <- rep(c(1, 2, 3), times = c(5, 3, 2))
year <- c(1982, 1991, rep(1994, 2), 1997, rep(1989, 3), 1945, 1970)
index <- c(1, 2, 3, 3, 4, rep(1, 4), 2)
dat <- data.table(id = id, year = year)

# Dense rank via factor codes: tied years get the same "min" rank, ordered()
# turns the distinct rank values into sorted levels, and the integer level
# codes are consecutive within each id.
# ties.method is named explicitly: passed positionally, "first" binds to
# rank()'s `na.last` argument instead, and a genuine "first" ranking would
# give tied years different indices.
# Adds/overwrites column `Index` in `dt` by reference and returns `dt`.
ordered_dr <- function(dt) {
  dt[, Index := as.integer(ordered(rank(year, ties.method = "min"))), by = id]
}
# Dense rank via sort.list(): applying sort.list() twice to the distinct years
# gives the rank of each distinct year, which match() then maps back onto
# every row. Ranking the distinct values (rather than indexing the row
# permutation) keeps the result correct for groups that are not sorted by year.
# Adds/overwrites column `Index` in `dt` by reference and returns `dt`.
list_dr <- function(dt) {
  dt[, Index := {
    distinct_years <- unique(year)
    sort.list(sort.list(distinct_years))[match(year, distinct_years)]
  }, by = id]
}
# Dense rank using dplyr's implementation. The call is namespace-qualified so
# that a user-defined `dense_rank` in the global environment cannot shadow it
# and this entry really measures dplyr.
# Adds/overwrites column `Index` in `dt` by reference and returns `dt`.
dplyr_dr <- function(dt) {
  dt[, Index := dplyr::dense_rank(year), by = id]
}
# Dense rank from run-length encoding: number the runs of equal years and
# repeat each number for the length of its run. rle() is computed once per
# group, and seq_along() replaces 1:length().
# NOTE: the run number equals the dense rank only when `year` is already in
# ascending order within each id.
# Adds/overwrites column `Index` in `dt` by reference and returns `dt`.
rle_drA <- function(dt) {
  dt[, Index := {
    runs <- rle(year)
    rep(seq_along(runs$values), runs$lengths)
  }, by = id]
}
# Dense rank via rank() on the distinct years of each id, mapped back onto
# every row with match().
# Adds/overwrites column `Index` in `dt` by reference and returns `dt`.
rank_dr <- function(dt) {
  dt[, Index := {
    distinct_years <- unique(year)
    distinct_ranks <- rank(distinct_years, ties.method = "first")
    distinct_ranks[match(year, distinct_years)]
  }, by = id]
}

# Shortcut for data as clean and ordered as the sample given: the position of
# each year among the distinct years of its id, in order of first appearance.
# Adds/overwrites column `Index` in `dt` by reference and returns `dt`.
match_dr <- function(dt) {
  dt[, Index := {
    seen_years <- unique(year)
    match(year, seen_years)
  }, by = id]
}

# Run every approach on `dat` in turn and line the resulting Index columns up
# next to the expected values (`Ref`) for a visual check.
approaches <- list(
  Ordered = ordered_dr,
  List    = list_dr,
  Dplyr   = dplyr_dr,
  Rle     = rle_drA,
  Rank    = rank_dr,
  Match   = match_dr
)
index_cols <- lapply(approaches, function(rank_fun) rank_fun(dat)[, Index])
do.call(data.table, c(list(Ref = index), index_cols))

##     Ref Ordered List Dplyr Rle Rank Match
##  1:   1       1    1     1   1    1     1
##  2:   2       2    2     2   2    2     2
##  3:   3       3    3     3   3    3     3
##  4:   3       3    3     3   3    3     3
##  5:   4       4    4     4   4    4     4
##  6:   1       1    1     1   1    1     1
##  7:   1       1    1     1   1    1     1
##  8:   1       1    1     1   1    1     1
##  9:   1       1    1     1   1    1     1
## 10:   2       2    2     2   2    2     2

# Time each approach on the sample table, 500 evaluations per expression.
# Every call writes the Index column of `dat` by reference, so the timings
# include that assignment.
microbenchmark( ordered_dr(dat),
                list_dr(dat),
                dplyr_dr(dat),
                rle_drA(dat),
                rank_dr(dat),
                match_dr(dat),
                times=500 )

## Unit: microseconds
##             expr   min    lq median     uq   max neval
##  ordered_dr(dat) 890.8 922.1  946.2  973.0 30831   500
##     list_dr(dat) 755.3 794.6  814.5  837.9  2271   500
##    dplyr_dr(dat) 800.0 830.4  853.9  877.1  2884   500
##     rle_drA(dat) 895.4 934.6  961.6  997.1  2442   500
##     rank_dr(dat) 914.7 954.7  977.1 1012.2  2039   500
##    match_dr(dat) 634.7 656.8  673.1  694.1  1829   500

# Same comparison with rbenchmark: report only the expression and its
# relative timing, sorted by the relative column.
benchmark( ordered_dr(dat),
           list_dr(dat),
           dplyr_dr(dat),
           rle_drA(dat),
           rank_dr(dat),
           match_dr(dat),
           columns=c("test", "relative"),
           order="relative")

##              test relative
## 6   match_dr(dat)    1.000
## 2    list_dr(dat)    1.200
## 3   dplyr_dr(dat)    1.243
## 1 ordered_dr(dat)    1.386
## 4    rle_drA(dat)    1.429
## 5    rank_dr(dat)    1.443

Now, if your datasets are large then calculating the rle (especially twice for single column) may not be the answer.

Your question seems unclear, but if you want to get the 'n'-th value of 'year' within groups of 'id' values where 'n' is the value of 'index':

> dat[ ,nth.yr := year[index], by=id]
> dat
    id year index nth.yr
 1:  1 1982     1   1982
 2:  1 1991     2   1991
 3:  1 1994     3   1994
 4:  1 1994     3   1994
 5:  1 1997     4   1994
 6:  2 1989     1   1989
 7:  2 1989     1   1989
 8:  2 1989     1   1989
 9:  3 1945     1   1945
10:  3 1970     2   1970
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top