Index of year using data.table

Question 1

You can use rle():

# Number the runs of consecutive equal `year` values within each `id`.
# The run ids are derived from the same rle() result that supplies the repeat
# counts, so both always have the same length -- even when a year shows up
# again later in the same id (counting unique(year) would then come up short
# and rep() would fail).
dat[, index2 := {
  runs <- rle(year)
  rep(seq_along(runs$lengths), runs$lengths)
}, by = id]

I hope that helps.

Question 2

What it looks like you are calculating is a type of rank called a dense rank. The differences between the various ranking types are fairly straightforward:

# Dense rank: equal values share a rank and consecutive distinct values get
# consecutive ranks (no gaps after ties).
#
# x: an atomic vector that rank() can order.
# Returns an integer vector the same length as x.
dense_rank <- function(x) {
  distinct_values <- unique(x)
  # Rank each distinct value once; there are no ties left at this point.
  distinct_ranks <- rank(distinct_values, ties.method = "first")
  # Map every element of x back to the rank of its distinct value.
  distinct_ranks[match(x, distinct_values)]
}

# Sorted sample with two groups of ties (three 2s and four 6s).
x <- c(1, 2, 2, 2, 3, 5, 6, 6, 6, 6)

# Row label -> ties.method passed to rank().
tie_methods <- c(
  avg = "average",
  random = "random",
  min = "min",
  max = "max",
  first = "first"
)

# One row per ranking flavour: rank()'s default first, then each explicit
# tie-breaking rule, and finally the dense rank for comparison.
rbind(
  normal = rank(x),
  t(vapply(
    tie_methods,
    function(method) rank(x, ties.method = method),
    numeric(length(x))
  )),
  dense = dense_rank(x)
)

##        [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## normal    1    3    3    3    5    6  8.5  8.5  8.5   8.5
## avg       1    3    3    3    5    6  8.5  8.5  8.5   8.5
## random    1    4    2    3    5    6 10.0  8.0  9.0   7.0
## min       1    2    2    2    5    6  7.0  7.0  7.0   7.0
## max       1    4    4    4    5    6 10.0 10.0 10.0  10.0
## first     1    2    3    4    5    6  7.0  8.0  9.0  10.0
## dense     1    2    2    2    3    4  5.0  5.0  5.0   5.0

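Notice that the dense rank has the same tie pattern as the min rank (tied values share a single rank), but it leaves no gaps between consecutive ranks. Because `x` is sorted here, the same numbering can also be produced with run length encoding (rle), as shown in an earlier answer.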

# Give every run of consecutive equal values in `x` its own id.
# rle() is evaluated once, and seq_along() keeps the result well-defined
# (an empty vector) when `x` is empty, where 1:length(...) would produce c(1, 0).
runs <- rle(x)
rep(seq_along(runs$lengths), runs$lengths)

##  [1] 1 2 2 2 3 4 5 5 5 5

If your data sets are small or the ranking is rarely used then there are a variety of ways to accomplish the goal. Here are timings on a few of those methods.

library(data.table, quietly = TRUE)
suppressMessages( library(dplyr, quietly = TRUE) )
library(rbenchmark)
library( microbenchmark )

# Sample data: three ids with 5, 3 and 2 rows respectively.
id <- rep(c(1, 2, 3), times = c(5, 3, 2))
year <- c(
  1982, 1991, 1994, 1994, 1997,  # id 1
  1989, 1989, 1989,              # id 2
  1945, 1970                     # id 3
)
# Hand-written expected index for each row, laid out per id like `year`.
index <- c(
  1, 2, 3, 3, 4,
  1, 1, 1,
  1, 2
)
dat <- data.table(id = id, year = year)

# Dense rank of `year` within each `id`, via an ordered factor of the ranks.
#
# dt: a data.table with columns `id` and `year`.
# Adds (or overwrites) the integer column `Index` in `dt` by reference and
# returns `dt`.
ordered_dr <- function(dt) {
  # Tied years must share one rank so that ordered() folds them into a single
  # level; "min" does that. The tie rule is passed by name because rank()'s
  # second positional argument is `na.last`, not `ties.method`.
  dt[, Index := as.integer(ordered(rank(year, ties.method = "min"))), by = id]
}
# Dense rank of `year` within each `id`, built on sort.list().
#
# dt: a data.table with columns `id` and `year`.
# Adds (or overwrites) the integer column `Index` in `dt` by reference and
# returns `dt`.
list_dr <- function(dt) {
  dt[, Index := {
    distinct_years <- unique(year)
    # sort.list() yields the ordering permutation, which equals the ranks only
    # for already-sorted input. Applying it twice converts the permutation
    # into ranks, so unsorted years are handled correctly too.
    distinct_ranks <- sort.list(sort.list(distinct_years))
    distinct_ranks[match(year, distinct_years)]
  }, by = id]
}
# Dense rank of `year` within each `id`, using dplyr's implementation.
#
# dt: a data.table with columns `id` and `year`.
# Adds (or overwrites) the column `Index` in `dt` by reference and returns `dt`.
dplyr_dr <- function(dt) {
  # Namespace-qualified so that a `dense_rank` defined in the global
  # environment cannot mask the dplyr function this method is meant to use.
  dt[, Index := dplyr::dense_rank(year), by = id]
}
# Run-length based index of `year` within each `id`: every run of consecutive
# equal years gets the next integer. This matches the dense rank only when
# the years are sorted within each id.
#
# dt: a data.table with columns `id` and `year`.
# Adds (or overwrites) the integer column `Index` in `dt` by reference and
# returns `dt`.
rle_drA <- function(dt) {
  # seq_along() instead of 1:length() so a zero-length input cannot yield the
  # sequence c(1, 0).
  # NOTE: rle(year) is evaluated twice per group here; cache the result in a
  # local variable if this is reused where speed matters.
  dt[, Index := rep(seq_along(rle(year)$values), rle(year)$lengths), by = id]
}
# Dense rank of `year` within each `id`: rank the distinct years, then look
# up each row's year among them.
#
# dt: a data.table with columns `id` and `year`.
# Adds (or overwrites) the integer column `Index` in `dt` by reference and
# returns `dt`.
rank_dr <- function(dt) {
  dt[, Index := {
    u <- unique(year)
    # No ties remain among the distinct years, so "first" yields 1..length(u).
    distinct_ranks <- rank(u, ties.method = "first")
    distinct_ranks[match(year, u)]
  }, by = id]
}

# Shortcut for data that is as clean and ordered as the sample: the index is
# simply the position of each year among the distinct years of its id, in
# order of first appearance.
#
# dt: a data.table with columns `id` and `year`.
# Adds (or overwrites) the integer column `Index` in `dt` by reference and
# returns `dt`.
match_dr <- function(dt) {
  dt[, Index := {
    seen_years <- unique(year)
    match(year, seen_years)
  }, by = id]
}

# Side-by-side check of every method against the reference `index` vector.
# Each helper writes its result into the `Index` column of `dat` with `:=`
# (overwriting what the previous helper left there), and `[,Index]` pulls that
# column out right away, so each method ends up in its own column here.
data.table( Ref=index,
            Ordered=ordered_dr(dat)[,Index],
            List=list_dr(dat)[,Index],
            Dplyr=dplyr_dr(dat)[,Index],
            Rle=rle_drA(dat)[,Index],
            Rank=rank_dr(dat)[,Index],
            Match=match_dr(dat)[,Index] )

##     Ref Ordered List Dplyr Rle Rank Match
##  1:   1       1    1     1   1    1     1
##  2:   2       2    2     2   2    2     2
##  3:   3       3    3     3   3    3     3
##  4:   3       3    3     3   3    3     3
##  5:   4       4    4     4   4    4     4
##  6:   1       1    1     1   1    1     1
##  7:   1       1    1     1   1    1     1
##  8:   1       1    1     1   1    1     1
##  9:   1       1    1     1   1    1     1
## 10:   2       2    2     2   2    2     2

# Time each method on the sample data, 500 evaluations per expression.
# Every call rewrites the `Index` column of `dat`; only the timings are of
# interest here.
microbenchmark( ordered_dr(dat),
                list_dr(dat),
                dplyr_dr(dat),
                rle_drA(dat),
                rank_dr(dat),
                match_dr(dat),
                times=500 )

## Unit: microseconds
##             expr   min    lq median     uq   max neval
##  ordered_dr(dat) 890.8 922.1  946.2  973.0 30831   500
##     list_dr(dat) 755.3 794.6  814.5  837.9  2271   500
##    dplyr_dr(dat) 800.0 830.4  853.9  877.1  2884   500
##     rle_drA(dat) 895.4 934.6  961.6  997.1  2442   500
##     rank_dr(dat) 914.7 954.7  977.1 1012.2  2039   500
##    match_dr(dat) 634.7 656.8  673.1  694.1  1829   500

# Same comparison with rbenchmark, reporting only the test name and its time
# relative to the fastest method, sorted from fastest to slowest.
benchmark( ordered_dr(dat),
           list_dr(dat),
           dplyr_dr(dat),
           rle_drA(dat),
           rank_dr(dat),
           match_dr(dat),
           columns=c("test", "relative"),
           order="relative")

##              test relative
## 6   match_dr(dat)    1.000
## 2    list_dr(dat)    1.200
## 3   dplyr_dr(dat)    1.243
## 1 ordered_dr(dat)    1.386
## 4    rle_drA(dat)    1.429
## 5    rank_dr(dat)    1.443

Now, if your datasets are large, then calculating the rle (especially twice for a single column) may not be the answer.

Question 3

Your question seems unclear, but if you want to get the 'n'-th value of 'year' within groups of 'id' values where 'n' is the value of 'index':

> dat[ ,nth.yr := year[index], by=id]
> dat
    id year index nth.yr
 1:  1 1982     1   1982
 2:  1 1991     2   1991
 3:  1 1994     3   1994
 4:  1 1994     3   1994
 5:  1 1997     4   1994
 6:  2 1989     1   1989
 7:  2 1989     1   1989
 8:  2 1989     1   1989
 9:  3 1945     1   1945
10:  3 1970     2   1970