This approach, which appears to run ~40X faster than OP's, uses lookup tables and takes advantage of the extremely fast data table joins. Also, it takes advantage of the fact that, while there may be 1e6 combinations of date and time, there can be at most 86400 unique times, and probably even fewer dates. Finally, it avoids the use of paste(...)
altogether.
library(data.table)
library(stringr)
# create a dataset with 1MM rows
set.seed(1)
x <- 1000*sample(1:1e5,1e6,replace=T)
dt <- data.table(id=1:1e6,
V1=format(as.POSIXct(x,origin="2011-01-01"),"%d/%m/%Y"),
V2=format(as.POSIXct(x,origin="2011-01-01"),"%H:%M:%S"),
V3=x)
DT <- dt
index.date <- function(dt) {
# Edit: this change processes only times from the dataset; slightly more efficient
V2 <- unique(dt$V2)
dt.time <- data.table(char.time=V2,
int.time=as.integer(substr(V2,7,8))+
60*(as.integer(substr(V2,4,5))+
60*as.integer(substr(V2,1,2))))
setkey(dt.time,char.time)
# all dates from dataset
dt.date <- data.table(char.date=unique(dt$V1), int.date=as.integer(as.POSIXct(unique(dt$V1),format="%d/%m/%Y")))
setkey(dt.date,char.date)
# join the dates
setkey(dt,V1)
dt <- dt[dt.date]
# join the times
setkey(dt,V2)
dt <- dt[dt.time, nomatch=0]
# numerical index
dt[,int.index:=int.date+int.time]
# POSIX date index
dt[,index:=as.POSIXct(int.index,origin='1970-01-01')]
# get back original order
setkey(dt,id)
return(dt)
}
# new approach
system.time(dt<-index.date(dt))
# user system elapsed
# 2.26 0.00 2.26
# original approach
DT <- dt
system.time(DT[,`:=`(index= as.POSIXct(paste(V1,V2),
format='%d/%m/%Y %H:%M:%S'),
V1=NULL,V2=NULL)])
# user system elapsed
# 84.33 0.06 84.52
Note that performance does depend on how many unique dates there are. In the test case there were ~1200 unique dates.
EDIT proposition to write the function in more data.table-sugar syntax and avoid "$" for subsetting:
index.date <- function(dt,fmt="%d/%m/%Y") {
dt.time <- data.table(char.time = dt[,unique(V2)],key='char.time')
dt.time[,int.time :=as.integer(substr(char.time,7,8))+
60*(as.integer(substr(char.time,4,5))+
60*as.integer(substr(char.time,1,2)))]
# all dates from dataset
dt.date <- data.table(char.date = dt[,unique(V1)],key='char.date')
dt.date[,int.date:=as.integer(as.POSIXct(char.date,format=fmt))]
# join the dates
setkey(dt,V1)
dt <- dt[dt.date]
# join the times
setkey(dt,V2)
dt <- dt[dt.time, nomatch=0]
# numerical index
dt[,int.index:=int.date+int.time]
# POSIX date index
dt[,index:=as.POSIXct.numeric(int.index,origin='1970-01-01')]
# remove extra/temporary variables
dt[,`:=`(int.index=NULL,int.date=NULL,int.time=NULL)]
}