I don't work with datasets anywhere near this scale, so I have no idea if this is going to be of use or not. One thing that comes to mind is to use a matrix
and matrix indexing.
Since I'm impatient, I've only tried it on 1e5 rows on my slow system :-)
Create your sample data
big_n <- data.table(Subscription_Id = rep(n[,Subscription_Id],1e5),
Account_Desc = rep(n[,Account_Desc],1e5))
Write a function to create your matrix
StringMat <- function(input) {
Temp <- strsplit(input, " ", fixed = TRUE)
Lens <- vapply(Temp, length, 1L)
A <- unlist(Temp, use.names = FALSE)
Rows <- rep(sequence(length(Temp)), Lens)
Cols <- sequence(Lens)
m <- matrix(NA, nrow = length(Temp), ncol = max(Lens),
dimnames = list(NULL, paste0("V", sequence(max(Lens)))))
m[cbind(Rows, Cols)] <- A
m
}
Time it and view the output
system.time(outB1 <- cbind(big_n, StringMat(big_n$Account_Desc)))
# user system elapsed
# 4.524 0.000 4.533
outB1
# Subscription_Id Account_Desc V1 V2 V3
# 1: 13.855.231.846.091.000 AGUAYO CARLA AGUAYO CARLA NA
# 2: 11.156.048.529.090.800 LEIVA LILIANA LEIVA LILIANA NA
# 3: 24.940.584.090.830 FULLANA MARIA LAURA FULLANA MARIA LAURA
# 4: 242.753.039.111.124 PETREL SERGIO PETREL SERGIO NA
# 5: 27.843.782.090.830 IPTICKET SRL IPTICKET SRL NA
# ---
# 999996: 13.773.513.145.090.800 LEDESMA ORLANDO LEDESMA ORLANDO NA
# 999997: 25.691.374.090.830 CATTANEO LUIS RAUL CATTANEO LUIS RAUL
# 999998: 12.236.174.155.090.900 CABRAL CARMEN ESTELA CABRAL CARMEN ESTELA
# 999999: 252.027.904.121.210 ITURGOYEN HECTOR ITURGOYEN HECTOR NA
# 1000000: 11.136.991.054.110.100 CASA CASILDO CASA CASILDO NA
Correct the set_method
function and compare timings
set_method <- function(DT){
cols = paste0("V",1:3)
for(j in 1:3){
set(DT,i=NULL,j=cols[j],
value = sapply(strsplit(as.character(DT[, Account_Desc, with = TRUE]),
"[ +]"), "[", j))
}
}
system.time(set_method(big_n))
# user system elapsed
# 25.319 0.022 25.586
Reset the "big_n" dataset and try out str_split_fixed
(ouch!)
big_n[, c("V1", "V2", "V3") := NULL]
library(stringr)
system.time(outBrodie <- cbind(big_n, as.data.table(str_split_fixed(
big_n$Account_Desc, "[ +]", 4))))
# user system elapsed
# 204.966 0.514 206.910