r data.frame create new variable

https://stackoverflow.com/questions/19228870

30-06-2022
|

Question

I have a dataframe with around 1.5 million rows and 5 cols. One variable (VARIABLE) is of this type NATIONALITY_YEAR (e.g. SPAIN_1998) and I want to split it in two columns, one containing the Nationality, which is the left side of the name before the underscore, and one containing the Year, right side of the underscore. I have tried with concat.split which should be the easiest way:

aa <- concat.split(mydata, "VARIABLE", sep = "_", drop = F)

but after 2 hours running it did not produce any output. I am not sure if I should leave it running for a longer period of time or if there is a non time consuming way to do this.

Any help on the issue would be very much appreciated!

Here is a reproducible (subset!) sample:

mydata<-  structure(list(PROVINCE = c(1L, 4L, 7L, 8L, 11L, 14L, 17L, 20L, 
24L, 28L, 30L, 33L, 36L, 41L, 44L, 46L, 48L, 3L, 6L, 8L, 10L, 
13L, 15L, 18L, 23L, 26L, 29L, 31L, 35L, 38L, 41L, 46L, 47L, 2L, 
4L, 8L, 8L, 11L, 15L, 17L, 21L, 24L, 28L, 30L, 33L, 37L, 41L, 
45L, 46L, 49L, 3L, 6L, 8L, 10L, 13L, 15L, 19L, 23L, 27L, 29L, 
32L, 36L, 39L, 43L, 46L, 48L, 2L, 5L, 8L, 8L, 12L, 15L, 18L, 
21L, 24L, 28L, 30L, 33L, 37L, 41L, 45L, 46L, 50L, 3L, 7L, 8L, 
10L, 14L, 16L, 20L, 23L, 27L, 29L, 32L, 36L, 39L, 43L, 46L, 48L, 
3L, 6L, 8L, 8L, 12L, 15L, 18L, 21L, 25L, 28L, 31L, 34L, 38L, 
41L, 45L, 46L, 50L, 3L, 7L, 8L, 11L, 14L, 17L, 20L, 23L, 27L, 
29L, 33L, 36L, 40L, 43L, 46L, 48L, 3L, 6L, 8L, 9L, 12L, 15L, 
18L, 22L, 25L, 28L, 31L, 35L, 38L, 41L, 45L, 46L, 50L, 4L, 7L, 
8L, 11L, 14L, 17L, 20L, 24L, 28L, 30L, 33L, 36L, 41L, 43L, 46L, 
48L, 3L, 6L, 8L, 10L, 13L, 15L, 18L, 22L, 26L, 28L, 31L, 35L, 
38L, 41L, 46L, 47L, 1L, 4L, 8L, 8L, 11L, 14L, 17L, 20L, 24L, 
28L, 30L, 33L, 36L, 41L, 44L, 46L, 49L, 3L, 6L), AGE5 = structure(c(1L, 
5L, 9L, 7L, 6L, 7L, 5L, 8L, 3L, 3L, 3L, 5L, 8L, 2L, 3L, 6L, 9L, 
5L, 7L, 4L, 3L, 5L, 8L, 8L, 2L, 8L, 2L, 9L, 7L, 9L, 9L, 2L, 7L, 
2L, 9L, 1L, 8L, 8L, 1L, 8L, 1L, 6L, 4L, 6L, 7L, 2L, 3L, 1L, 7L, 
5L, 6L, 9L, 5L, 6L, 8L, 9L, 3L, 4L, 3L, 4L, 4L, 1L, 3L, 1L, 2L, 
2L, 6L, 6L, 2L, 9L, 2L, 2L, 1L, 5L, 9L, 5L, 8L, 9L, 7L, 4L, 3L, 
7L, 2L, 8L, 2L, 6L, 9L, 1L, 5L, 1L, 6L, 6L, 6L, 7L, 3L, 6L, 3L, 
3L, 4L, 1L, 1L, 2L, 9L, 6L, 4L, 3L, 8L, 3L, 7L, 1L, 5L, 2L, 6L, 
6L, 8L, 5L, 9L, 5L, 6L, 2L, 3L, 1L, 4L, 8L, 9L, 8L, 1L, 5L, 1L, 
6L, 4L, 6L, 2L, 3L, 3L, 5L, 9L, 5L, 5L, 4L, 7L, 8L, 4L, 2L, 5L, 
7L, 8L, 9L, 8L, 3L, 7L, 7L, 5L, 6L, 3L, 6L, 1L, 2L, 2L, 3L, 7L, 
1L, 9L, 5L, 8L, 4L, 5L, 4L, 1L, 3L, 7L, 7L, 9L, 3L, 9L, 7L, 5L, 
7L, 8L, 1L, 4L, 4L, 6L, 1L, 8L, 7L, 8L, 6L, 8L, 4L, 3L, 4L, 5L, 
9L, 2L, 6L, 6L, 1L, 5L, 7L), .Label = c("10-14", "15-19", "20-24", 
"25-29", "30-34", "35-39", "40-44", "45-49", "50-54"), class = "factor"), 
ZONA91OK = c(101L, 4079L, 712L, 8205L, 11022L, 14021L, 1714L, 
20067L, 2414L, 2810L, 300799L, 3305L, 36026L, 41024L, 4405L, 
4607L, 48015L, 308L, 610L, 8121L, 1006L, 1307L, 1511L, 1813L, 
2308L, 2605L, 2910L, 310799L, 35026L, 3811L, 411199L, 4601L, 
4708L, 202L, 405L, 8015L, 837L, 11033L, 1502L, 1702L, 2112L, 
2408L, 28047L, 30015L, 3305L, 3709L, 410199L, 4511L, 1202L, 
490699L, 3063L, 610L, 827L, 1006L, 1301L, 15036L, 1901L, 
2310L, 2709L, 29025L, 3201L, 36008L, 390899L, 4301L, 46184L, 
4805L, 206L, 504L, 817L, 813L, 12135L, 1519L, 1810L, 2104L, 
2402L, 28130L, 30030L, 3305L, 3707L, 411399L, 45165L, 46181L, 
5008L, 305L, 7026L, 803L, 1006L, 1413L, 16078L, 200999L, 
2312L, 2712L, 29069L, 3210L, 3616L, 391199L, 4313L, 46105L, 
4805L, 310L, 6153L, 8252L, 8205L, 1205L, 1505L, 1808L, 2110L, 
2508L, 2810L, 311399L, 3405L, 3807L, 41024L, 4507L, 46102L, 
500599L, 3014L, 706L, 8121L, 11028L, 14042L, 1712L, 20045L, 
2314L, 27031L, 29901L, 33024L, 3614L, 400199L, 4307L, 46021L, 
4805L, 3066L, 6153L, 8015L, 901L, 12040L, 1522L, 1806L, 2203L, 
2508L, 28047L, 311099L, 35004L, 3801L, 410199L, 4515L, 46017L, 
501199L, 407L, 7027L, 827L, 1102L, 1404L, 17155L, 200599L, 
24089L, 2812L, 30019L, 33024L, 3612L, 41038L, 4301L, 4628L, 
4805L, 307L, 6153L, 817L, 1004L, 1309L, 1508L, 1804L, 2206L, 
2606L, 28130L, 310799L, 35011L, 38022L, 411399L, 4622L, 4701L, 
1036L, 4079L, 807L, 803L, 1108L, 1410L, 1708L, 201399L, 2410L, 
28058L, 30043L, 33024L, 3610L, 410399L, 4401L, 4621L, 490499L, 
3059L, 6153L), VARIABLE = structure(c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L), .Label = c("SPAIN_1998", 
"EU15DC_1998", "ROE_1998", "MAGREB_1998", "SSA_1998", "LA_1998", 
"ASIA_1998", "ROW_1998", "Total_1998", "SPAIN_1999", "EU15DC_1999", 
"ROE_1999", "MAGREB_1999", "SSA_1999", "LA_1999", "ASIA_1999", 
"ROW_1999", "Total_1999", "SPAIN_2000", "EU15DC_2000", "ROE_2000", 
"MAGREB_2000", "SSA_2000", "LA_2000", "ASIA_2000", "ROW_2000", 
"Total_2000", "SPAIN_2001", "EU15DC_2001", "ROE_2001", "MAGREB_2001", 
"SSA_2001", "LA_2001", "ASIA_2001", "ROW_2001", "Total_2001", 
"SPAIN_2002", "EU15DC_2002", "ROE_2002", "MAGREB_2002", "SSA_2002", 
"LA_2002", "ASIA_2002", "ROW_2002", "Total_2002", "SPAIN_2003", 
"EU15DC_2003", "ROE_2003", "MAGREB_2003", "SSA_2003", "LA_2003", 
"ASIA_2003", "ROW_2003", "Total_2003", "SPAIN_2004", "EU15DC_2004", 
"ROE_2004", "MAGREB_2004", "SSA_2004", "LA_2004", "ASIA_2004", 
"ROW_2004", "Total_2004", "SPAIN_2005", "EU15DC_2005", "ROE_2005", 
"MAGREB_2005", "SSA_2005", "LA_2005", "ASIA_2005", "ROW_2005", 
"Total_2005", "SPAIN_2006", "EU15DC_2006", "ROE_2006", "MAGREB_2006", 
"SSA_2006", "LA_2006", "ASIA_2006", "ROW_2006", "Total_2006", 
"SPAIN_2007", "EU15DC_2007", "ROE_2007", "MAGREB_2007", "SSA_2007", 
"LA_2007", "ASIA_2007", "ROW_2007", "Total_2007", "SPAIN_2008", 
"EU15DC_2008", "ROE_2008", "MAGREB_2008", "SSA_2008", "LA_2008", 
"ASIA_2008", "ROW_2008", "Total_2008", "SPAIN_2009", "EU15DC_2009", 
"ROE_2009", "MAGREB_2009", "SSA_2009", "LA_2009", "ASIA_2009", 
"ROW_2009", "Total_2009", "SPAIN_2010", "EU15DC_2010", "ROE_2010", 
"MAGREB_2010", "SSA_2010", "LA_2010", "ASIA_2010", "ROW_2010", 
"Total_2010", "SPAIN_2011", "EU15DC_2011", "ROE_2011", "MAGREB_2011", 
"SSA_2011", "LA_2011", "ASIA_2011", "ROW_2011", "Total_2011", 
"SPAIN_2012", "EU15DC_2012", "ROE_2012", "MAGREB_2012", "SSA_2012", 
"LA_2012", "ASIA_2012", "ROW_2012", "Total_2012", "NOTSPAIN_1998", 
"NOTSPAIN_1999", "NOTSPAIN_2000", "NOTSPAIN_2001", "NOTSPAIN_2002", 
"NOTSPAIN_2003", "NOTSPAIN_2004", "NOTSPAIN_2005", "NOTSPAIN_2006", 
"NOTSPAIN_2007", "NOTSPAIN_2008", "NOTSPAIN_2009", "NOTSPAIN_2010", 
"NOTSPAIN_2011", "NOTSPAIN_2012", "AFRICA_1998", "AFRICA_1999", 
"AFRICA_2000", "AFRICA_2001", "AFRICA_2002", "AFRICA_2003", 
"AFRICA_2004", "AFRICA_2005", "AFRICA_2006", "AFRICA_2007", 
"AFRICA_2008", "AFRICA_2009", "AFRICA_2010", "AFRICA_2011", 
"AFRICA_2012", "DWC_1998", "DWC_1999", "DWC_2000", "DWC_2001", 
"DWC_2002", "DWC_2003", "DWC_2004", "DWC_2005", "DWC_2006", 
"DWC_2007", "DWC_2008", "DWC_2009", "DWC_2010", "DWC_2011", 
"DWC_2012"), class = "factor"), FREQUENCY = c(614, 1943, 
59, 201, 188, 10859, 93, 
1494, 60, 1001, 1000, 689, 675, 934, 51, 
1240, 165, 13, 0, 14, 2, 2, 
2, 0, 3, 0, 40, 1, 18, 41, 1, 0, 3, 0, 0, 0, 1, 0, 
0, 0, 0, 0, 7, 1, 0, 0, 0, 0, 0, 0, 0, 0, 80, 0, 
0, 0, 4, 0, 0, 15, 0, 0, 1, 1, 3, 4, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 2, 11, 0, 0, 0, 3, 2, 1, 5, 
64, 1, 4, 1, 3, 4, 8, 1, 1, 1, 1, 0, 0, 0, 
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 2173, 907, 9059, 839, 
4303, 100, 1727, 663, 694, 1210, 623, 
1261, 772, 697, 490, 1031, 490, 956, 704, 
1293, 1011, 739, 927, 755, 3340, 1190, 1254, 12880, 528, 
3244, 277, 892, 837, 1, 2, 10, 1, 1, 2, 2, 0, 0, 1, 8, 3, 
12, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0, 1, 12, 0, 7, 0, 0, 0, 
0, 0, 5, 2)), .Names = c("PROVINCE", "AGE5", "ZONA91OK", 
"VARIABLE", "FREQUENCY"), row.names = c(1L, 501L, 1001L, 1501L, 
2001L, 2501L, 3001L, 3501L, 4001L, 4501L, 5001L, 5501L, 6001L, 
6501L, 7001L, 7501L, 8001L, 8501L, 9001L, 9501L, 10001L, 10501L, 
11001L, 11501L, 12001L, 12501L, 13001L, 13501L, 14001L, 14501L, 
15001L, 15501L, 16001L, 16501L, 17001L, 17501L, 18001L, 18501L, 
19001L, 19501L, 20001L, 20501L, 21001L, 21501L, 22001L, 22501L, 
23001L, 23501L, 24001L, 24501L, 25001L, 25501L, 26001L, 26501L, 
27001L, 27501L, 28001L, 28501L, 29001L, 29501L, 30001L, 30501L, 
31001L, 31501L, 32001L, 32501L, 33001L, 33501L, 34001L, 34501L, 
35001L, 35501L, 36001L, 36501L, 37001L, 37501L, 38001L, 38501L, 
39001L, 39501L, 40001L, 40501L, 41001L, 41501L, 42001L, 42501L, 
43001L, 43501L, 44001L, 44501L, 45001L, 45501L, 46001L, 46501L, 
47001L, 47501L, 48001L, 48501L, 49001L, 49501L, 50001L, 50501L, 
51001L, 51501L, 52001L, 52501L, 53001L, 53501L, 54001L, 54501L, 
55001L, 55501L, 56001L, 56501L, 57001L, 57501L, 58001L, 58501L, 
59001L, 59501L, 60001L, 60501L, 61001L, 61501L, 62001L, 62501L, 
63001L, 63501L, 64001L, 64501L, 65001L, 65501L, 66001L, 66501L, 
67001L, 67501L, 68001L, 68501L, 69001L, 69501L, 70001L, 70501L, 
71001L, 71501L, 72001L, 72501L, 73001L, 73501L, 74001L, 74501L, 
75001L, 75501L, 76001L, 76501L, 77001L, 77501L, 78001L, 78501L, 
79001L, 79501L, 80001L, 80501L, 81001L, 81501L, 82001L, 82501L, 
83001L, 83501L, 84001L, 84501L, 85001L, 85501L, 86001L, 86501L, 
87001L, 87501L, 88001L, 88501L, 89001L, 89501L, 90001L, 90501L, 
91001L, 91501L, 92001L, 92501L, 93001L, 93501L, 94001L, 94501L, 
95001L, 95501L, 96001L, 96501L, 97001L, 97501L, 98001L, 98501L, 
99001L, 99501L), class = "data.frame")

Solution

Try this instead:

library(data.table)
dt = data.table(mydata)

dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
          YEAR        = sub('(.*)_(.*)', '\\2', VARIABLE))]

OTHER TIPS

It seems like I need to look into updating my concat.split functions!

The version of the function that you tried to use makes use of read.table, which does tend to struggle with large datasets. I had used read.table because it has a convenient text argument that lets you specify a column in a data.frame as the input. This is really convenient when working with small-ish datasets, but evidently not with larger ones :)

As far as I can tell, fread from the "data.table" package doesn't have a similar feature, but since R tends to write files pretty quickly, I thought that it would be worth trying a similar approach as what I used in concat.split with fread instead of read.table.

Here's the concept:

Write the variable that needs to be split to a new file.
Use the blazing fast fread to read it back in.
Wait for fread to get a text argument somewhere down the line?

Here's that concept as a function (updated with edits as per @eddi's suggestions in the comments):

csDataTable <- function(dataset, splitcol, sep, drop = FALSE) {
  if (is.numeric(splitcol)) splitcol <- names(dataset)[splitcol]
  if (!is.data.table(dataset)) dataset <- data.table(dataset)
  if (sep == ".") {
    dataset[, (splitcol) := gsub(".", "|", get(splitcol), fixed = TRUE)]
    sep <- "|"
  }
  if (!is.character(dataset[[splitcol]])) {
    dataset[, (splitcol) := as.character(get(splitcol))]
  }
  x <- tempfile()
  writeLines(dataset[[splitcol]], x)
  Split <- fread(x, sep=sep, header = FALSE)
  setnames(Split, paste(splitcol, seq_along(Split), sep = "_"))
  if (isTRUE(drop)) dataset[, (splitcol) := NULL]
  cbind(dataset, Split)
}

Here's the function in action:

## Expand your sample data to 1.5 million rows to test
out <- mydata[rep(rownames(mydata), 1500000/nrow(mydata)), ]

csDataTable(out, "VARIABLE", "_")
#          PROVINCE  AGE5 ZONA91OK    VARIABLE FREQUENCY VARIABLE_1 VARIABLE_2
#       1:        1 10-14      101  SPAIN_1998       614      SPAIN       1998
#       2:        4 30-34     4079  SPAIN_1998      1943      SPAIN       1998
#       3:        7 50-54      712  SPAIN_1998        59      SPAIN       1998
#       4:        8 40-44     8205  SPAIN_1998       201      SPAIN       1998
#       5:       11 35-39    11022  SPAIN_1998       188      SPAIN       1998
#      ---                                                                    
# 1499996:       44 35-39     4401    ROE_1999         0        ROE       1999
# 1499997:       46 35-39     4621    ROE_1999         0        ROE       1999
# 1499998:       49 10-14   490499    ROE_1999         0        ROE       1999
# 1499999:        3 30-34     3059 MAGREB_1999         5     MAGREB       1999
# 1500000:        6 40-44     6153 MAGREB_1999         2     MAGREB       1999

In this test, at least, the solution fares much better than I expected:

subFun <- function() {
  dt = data.table(out)
  dt[, `:=`(NATIONALITY = sub('(.*)_(.*)', '\\1', VARIABLE),
            YEAR        = sub('(.*)_(.*)', '\\2', VARIABLE))]
} 
freadFun <- function() {
  csDataTable(out, "VARIABLE", "_")
}

library(microbenchmark)
microbenchmark(subFun(), freadFun(), times = 20)
# Unit: seconds
#        expr      min       lq   median       uq      max neval
#    subFun() 3.814174 4.244820 4.273834 4.345358 4.480520    20
#  freadFun() 1.356533 2.064262 2.152159 2.226465 2.300886    20

Here is some solution with splitting factor labels

VARIABLE_LEVELS <- cbind("VARIABLE"=levels(mydata$VARIABLE),
                         as.data.frame(do.call("rbind",
                                       strsplit(levels(mydata$VARIABLE), split="_")))
mydata <- merge(mydata, VARIABLE_LEVELS)
#
# Insted of merege you can use VARIABLE (in mydata) as index
#
mydata <- cbind(mydata, VARIABLE_LEVELS[as.integer(mydata$VARIABLE),c("V1","V2")])

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow