Question

I want to loop through several dataframes, and certain columns within each dataframe, and separate columns with numbers in brackets with asterisks into two columns. For instance, splitting [123]*** into a column with 123 (no brackets) and column with ***. Here's what I've tried:

require(reshape2)
# create dataframes
  df1 <- data.frame(var=c("item1", "item2"),
                    var.se.1=c("[123]**", "[456]"),
                    var.se.2=c("[1]***", "[45]*"))
  df2 <- data.frame(var=c("item3", "item4"),
                    var.se.1=c("[7]*", "[89]***"),
                    var.se.2=c("[34]**", "[2]"))

I have:

# df1
#     var var.se.1 var.se.2
# 1 item1  [123]**   [1]***
# 2 item2    [456]    [45]*

I want:

#     var var.se.1 var.se.1.ast var.se.2 var.se.2.ast
# 1 item1      123           **        1          ***
# 2 item2      456                    45            *

I tried:

# create list of dataframes
  dfs <- list(df1, df2)
# loop through dataframes
  for (i in 1:length(dfs)) {
    # index columns with .se in the name
      seCols <- grep(".se", names(dfs[[i]]))
    # loop through every column with .se in the name
      for (s in seCols) {
        # remove [, which will leave ] to split on
        # want to get rid of [] anyway
          dfs[[i]][,s] <- gsub("\\[", "", as.character(dfs[[i]][,s]))
        # split into two columns on ]
          dfs[[i]] <- cbind(dfs[[i]],
                            colsplit(dfs[[i]][,s],
                                     pattern = "\\]",
                                     names = c(names(dfs[[i]][s]),
                                               paste(names(dfs[[i]][s]),
                                                     "ast", sep="."))))
      }
  }

The code is mostly doing what I want it to do, but the results are not being stored in the dataframe. If I run the loop and then run dfs[[i]], so when i==2, I get the following:

# i == 2
dfs[[i]]
    var var.se.1 var.se.2 var.se.1 var.se.1.ast var.se.2 var.se.2.ast
1 item3      7]*    34]**        7            *       34           **
2 item4   89]***       2]       89          ***        2 

I need to delete columns 2 and 3, but other than that, it works. Just need to get columns 1, 4, 5, 6, and 7 (column naming is correct) into df1 and df2 (df2 in this case).

UPDATE

My actual use case is more complex than the MRE and breaks @Tyler's code. The problem seems to be related to my real dataframes having different numbers of observations. If we redefine df2 to have 3 rows, and leave df2 to have 2 rows, R will throw an error when running @Tyler's code: "Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 2, 3"

df2 <- data.frame(var=c("item3", "item4", "item5"),
                  var.se.1=c("[7]*", "[89]***", "new"),
                  var.se.2=c("[34]**", "[2]", "rows"))
Was it helpful?

Solution

Here's one approach using the qdap package. I made some tweaks based on the OP's comments. I'd operate out of the list but if you wanted them to be in the global environment I provide that as well.

L1 <- setNames(list(df1, df2), c("df1", "df2"))

library(qdap)

bot_scum <- function(x) identical(character(0), x)
FUN <- function(x) {
    y <- bracketXtract(x)
    y[sapply(y, bot_scum)] <- NA
    as.numeric(unlist(y))
}
FUN2 <- function(x) gsub("[^*]", "", x)
FUN3 <- function(x) cbind.data.frame(FUN(x), FUN2(x))

(L2 <- lapply(L1, function(x) {
    inds <- grep(".se", colnames(x), fixed=TRUE)
    ninds <- grep(".se", colnames(x), fixed=TRUE, invert=TRUE)

    out <- do.call(cbind.data.frame, lapply(inds, function(i) {
        setNames(FUN3(x[, i]), c(colnames(x)[i], paste0(colnames(x)[i], ".ast")))
    }))

    cbind.data.frame(x[, ninds, drop=FALSE],out)
}))

list2env(L2, envir = .GlobalEnv)
df1
df2

## $df1
##     var var.se.1 var.se.1.ast var.se.2 var.se.2.ast
## 1 item1      123           **        1          ***
## 2 item2      456                    45            *
## 
## $df2
##     var var.se.1 var.se.1.ast var.se.2 var.se.2.ast
## 1 item3        7            *       34           **
## 2 item4       89          ***        2             

OTHER TIPS

Not as clean as Tyler's was, but in base:

lapply(
  dfs,
  function(df) {
    proc.cols <- lapply(df[grep("se.", names(df), fixed=TRUE)], 
      function(df.col) {
        df.col.chr <- as.character(df.col)
        matched.vals <- vapply(
          regmatches(df.col.chr, regexec("\\[([0-9]+)\\](\\**)", df.col.chr)),
          `[`, character(2L), -1L
        )
        t(matched.vals)
    } )
    do.call(cbind, c(list(df[!grepl("se.", names(df), fixed=TRUE)]), proc.cols))
} )

Produces:

[[1]]
    var var.se.1.1 var.se.1.2 var.se.2.1 var.se.2.2
1 item1        123         **          1        ***
2 item2        456                    45          *

[[2]]
    var var.se.1.1 var.se.1.2 var.se.2.1 var.se.2.2
1 item3          7          *         34         **
2 item4         89        ***          2           

Basically we loop through the list, then loop through all but the first column of each data frame, and use regmatches to pull out he numbers and the stars.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top