Question

I have a data frame, df, that needs to be subset into chunks of 2 names each. In the example below there are 4 unique names: a, b, c, d. I need to split it into 2 one-column matrices: one for a and b, and one for c and d.

Output format:

name1
item_value
item_value
...
END
name2
item_value
item_value
...
END

Example:

# Dummy data: 11 rows, names already in sorted order
# (a x 4, b x 2, c x 3, d x 2), each with a random whole-number item.
df <- data.frame(name = rep(letters[1:4], times = c(4, 2, 3, 2)),
                 item = round(runif(11, min = 1, max = 10)),
                 stringsAsFactors = FALSE)
# Attempted approach: one matrix per name. What is needed is one per 2 names.
lapply(split(df, f = df$name), function(grp) {
  # name header, then the values from the second column, then the terminator
  as.matrix(c(unique(grp$name), grp[, 2], "END"))
})

#expected output
[,1] 
[1,] "a"  
[2,] "8"  
[3,] "9"  
[4,] "6"  
[5,] "4"  
[6,] "END"
[1,] "b"  
[2,] "2"  
[3,] "10" 
[4,] "END"

[,2] 
[1,] "c"  
[2,] "6"  
[3,] "6"  
[4,] "2"  
[5,] "END"
[1,] "d"  
[2,] "4"  
[3,] "1"  
[4,] "END"

Note: Actual df has ~300000 rows with ~35000 unique names.

Was it helpful?

Solution

You may try this.

# For each 'name', build a one-column character matrix holding the name,
# then its 'item' values, then an "END" marker. split() returns the pieces
# as a list named by 'name', so 'l1' has one element per unique name.
l1 <- lapply(split(df, f = df$name), function(x) {
  name <- unique(x$name)
  as.matrix(c(name, x$item, "END"))
})

# Number of chunks of two names. ceiling() keeps a trailing unpaired name
# in a chunk of its own when the number of unique names is odd.
n_chunks <- ceiling(length(l1) / 2)

# Offsets 0, 2, 4, ... into 'l1', one per chunk. They are derived from the
# chunk count so that every name is covered: using n/2 as the upper bound of
# a by-2 sequence only reaches the first half of the names once there are
# more than four. seq_len() also yields an empty sequence for empty input.
steps <- 2 * (seq_len(n_chunks) - 1)

# Loop over 'steps' to bind list elements together, two by two.
l2 <- lapply(steps, function(x) {
  idx <- x + 1:2
  # Drop the position past the end of 'l1' (odd number of names).
  idx <- idx[idx <= length(l1)]
  do.call(rbind, l1[idx])
})

l2
# [[1]]
#      [,1] 
# [1,] "a"  
# [2,] "6"  
# [3,] "4"  
# [4,] "10" 
# [5,] "3"  
# [6,] "END"
# [7,] "b"  
# [8,] "6"  
# [9,] "7"  
# [10,] "END"
# 
# [[2]]
#     [,1] 
# [1,] "c"  
# [2,] "2"  
# [3,] "6"  
# [4,] "10" 
# [5,] "END"
# [6,] "d"  
# [7,] "5"  
# [8,] "4"  
# [9,] "END"

OTHER TIPS

Instead of building the list from individual names, build each element directly from the item column of the relevant subsets of the data.frame:

# Hard-coded illustration of the target: one character vector per pair of
# names, holding each name's second-column values followed by "END".
res <- local({
  items_of <- function(key) df[df$name == key, 2]
  pair_up <- function(first, second) {
    c(items_of(first), "END", items_of(second), "END")
  }
  list("a_b" = pair_up("a", "b"),
       "c_d" = pair_up("c", "d"))
})

# Generalised version: build one "name, items, END, name, items, END" column
# matrix per consecutive pair of unique names.
#
# 'res2' starts as an empty list of length 2 and is then replaced wholesale
# by the result of the outer sapply() on the next statement.
res2 <- vector(mode="list",length=2)
# Outer sapply: one iteration per pair count (number of unique names / 2).
# 'x' is used only as the index of the assignment inside the inner function.
res2 <- sapply(1:(length(unique(df$name))/2),function(x) {
  # Inner sapply: y takes the odd positions 1, 3, ... of the unique names,
  # so y and y+1 address one pair of names. Every outer iteration runs this
  # over all pairs, so the same set of matrices is rebuilt for each x.
  sapply(seq(1,length(unique(df$name))-1,by=2), function(y) {
    name <- unique(df$name)
    # The assignment modifies a function-local copy of 'res2'; what the
    # function returns is the value of the assignment, i.e. the matrix on
    # the right-hand side (name, its column-2 values, "END", twice).
    res2[x] <- as.matrix(c(name[y],df[df$name == name[y],2],"END",name[y+1],df[df$name == name[y+1],2],"END"))
  })
})
# Keep the first column of the combined result.
# NOTE(review): the shape of 'res2' depends on how sapply() simplifies the
# pieces (which in turn depends on their lengths) -- confirm on real data
# that 'res2' is two-dimensional before relying on res2[,1].
answer <- res2[,1]

Because two sapply calls are nested, this gives me a matrix of lists; I think everything you want is in res2[,1].

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top