Question

I am currently trying to parse RDP multiclassifier hierarchy files in R, however the problem is more generally applicable. Basically I create a list which contains data-frames of several files which contain "hierarchical" rows:

dput(corner(hierlist$hier_M2MID06_Trimmed_noGaps.fas_fixrank.txt,n=c(7,10)))
structure(list(X1 = structure(c(30L, 31L, 163L, 45L, 64L, 65L, 
66L), .Label = c("-1071", "-1102", "-1153", "-1159", "-1176", 
"-1177", "-1207", "-1241", "-1256", "-1281", "-1332", "-1353", 
"-1354", "-1502", "-1567", "-18", "-2", "-2715", "-423", "-460", 
"-463", "-471", "-567", "-568", "-828", "-842", "-843", "-871", 
"-980", "0", "1", "1031", "1069", "1070", "1093", "1101", "1126", 
"1151", "1152", "1158", "1159", "1164", "1165", "1166", "1175", 
"1176", "1195", "1200", "1206", "1207", "1215", "1216", "1217", 
"1219", "1240", "1251", "1255", "1256", "1261", "1269", "1279", 
"1280", "1282", "1330", "1331", "1339", "1341", "1343", "1348", 
"1352", "1353", "1354", "1355", "1356", "1357", "1358", "1360", 
"1501", "1566", "16", "1668", "1672", "1674", "17", "1762", "1763", 
"1764", "1767", "1883", "1884", "1885", "1891", "1893", "1894", 
"2", "2164", "2179", "2180", "2183", "2184", "2187", "2192", 
"2195", "2208", "2209", "2210", "2211", "2259", "2260", "2333", 
"2371", "2372", "254", "255", "261", "264", "2684", "2713", "2714", 
"274", "3", "35", "422", "458", "459", "46", "462", "470", "48", 
"49", "54", "565", "566", "567", "570", "577", "581", "648", 
"653", "657", "659", "804", "805", "806", "807", "808", "817", 
"818", "819", "820", "822", "824", "825", "826", "827", "829", 
"832", "834", "837", "838", "839", "840", "841", "842", "843", 
"844", "846", "848", "870", "886", "887", "908", "918", "927", 
"929", "950", "957", "978", "979", "taxid"), class = "factor"), 
X2 = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Root", 
"lineage", "null"), class = "factor"), X3 = structure(c(1L, 
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Root", "name", "rootrank"
), class = "factor"), X4 = structure(c(2L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = c("Bacteria", "no rank", "rank"), class = "factor"), 
X5 = structure(c(1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("194", 
"M2MID06_Trimmed_noGaps.fas", "domain"), class = "factor"), 
X6 = structure(c(NA, NA, 10L, 10L, 10L, 10L, 10L), .Label = c("", 
"Acidobacteria", "Actinobacteria", "Bacteroidetes", "Cyanobacteria/Chloroplast", 
"Firmicutes", "Gemmatimonadetes", "Nitrospira", "Planctomycetes", 
"Proteobacteria", "Spirochaetes", "Verrucomicrobia", "unclassified_Bacteria"
), class = "factor"), X7 = structure(c(NA, 2L, 3L, 3L, 3L, 
3L, 3L), .Label = c("", "Bacteria", "phylum"), class = "factor"), 
X8 = structure(c(NA, 21L, NA, 8L, 8L, 8L, 8L), .Label = c("", 
"Acidobacteria_Gp3", "Acidobacteria_Gp4", "Actinobacteria", 
"Alphaproteobacteria", "Bacilli", "Bacteroidetes_incertae_sedis", 
"Betaproteobacteria", "Chloroplast", "Deltaproteobacteria", 
"Flavobacteria", "Gammaproteobacteria", "Gemmatimonadetes", 
"Nitrospira", "Phycisphaerae", "Planctomycetacia", "Sphingobacteria", 
"Spirochaetes", "Subdivision3", "Verrucomicrobiae", "domain", 
"unclassified_Bacteroidetes", "unclassified_Proteobacteria"
), class = "factor"), X9 = structure(c(NA, 2L, 11L, 14L, 
14L, 14L, 14L), .Label = c("", "194", "Acidobacteria", "Actinobacteria", 
"Bacteroidetes", "Cyanobacteria/Chloroplast", "Firmicutes", 
"Gemmatimonadetes", "Nitrospira", "Planctomycetes", "Proteobacteria", 
"Spirochaetes", "Verrucomicrobia", "class", "unclassified_Bacteria"
), class = "factor"), X10 = structure(c(NA, NA, 29L, NA, 
22L, 22L, 22L), .Label = c("", "Actinobacteridae", "Bdellovibrionales", 
"Burkholderiales", "Caulobacterales", "Chloroplast", "Chromatiales", 
"Flavobacteriales", "Gemmatimonadales", "Gp3", "Gp4", "Lactobacillales", 
"Legionellales", "Methylophilales", "Nitrospirales", "Ohtaekwangia", 
"Phycisphaerales", "Planctomycetales", "Pseudomonadales", 
"Rhizobiales", "Rhodobacterales", "Rhodocyclales", "Rhodospirillales", 
"Sphingobacteriales", "Sphingomonadales", "Spirochaetales", 
"Subdivision3_genera_incertae_sedis", "Verrucomicrobiales", 
"phylum", "unclassified_Alphaproteobacteria", "unclassified_Betaproteobacteria", 
"unclassified_Deltaproteobacteria", "unclassified_Gammaproteobacteria"
), class = "factor")), .Names = c("X1", "X2", "X3", "X4", 
"X5", "X6", "X7", "X8", "X9", "X10"), row.names = 2:8, class = "data.frame")

This basically means I have progressive rows which are filled up with NA in progressive columns. However, there is no way of telling of a specific row where the first NA will be. Preceding this first NA column, I have the two columns which actually interest me: a count of the number of contigs at a specified taxonomical level, and two columns before it the taxonomic level's name.

I already created a list containing the indices for each dataframe that would select the last row by:

library(plyr)
lastcollist<-lapply(hierlist,function(p)lapply(apply(p, 1, function(x) which(!is.na(x)) ),function(x)if(length(x)>0){max(x)}else{0}))
lastcollist<-lapply(lastcollist,unlist)
lastcollist.idx<-llply(lastcollist,function(x)cbind(seq(1,length(x)),x))

Here lastcollist.idx will contain the indices of each row with the last non-NA column:

head(lastcollist.idx$hier_M2MID06_Trimmed_noGaps.fas_fixrank.txt)
        x
[1,] 1  5
[2,] 2  5
[3,] 3  9
[4,] 4 11
[5,] 5 13
[6,] 6 15

So what I would basically want to do now is create a new list which contains dataframes (or in the case of only the last column, variable x in lastcollist.idx) that have for each given row the last selected column.

This would be the desired output for the given example:

 dput(rbind(c('domain','194'),c('Proteobacteria','Phylum'),c('Betaproteobacteria','class'),c ('class','Rhodocyclales'),c('class','Rhodocyclales'),c('class','Rhodocyclales')))
structure(c("domain", "Proteobacteria", "Betaproteobacteria", 
"class", "class", "class", "194", "Phylum", "class", "Rhodocyclales", 
"Rhodocyclales", "Rhodocyclales"), .Dim = c(6L, 2L))

And I have to admit, I wouldn't immediately know how to do so. Any pointers are warmly welcomed. I am not a novice to R, so you don't have to go through great lengths in your explanation.

For a larger reproducible example consider the dataset 'khanmiss' from the bioconductor library impute (bioconductor library impute).

 source("http://bioconductor.org/biocLite.R")
 biocLite("impute")
 require(impute)
 data(khanmiss)

Which is basically a dataframe that has NAs introduced in several places. It is not exactly the same hierarchical structure as my file but it will fit the purpose. As this is a very inconvenient dataframe with 2309 observations and only 222 of it's rows contain missing values, I selected the rows with missing values and added 78 rows randomly that did not have missing values in a new data.frame. This data.frame was then split into a list of 4 dataframes of arbirtraty size (adding up to 300).

isnadf<-as.data.frame(which(is.na(khanmiss),arr.ind=T))
na.rows<-sort(unique(isnadf$row))
length(na.rows) #the dataset has 222 rows which contain NA
na.khanmiss<-khanmiss[na.rows,]
notna.rows<-setdiff(rownames(khanmiss),na.rows)
notna.rows.selected<-sort(as.numeric(sample(notna.rows,78)))
notna.selected.khanmiss<-khanmiss[notna.rows.selected,]
khanmiss.selected<-rbind(na.khanmiss,notna.selected.khanmiss)
dfsizes<-c(82,74,79,65) #arbitrarily selected, adds up to 300
khanmiss.list<-split(khanmiss.selected,rep(letters[1:4],dfsizes))

Which finally gives a list somewhat similar to my dataset.

Was it helpful?

Solution

something along these untested lines might work:

apply(dfrm, 1, function(r) { r[ (which(is.na(r))[1]-1):(which(is.na(r))[1]-2)) ] } )

My usual ways of load text output as dataframes are failing for this example so my suggestion is to post dput output rather than screen scrapings. (It also appears to me that you should have done your data entry with header =TRUE since your first row of data does not look like data.)

With the new data (and realizing the need to test for no NA's:

 apply(hierlist, 1, function(r) { r[ 
                      if( any(is.na(r))){ 
                          (which(is.na(r))[1]-1):( which(is.na(r))[1]-2) 
                       }else{
                          (length(r)-2): (length(r)-1)}
                                      ] }
       )
#--------------------------------------
     2         3          4                5                   
[1,] "194"     "domain"   "phylum"         "class"             
[2,] "no rank" "Bacteria" "Proteobacteria" "Betaproteobacteria"
     6                    7                    8                   
[1,] "Betaproteobacteria" "Betaproteobacteria" "Betaproteobacteria"
[2,] "class"              "class"              "class" 

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top