R - Average columns by information in row X

https://stackoverflow.com/questions/22390094

14-06-2023
|

Question

I have a data.frame where the first 13 rows contain site/observation information. Each column represents 1 individual, however most individuals have an A and B observation (although some only have A while a few have an A, B, and C observation). I'd like to average each row for every individual, and create a new data.frame from this information.

Example (small subset with row 1, row 7, row 13, and row 56-61):

      OriginalID        Tree003A        Tree003B        Tree008B        Tree013A    
1           Township              LY              LY              LY              LY    
7         COFECHA ID        LY1A003A        LY1A003B        LY1A008B        LY1A013A    
13        PathLength         37.5455         54.8963         57.9732         64.0679    
56              2006           1.538           1.915           0.827           2.722    
57              2007           1.357           1.923           0.854           2.224    
58              2008           1.311           2.204           0.669           2.515    
59              2009           0.702           1.125           0.382           2.413    
60              2010           0.937           1.556           0.907           2.315    
61              2011           0.942           1.268           1.514           1.858

I'd like to create a new data.frame that averages each individual's annual observations, whether they have an A, A and B, or A B and C observation. Individual's IDs are in Row 7 (COFECHA ID):

Intended Output:

      OriginalID           Tree003avg      Tree008avg        Tree013avg    
1           Township              LY              LY              LY    
7         COFECHA ID        LY1A003avg     LY1A008avg        LY1A013avg    
13        PathLength         46.2209         57.9732         64.0679    
56              2006           1.727           0.827           2.722    
57              2007           1.640           0.854           2.224    
58              2008           1.758           0.669           2.515    
59              2009           0.914           0.382           2.413    
60              2010           1.247           0.907           2.315    
61              2011           1.105           1.514           1.858

Any ideas on how to average the columns would be great. I've been trying to modify the following code, but due to the 13 rows of additional information at the top of the data.frame, I didn't know how to specify to only average rows 14:61.

rowMeans(subset(LY011B, select = c("LY1A003A", "LY1A003B")), na.rm=TRUE)

The code for a larger set of the data that I'm working with is:

> dput(LY011B)
structure(list(OriginalTreeID = structure(c(58L, 53L, 57L, 59L, 
51L, 61L, 50L, 55L, 56L, 60L, 54L, 49L, 52L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 
45L, 46L, 47L, 48L), .Label = c("1964", "1965", "1966", "1967", 
"1968", "1969", "1970", "1971", "1972", "1973", "1974", "1975", 
"1976", "1977", "1978", "1979", "1980", "1981", "1982", "1983", 
"1984", "1985", "1986", "1987", "1988", "1989", "1990", "1991", 
"1992", "1993", "1994", "1995", "1996", "1997", "1998", "1999", 
"2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", 
"2008", "2009", "2010", "2011", "AnalysisDateTime", "COFECHA ID", 
"CoreLetter", "PathLength", "Plot#", "RingCount", "SiteID", "SP", 
"Subplot#", "Township", "Tree#", "YearLastRing", "YearLastWhiteWood"
), class = "factor"), Tree003A = structure(c(35L, 8L, 34L, 7L, 
34L, 21L, 36L, 31L, 37L, 30L, 32L, 29L, 33L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 23L, 22L, 25L, 28L, 27L, 24L, 26L, 20L, 16L, 
15L, 6L, 18L, 12L, 10L, 3L, 9L, 11L, 19L, 17L, 14L, 13L, 2L, 
4L, 5L), .Label = c("", "0.702", "0.803", "0.937", "0.942", "0.961", 
"003", "1", "1.09", "1.116", "1.124", "1.224", "1.311", "1.357", 
"1.471", "1.509", "1.538", "1.649", "1.679", "1.782", "1999", 
"2.084", "2.148", "2.162", "2.214", "2.313", "2.429", "2.848", 
"2/19/2014 11:06", "2011", "23017323011sp1", "24", "37.5455", 
"A", "LY", "LY1A003A", "sp1"), class = "factor"), Tree003B = structure(c(56L, 
19L, 54L, 18L, 55L, 49L, 57L, 51L, 58L, 50L, 52L, 48L, 53L, 1L, 
1L, 1L, 1L, 10L, 7L, 8L, 6L, 5L, 4L, 3L, 2L, 11L, 9L, 30L, 15L, 
24L, 20L, 23L, 33L, 37L, 42L, 13L, 44L, 36L, 12L, 16L, 21L, 27L, 
35L, 41L, 38L, 26L, 40L, 14L, 46L, 32L, 28L, 17L, 31L, 22L, 39L, 
43L, 45L, 47L, 25L, 34L, 29L), .Label = c("", "0.073", "0.092", 
"0.173", "0.174", "0.358", "0.413", "0.425", "0.58", "0.697", 
"0.719", "0.843", "0.883", "0.896", "0.937", "0.941", "0.964", 
"003", "1", "1.048", "1.067", "1.075", "1.097", "1.119", "1.125", 
"1.176", "1.207", "1.267", "1.268", "1.27", "1.297", "1.402", 
"1.429", "1.556", "1.662", "1.693", "1.704", "1.735", "1.76", 
"1.792", "1.816", "1.881", "1.915", "1.92", "1.923", "2.155", 
"2.204", "2/19/2014 11:06", "2000", "2011", "23017323011sp1", 
"48", "54.8963", "A", "B", "LY", "LY1A003B", "sp1"), class = "factor"), 
    Tree008B = structure(c(59L, 24L, 57L, 23L, 58L, 52L, 60L, 
    54L, 61L, 53L, 55L, 51L, 56L, 19L, 14L, 13L, 22L, 7L, 8L, 
    9L, 4L, 6L, 3L, 1L, 2L, 10L, 25L, 47L, 43L, 49L, 46L, 40L, 
    50L, 48L, 44L, 17L, 36L, 31L, 27L, 30L, 39L, 37L, 34L, 45L, 
    38L, 32L, 41L, 29L, 42L, 33L, 28L, 26L, 21L, 11L, 15L, 16L, 
    18L, 12L, 5L, 20L, 35L), .Label = c("0.302", "0.31", "0.318", 
    "0.357", "0.382", "0.412", "0.452", "0.476", "0.5", "0.539", 
    "0.591", "0.669", "0.673", "0.787", "0.79", "0.827", "0.835", 
    "0.854", "0.879", "0.907", "0.917", "0.967", "008", "1", 
    "1.027", "1.037", "1.141", "1.152", "1.172", "1.263", "1.383", 
    "1.411", "1.446", "1.498", "1.514", "1.611", "1.671", "1.685", 
    "1.695", "1.719", "1.783", "1.879", "1.884", "1.927", "1.97", 
    "2.019", "2.069", "2.35", "2.696", "2.979", "2/19/2014 11:06", 
    "2000", "2011", "23017323011sp1", "48", "57.9732", "A", "B", 
    "LY", "LY1A008B", "sp1"), class = "factor"), Tree013A = structure(c(45L, 
    6L, 44L, 5L, 44L, 38L, 46L, 40L, 47L, 39L, 42L, 37L, 43L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 10L, 
    13L, 8L, 22L, 14L, 18L, 24L, 4L, 11L, 25L, 7L, 36L, 41L, 
    33L, 29L, 17L, 28L, 23L, 21L, 16L, 26L, 15L, 3L, 20L, 12L, 
    2L, 9L, 34L, 35L, 27L, 32L, 31L, 30L, 19L), .Label = c("", 
    "0.608", "0.916", "0.945", "013", "1", "1.125", "1.18", "1.388", 
    "1.423", "1.493", "1.498", "1.554", "1.579", "1.619", "1.629", 
    "1.719", "1.756", "1.858", "1.867", "1.869", "1.876", "1.9", 
    "1.916", "2.023", "2.089", "2.224", "2.246", "2.247", "2.315", 
    "2.413", "2.515", "2.547", "2.645", "2.722", "2.785", "2/19/2014 11:11", 
    "2002", "2011", "23017323011sp1", "3.375", "34", "64.0679", 
    "A", "LY", "LY1A013A", "sp1"), class = "factor")), .Names = c("OriginalTreeID", 
"Tree003A", "Tree003B", "Tree008B", "Tree013A"), row.names = c(NA, 
61L), class = "data.frame")

La solution 2

try this.....

d.f <- your data structure...above

subset the data

d.f <- d[-(1:13), -1]

c.n <- colnames(d.f)

build the grouping var

~~f <- gsub(".?$", "", c.n)~~

f <- d[4, 2:ncol(d)]

split the dataframeinto sub-dataframes

d.f <- apply(d.f, 2, as.numeric)
d.f[is.na(d.f)] <- 0
d.f.g <- as.data.frame(t(d.f))
a <- split(d.f.g, f)

calculate the groupwise averages as colMeans (because transposed)

grp.means <- lapply(a, colMeans)

the grp.means is a list of dataframes each containing the date averages for each grp. re-form this as required, you'll probably want to transpose again.

Autres conseils

Here is another approach where most of the work is done by rearranging the data with the reshape package. After the data is "munged", it can be rearranged into almost anything you want with the cast function.

# I'm used to the transpose
y = t(x)

# Make the first row the column names
# Also get rid of hashes. They make things difficult
library(stringr)
colnames(y) = str_replace( y[1,], "#", "" )
y = data.frame(y[-1,],check.names=FALSE)

# reshape the data by defining the "ID" variables
library(reshape)
z = melt(y,id.vars=c("Township","Plot","Subplot","Tree",
                     "CoreLetter","COFECHA ID","SiteID","SP","AnalysisDateTime"))
z$value = as.numeric(as.character(z$value))

# Now you can do lots of things!
# All the info you wanted is there, but it's in a different format
# than your "intended output"
cast( z, Tree ~ variable, mean, na.rm=TRUE )

# To get to your "intended output"
out = cast( z, Township + Plot + Subplot + Tree ~ variable, mean, na.rm=TRUE )
out[["COFECHA ID"]] = with(out,paste0(Township,Plot,Subplot,Tree,"avg"))
out2 = out[,c(1,ncol(out),8:(ncol(out)-1))]
out3 = cbind(colnames(out2),t(out2))
colnames(out3) = c("OriginalID",paste0("Tree",out$Tree,"avg"))

# For kicks, here are some other things. Have fun!
cast(z, Tree ~ variable, median, na.rm=TRUE ) # the median instead of the mean
cast(z, Tree + CoreLetter ~ variable ) # back to your original data
cast(z, CoreLetter ~ variable, length ) # How many measurements from each core?
cast(z, CoreLetter ~ variable, mean ) # The average across different cores

For even more fun!

library(ggplot2)
d = z[-c(1:16), ] # A not so pretty hack
colnames(d)[10] = "Year"
d$Year = as.integer(as.character(d$Year))
ggplot(d, aes(x=Year, y=value, group=Tree, color=Tree, shape=CoreLetter)) + 
  geom_point() + geom_smooth(method="loess",span=0.3)

enter image description here

Does this mean that early 2000's were dry?

Licencié sous: CC-BY-SA avec attribution

Non affilié à StackOverflow