New variable based on conditional arithmetic by group

https://stackoverflow.com/questions/22667496

21-06-2023
|

Question

I have a data.frame df in which I want to create a new variable that is the proportion of another variable by group. That is, for each Species/ID/Plot/Sub pairing, I'd like to find the proportion of Area by Type. If Type equals 0, then PropArea = 1; if Type does not equal 0 (i.e. it is 1 or 2), then PropArea is that row's Area divided by the Type 0 Area of the same group, for example PropArea = Area (Type 1) / Area (Type 0). A sample data.frame is below. I know how to do this with IF statements in Excel, but was hoping to find a way to do this within R.

# Sample data: 14 rows keyed by Species / ID / Plot / Sub / Type, with the
# measured Area and the expected result column PropArea.
df <- data.frame(
  # Factor levels are listed explicitly so "BFGR" stays level 1 and "RNNN"
  # level 2, even though the RNNN rows come first.
  Species = factor(
    rep(c("RNNN", "BFGR"), times = c(6L, 8L)),
    levels = c("BFGR", "RNNN")
  ),
  ID = rep(c(201L, 219L, 220L), times = c(6L, 6L, 2L)),
  Plot = c(1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L),
  # Rows come in consecutive pairs that share the same Sub value.
  Sub = rep(c(2L, 2L, 3L, 10L, 11L, 12L, 13L), each = 2L),
  Type = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 2L),
  Area = c(
    0.78, 0.445, 0.023, 0.015, 0.79, 0.235, 1.29,
    1.29, 2.555, 1.065, 1.365, 1.365, 2.678, 1.305
  ),
  PropArea = c(
    1, 0.570512821, 1, 0.652173913, 1, 0.297468354, 1,
    1, 1, 0.416829746, 1, 1, 1, 0.487303958
  )
)

## A more complete data set.
## 111 rows with columns Species, ID, Plot, Sub, Type and Area; unlike `df`
## there is no PropArea column, so it has to be computed.
## Species is a factor with six levels (ACRU, DIVI, LIST, LITU, PEPA, QULA),
## of which only DIVI, LIST and QULA (codes 2, 3, 6) occur in the rows.
## Not every Species/ID/Plot/Sub group has exactly one row per Type:
##   - some groups repeat a Type (e.g. DIVI / 222 / 1 / 2 has two Type 0 rows),
##   - some groups have no Type 0 row (e.g. DIVI / 229 / 1 / 5 has Types 1, 2).
 df_more <- structure(list(Species = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 
6L, 6L, 6L, 6L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ACRU", "DIVI", 
"LIST", "LITU", "PEPA", "QULA"), class = "factor"), ID = c(205L, 
205L, 205L, 205L, 205L, 205L, 219L, 219L, 219L, 219L, 219L, 219L, 
219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 
219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 219L, 
219L, 219L, 221L, 221L, 222L, 222L, 222L, 222L, 222L, 222L, 222L, 
222L, 222L, 222L, 222L, 222L, 222L, 222L, 222L, 222L, 222L, 222L, 
222L, 222L, 222L, 222L, 222L, 222L, 227L, 227L, 227L, 227L, 227L, 
227L, 227L, 227L, 227L, 227L, 227L, 227L, 228L, 228L, 228L, 228L, 
228L, 228L, 228L, 228L, 228L, 228L, 228L, 228L, 228L, 228L, 228L, 
228L, 228L, 228L, 228L, 228L, 228L, 228L, 228L, 229L, 229L, 229L, 
229L, 229L, 229L, 229L, 229L, 229L, 229L, 229L, 229L, 229L, 229L
), Plot = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), Sub = c(2L, 2L, 3L, 3L, 4L, 4L, 2L, 2L, 2L, 3L, 3L, 
3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 
10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 2L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 
6L, 6L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 7L, 8L, 
8L, 8L, 9L, 9L, 9L, 10L, 10L, 11L, 11L, 2L, 2L, 2L, 3L, 3L, 3L, 
4L, 4L, 5L, 5L, 6L, 6L, 6L, 7L), Type = c(0L, 1L, 0L, 1L, 0L, 
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 0L, 1L, 0L, 
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
1L, 0L, 0L, 1L, 1L, 2L, 2L, 0L, 0L, 1L, 1L, 2L, 2L, 0L, 0L, 1L, 
1L, 2L, 2L, 0L, 0L, 1L, 1L, 2L, 2L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 
1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 2L, 
0L, 1L, 2L, 0L, 1L, 2L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 2L, 0L, 
1L, 2L, 0L, 1L, 1L, 2L, 0L, 1L, 2L, 0L), Area = c(5.67, 3.24, 
6.65, 4.26, 10.24, 1.31, 1.12, 1.23, 1.23, 0.88, 0.86, 0.86, 
0.11, 1.36, 1.36, 1.17, 2.33, 2.33, 1.15, 1.15, 1.23, 1.23, 1.27, 
1.27, 0.97, 0.97, 1.39, 1.39, 1.07, 1.07, 1.49, 1.49, 1.33, 1.33, 
2.35, 2.35, 1.8, 1.8, 7.5, 7.42, 6.35, 6.82, 0.37, 0.48, 8.67, 
8.57, 5.47, 5.66, 2.35, 2.42, 11.99, 12.8, 6.18, 6.19, 2.56, 
2.71, 25.77, 25.6, 16.01, 16.56, 3.36, 3.35, 1.08, 0.12, 5.34, 
5.34, 6.15, 6.15, 6.93, 6.93, 8.91, 8.91, 10.91, 10.91, 2.31, 
1.21, 3.2, 2.42, 2.41, 2.41, 2.32, 2.32, 2.48, 2.48, 0.7, 2.89, 
2.89, 1.27, 3.66, 3.66, 0.75, 8, 8, 8.85, 8.85, 11.22, 11.22, 
5.08, 2.96, 0.22, 5, 3.01, 0.92, 6.94, 3.88, 4.48, 1.18, 9.03, 
4.19, 0.5, 9.97)), .Names = c("Species", "ID", "Plot", "Sub", 
"Type", "Area"), row.names = c(NA, 111L), class = "data.frame")

Solution

As long as you're OK with your data.frame being re-sorted, this should work:

library(plyr)
# For every Species / ID / Plot / Sub group, add PropArea: each row's Area
# divided by the group's reference Area.
df2 <- ddply(df_more, .(Species, ID, Plot, Sub), function(grp) {
  # The reference is the Area of the group's Type 0 row(s).
  ref_area <- grp$Area[grp$Type == 0]
  if (length(ref_area) == 0) {
    # No Type 0 row in this group: use the Type 1 row(s) as the reference.
    ref_area <- grp$Area[grp$Type == 1]
  }
  # NOTE(review): if a group holds several reference rows, ref_area has
  # length > 1 and is recycled across the group's rows by the division.
  transform(grp, PropArea = Area / ref_area)
})

And if you want to keep the same ordering, add these lines:

# Put the rows of df2 back into the row order of df_more, the data frame df2
# was computed from. (Matching against `df` instead finds no common keys and
# yields only NA rows.)
df1 <- local({
  key_cols <- c("Species", "ID", "Plot", "Sub", "Type")

  # Build one string key per row. Because df_more contains repeated
  # Species/ID/Plot/Sub/Type combinations, the k-th occurrence of a key gets
  # the suffix k; otherwise match() would map every repeat to the first hit.
  # Assumes ddply() keeps rows in their original relative order within each
  # group, so the k-th occurrence in df_more is the k-th occurrence in df2.
  row_key <- function(d) {
    key <- as.character(interaction(d[key_cols]))
    occurrence <- ave(seq_along(key), key, FUN = seq_along)
    paste(key, occurrence)
  }

  df2[match(row_key(df_more), row_key(df2)), ]
})

OTHER TIPS

If you can guarantee alternation of 0s with 1s and 2s like in your example, you could use ifelse:

# Relies on every non-zero Type row directly following its Type 0 row:
# Type 0 rows get 1, every other row gets its Area divided by the Area of
# the row immediately above it (the leading 1 pads the first row).
df$PropArea <- with(
  df,
  ifelse(Type == 0, 1, Area / c(1, head(Area, -1)))
)

There are duplicates in the df_more dataset. E.g. DIVI/222/1/2/0 has an area of both 7.50 and 7.42. This will lead to errors.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow