You could also try `ave`:
# For each individual within a group, count the rows where PropertyType is 1.
v1 <- with(df, ave(PropertyType, list(GroupId, IndId), FUN = function(x) sum(x == 1)))
# Within each group, check that every individual has at least one
# PropertyType == 1 row. Using > 0 rather than == 1 means an individual
# holding *several* type-1 properties is not wrongly invalidated,
# matching the any()/"> 0" semantics of the dplyr and by solutions.
# The boolean result is coerced to 1 and 0 by ave.
df$ValidGroup <- ave(v1, df$GroupId, FUN = function(x) all(x > 0))
# GroupId IndId IndGroupProperty PropertyType ValidGroup
# 1 1 1 1 1 1
# 2 1 1 2 2 1
# 3 1 2 1 1 1
# 4 1 2 2 2 1
# 5 2 3 3 2 0
# 6 2 4 3 2 0
# 7 2 4 4 1 0
# 8 3 5 5 2 0
# 9 3 5 6 2 0
**Edit:** Added a `dplyr` alternative and benchmarks for data sets of different sizes: the original data, and data that are 10 and 100 times larger than the original.
First wrap up the alternatives in functions:
fun_ave <- function(df){
  # For each (GroupId, IndId) pair, count the rows with PropertyType == 1.
  v1 <- with(df, ave(PropertyType, list(GroupId, IndId), FUN = function(x) sum(x == 1)))
  # A group is valid when every individual in it has at least one type-1
  # property. Using > 0 (instead of == 1) keeps the result consistent with
  # fun_dp/fun_by when an individual holds several type-1 properties.
  df$ValidGroup <- ave(v1, list(df$GroupId), FUN = function(x) all(x > 0))
  df
}
library(dplyr)
fun_dp <- function(df){
  # `%.%` was removed from dplyr long ago; use the magrittr pipe instead.
  df %>%
    group_by(GroupId, IndId) %>%
    # TRUE when this individual has at least one type-1 property.
    mutate(type1 = any(PropertyType == 1)) %>%
    # Regroup by GroupId only; this replaces the deprecated `add = FALSE`
    # (dropping existing groups is the default behaviour of group_by()).
    group_by(GroupId) %>%
    # Group is valid when every individual has a type-1 property;
    # `* 1` coerces the logical to 0/1 to match the other solutions.
    mutate(ValidGroup = all(type1) * 1) %>%
    select(-type1) %>%
    ungroup()
}
fun_by <- function(df){
  # Per group: TRUE when every individual in the group has at least one
  # row with PropertyType == 1, FALSE otherwise.
  group_ok <- by(df, df$GroupId, function(grp) {
    counts <- table(grp$IndId, grp$PropertyType)
    if ("1" %in% colnames(counts)) {
      all(counts[, "1"] > 0)  # every IndId has a type-1 entry
    } else {
      FALSE                   # no type-1 property anywhere in this group
    }
  })
  # Map the per-group flag back onto the rows, coded as integer 0/1.
  cbind(df, ValidGroup = as.integer(group_ok[as.character(df$GroupId)]))
}
Benchmarks
Original data:
# NOTE(review): requires library(microbenchmark) to be loaded first.
# Times the three alternatives on the original (tiny) data set `df`.
microbenchmark(
fun_ave(df),
fun_dp(df),
fun_by(df))
# Unit: microseconds
# expr min lq median uq max neval
# fun_ave(df) 497.964 519.8215 538.8275 563.5355 651.535 100
# fun_dp(df) 851.861 870.6765 931.1170 968.5590 1760.360 100
# fun_by(df) 1343.743 1412.5455 1464.6225 1581.8915 12588.607 100
On a tiny data set, `ave` is about twice as fast as `dplyr` and more than 2.5 times faster than `by`.
Generate some larger data; 10 times the number of groups and individuals
# 10x data: 30 groups, 50 individuals, 100 rows.
set.seed(42)  # fix the RNG so the benchmark data (and timings) are reproducible
GroupId <- sample(1:30, 100, replace = TRUE)
IndId <- sample(1:50, 100, replace = TRUE)
PropertyType <- sample(1:2, 100, replace = TRUE)
df2 <- data.frame(GroupId, IndId, PropertyType)
# NOTE(review): requires library(microbenchmark) to be loaded first.
# Times the three alternatives on the 10x-larger data set `df2`.
microbenchmark(
fun_ave(df2),
fun_dp(df2),
fun_by(df2))
# Unit: milliseconds
# expr min lq median uq max neval
# fun_ave(df2) 2.928865 3.185259 3.270978 3.435002 5.151457 100
# fun_dp(df2) 1.079176 1.231226 1.273610 1.352866 2.717896 100
# fun_by(df2) 9.464359 9.855317 10.137180 10.484994 12.445680 100
`dplyr` is three times faster than `ave` and nearly 10 times faster than `by`.
100 times the number of groups and individuals
# 100x data: 300 groups, 500 individuals, 1000 rows.
set.seed(42)  # fix the RNG so the benchmark data (and timings) are reproducible
GroupId <- sample(1:300, 1000, replace = TRUE)
IndId <- sample(1:500, 1000, replace = TRUE)
PropertyType <- sample(1:2, 1000, replace = TRUE)
df2 <- data.frame(GroupId, IndId, PropertyType)
# NOTE(review): requires library(microbenchmark) to be loaded first.
# Times the three alternatives on the 100x-larger data set `df2`.
microbenchmark(
fun_ave(df2),
fun_dp(df2),
fun_by(df2))
# Unit: milliseconds
# expr min lq median uq max neval
# fun_ave(df2) 337.889895 392.983915 413.37554 441.58179 549.5516 100
# fun_dp(df2) 3.253872 3.477195 3.58173 3.73378 75.8730 100
# fun_by(df2) 92.248791 102.122733 104.09577 109.99285 186.6829 100
`ave` is really losing ground now. `dplyr` is nearly 30 times faster than `by`, and more than 100 times faster than `ave`.