Question

I'm working on a data set where the source name is specified by a 2-letter abbreviation in front of the variable. So all variables from source AA start with AA_var1, and source bb has bb_variable_name_2. There are actually a lot of sources, and a lot of variable names, but I leave only 2 as a minimal example.

I want to create a mean variable for any row where the number of sources, that is, where the number of unique prefixes for which the data on that row is not NA, is greater than 1. If there's only one source, I want that total variable to be NA.

So, for example, my data looks like this:

> head(df)
  AA_var1 AA_var2   myid   bb_meow bb_A_v1
1      NA      NA 123456        10      12
2      NA      10 194200        12      NA
3      12      10 132200        NA      NA
4      12      NA 132201        NA      12
5      NA      NA 132202        NA      NA
6      12      13 132203        14      NA

And I want the following:

> head(df)
  AA_var1 AA_var2   myid   bb_meow bb_A_v1  rowMeanIfDiverseData
1      NA      NA 123456        10      12                    NA #has only bb
2      NA      10 194200        12      NA                    11 #has AA and bb
3      12      10 132200        NA      NA                    NA #has only AA
4      12      NA 132201        NA      12                    12 #has AA and bb
5      NA      NA 132202        NA      NA                    NA #has neither
6      12      13 132203        14      NA                    13 #has AA and bb

Normally, I just use rowMeans() for this kind of thing. But the additional subsetting of selecting only rows whose variable names follow a convention /at the row level/ has caught me confused between the item-level and the general apply-level statements I'm used to.

I can get the prefixes at the dataframe level:

mynames <- names(df[!names(df) %in% c("myid")])
tmp <- str_extract(mynames, perl("[A-Za-z]{2}(?=_)"))
uniq <- unique(tmp[!is.na(tmp)])

So,

> uniq
[1] "AA" "bb"

So, I can make this a function I can apply to df like so:

multiSource <- function(x){
    nm = names(x[!names(x) %in% badnames])           # exclude c("myid")
    tmp <- str_extract(nm, perl("[A-Za-z]{2}(?=_)")) # get prefixes
    uniq <- unique(tmp[!is.na(tmp)])                 # ensure unique and not NA
    if (length(uniq) > 1){
        return(T)
    } else {
        return(F)
    }
 }

But this is clearly confused, and still getting data-set level, ie:

> lapply(df,multiSource)
$AA_var1
[1] FALSE

$AA_var2
[1] FALSE

$bb_meow
[1] FALSE

$bb_A_v1
[1] FALSE

And...

> apply(df,MARGIN=1,FUN=multiSource)

Gives TRUE for all.

I'd otherwise like to be saying...

df$rowMean <- rowMeans(df, na.rm=T)

# so, in this case
rowMeansIfTest <- function(X,test) {
   # is this row muliSource True?
   # if yes, return(rowMeans(X))
   # else return(NA)
}

df$rowMeanIfDiverseData <- rowMeansIfTest(df, test=multiSource)

But it is unclear to me how to do this without some kind of for loop.

Was it helpful?

Solution

The strategy here is to split the data frame by columns into variable groups, and for each row identifying if there are non-NA values. We then check with rowsums to make sure there are at least two variables with non-NA values for a row, and if so, add the mean of those values with cbind.

This will generalize to any number of columns so long as they are named in the AA_varXXX format, and so long as the only column not in that format is myid. Easy enough to fix if this isn't strictly the case, but these are the limitations on the code as written now.

df.dat <- df[!names(df) == "myid"]
diverse.rows <- rowSums(
  sapply(
    split.default(df.dat, gsub("^([A-Z]{2})_var.*", "\\1", names(df.dat))), 
    function(x) apply(x, 1, function(y) any(!is.na(y)))
) ) > 1
cbind(df, div.mean=ifelse(diverse.rows, rowMeans(df.dat, na.rm=T), NA))

Produces:

  AA_var1 AA_var2   myid BB_var3 BB_var4 div.mean
1      NA      NA 123456      10      12       NA
2      NA      10 194200      12      NA       11
3      12      10 132200      NA      NA       NA
4      12      NA 132201      NA      12       12
5      NA      NA 132202      NA      NA       NA
6      12      13 132203      14      NA       13

OTHER TIPS

This solution seems a little convoluted to me, so there's probably a better way, but it should work for you.

# Here's your data:
df <- data.frame(AA_var1 = c(NA,NA,12,12,NA,12),
                 AA_var2 = c(NA,10,10,NA,NA,13),
                 BB_var3 = c(10,12,NA,NA,NA,14),
                 BB_var4 = c(12,NA,NA,12,NA,NA))

# calculate rowMeans for each subset of variables
a <- rowMeans(df[,grepl('AA',names(df))], na.rm=TRUE)
b <- rowMeans(df[,grepl('BB',names(df))], na.rm=TRUE)

# count non-missing values for each subset of variables
a2 <- rowSums(!is.na(df[,grepl('AA',names(df))]), na.rm=TRUE)
b2 <- rowSums(!is.na(df[,grepl('BB',names(df))]), na.rm=TRUE)

# calculate means:
rowSums(cbind(a*a2,b*b2)) /
    rowSums(!is.na(df[,grepl('[AA]|[BB]',names(df))]), na.rm=TRUE)

Result:

> df$rowMeanIfDiverseData <- rowSums(cbind(a*a2,b*b2)) /
+         rowSums(!is.na(df[,grepl('[AA]|[BB]',names(df))]), na.rm=TRUE)
> df
  AA_var1 AA_var2 BB_var3 BB_var4 rowMeanIfDiverseData
1      NA      NA      10      12                  NaN
2      NA      10      12      NA                   11
3      12      10      NA      NA                  NaN
4      12      NA      NA      12                   12
5      NA      NA      NA      NA                  NaN
6      12      13      14      NA                   13

And a little cleanup to exactly match your intended output:

> df$rowMeanIfDiverseData[is.nan(df$rowMeanIfDiverseData)] <- NA
> df
  AA_var1 AA_var2 BB_var3 BB_var4 rowMeanIfDiverseData
1      NA      NA      10      12                   NA
2      NA      10      12      NA                   11
3      12      10      NA      NA                   NA
4      12      NA      NA      12                   12
5      NA      NA      NA      NA                   NA
6      12      13      14      NA                   13

My attempt, somewhat longwinded.....

dat<-data.frame(AA_var1=c(NA,NA,12,12,NA,12),
                    AA_var2=c(NA,10,10,NA,NA,13),
                    myid=1:6,
                    BB_var3=c(10,12,NA,NA,NA,14),
                    BB_var4=c(12,NA,NA,12,NA,NA))

#what columns are associated with variables used in our mean
varcols<-grep("*var[1-9]",names(dat),value=T)

#which rows have the requisite diversification of non-nulls
#i assume these columns will start with capitals and folloowed by underscore
meanrow<-apply(!is.na(dat[,varcols]),1,function(x){n<-varcols[x]
                              1<length(unique(regmatches(n,regexpr("[A-Z]+_",n))))
                                            })
#do the row mean for all 
dat$meanval<-rowMeans(dat[,varcols],na.rm=T)

#null out for those without diversification (i.e. !meanrow)
dat[!meanrow,"meanval"]<-NA

I think some of the answers are making this seem more complicated than it is. This will do it:

df$means = ifelse(rowSums(!is.na(df[, grep('AA_var', names(df))])) &
                    rowSums(!is.na(df[, grep('BB_var', names(df))])),
                  rowMeans(df[, grep('_var', names(df))], na.rm = T), NA)
#  AA_var1 AA_var2   myid BB_var3 BB_var4 means
#1      NA      NA 123456      10      12    NA
#2      NA      10 194200      12      NA    11
#3      12      10 132200      NA      NA    NA
#4      12      NA 132201      NA      12    12
#5      NA      NA 132202      NA      NA    NA
#6      12      13 132203      14      NA    13

Here's a generalization of the above, given the comment, assuming unique id's (if they're not, create a unique index instead):

library(data.table)
library(reshape2)

dt = data.table(df)
setkey(dt, myid) # not strictly necessary, but makes life easier

# find the conditional
cond = melt(dt, id.var = 'myid')[,
         sum(!is.na(value)), by = list(myid, sub('_var.*', '', variable))][,
         all(V1 != 0), keyby = myid]$V1

# fill in the means (could also do a join, but will rely on ordering instead)
dt[cond, means := rowMeans(.SD, na.rm = T), .SDcols = grep('_var', names(dt))]

dt
#   AA_var1 AA_var2   myid BB_var3 BB_var4 means
#1:      NA      NA 123456      10      12    NA
#2:      12      10 132200      NA      NA    NA
#3:      12      NA 132201      NA      12    12
#4:      NA      NA 132202      NA      NA    NA
#5:      12      13 132203      14      NA    13
#6:      NA      10 194200      12      NA    11
fun <- function(x) {
    MEAN <- mean(c(x[1], x[2], x[4], x[5]), na.rm=TRUE)
    CHECK <- sum(!is.na(c(x[1], x[2]))) > 0 & sum(!is.na(c(x[4], x[5])) > 0)
    MEAN * ifelse(CHECK, 1, NaN)
}
df$rowMeanIfDiverseData <- apply(df, 1, fun)
df

  AA_var1 AA_var2   myid BB_var3 BB_var4 rowMeanIfDiverseData
1      NA      NA 123456      10      12                  NaN
2      NA      10 194200      12      NA                   11
3      12      10 132200      NA      NA                  NaN
4      12      NA 132201      NA      12                   12
5      NA      NA 132202      NA      NA                  NaN
6      12      13 132203      14      NA                   13
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top