Question

I have a huge data.frame with several NA values in it. It seems that I get problems, if many NA values occur sequently.

Is there an easy way to find those rows in which NA values occur e.g. 20 times one after another, but not the ones where 20 NA values occur isolated?

EDIT (added by agstudy)

The accepted solution uses apply which is not very efficient for hudge matrix. So I edit the solution (I add the Rcpp tag) to ask for more efficient solution.

Was it helpful?

Solution

You can create a function anlagous to complete.cases that computes consecutive missings values using rle:

cons.missings <- 
function(dat,n)
apply(is.na(dat),1,function(x){
  yy <- rle(x)
  any(yy$lengths[yy$values]>n)
})

Then to keep only good rows:

dat[!cons.missings(dat,20),]

Example with 4 connectives missings values:

dat <- as.matrix(t(data.frame(a= c(1,rep(NA,4),5),
           b= c(2,rep(NA,2),1,rep(NA,2)))))

 [,1] [,2] [,3] [,4] [,5] [,6]
a    1   NA   NA   NA   NA    5
b    2   NA   NA    1   NA   NA

dat[!cons.missings(dat,3),]
[1]  2 NA NA  1 NA NA

OTHER TIPS

Although not "Rcpp" as you asked, here is an alternative using .Call, that seems valid:

library(inline)

ff = cfunction(sig = c(R_mat = "matrix", R_n = "numeric"), body = '
    SEXP mat, dims, ans;

    PROTECT(mat = coerceVector(R_mat, REALSXP)); //or `as.numeric(.)` in R
    PROTECT(dims = getAttrib(R_mat, R_DimSymbol));
    R_len_t rows = INTEGER(dims)[0], cols = INTEGER(dims)[1];
    R_len_t n = INTEGER(coerceVector(R_n, INTSXP))[0];

    R_len_t *buf = (int *) R_alloc(rows, sizeof(int)), b = 0; //dynamic allocation 
                                                             //of a pointer to store 
                                                             //the indices of rows
                                                          //that match the criterion.
                                                           //a classic usage of this
                                                         //is in do_which (summary.c)

    double *pmat = REAL(mat);  //pointer to the matrix input
    for(int ir = 0; ir < rows; ir++) {
       R_len_t COUNT_CONS_NAS = 0;
       for(int ic = 0; ic < cols; ic++) {
           if(ISNAN(pmat[ir + ic*rows])) { //if NA is found
               COUNT_CONS_NAS++;          //start counting NAs  
               if(COUNT_CONS_NAS == n) break;  //no need to search all columns
           }
           else {
               COUNT_CONS_NAS = 0; //if not NA, counter back to zero 
           }
       }
       if(COUNT_CONS_NAS == n) {  //if the specific row matched the criterion
           buf[b] = ir + 1;   //store its index
           b++;
       }
    }

    PROTECT(ans = allocVector(INTSXP, b));  //allocate a vector with 
                                           //length = No rows that matched criterion
    memcpy(INTEGER(ans), buf, sizeof(int)*b);  //copy rows indices to 
                                               //the pointer of ans

    UNPROTECT(3);

    return(ans);
')


set.seed(11);mat = matrix(sample(c(NA, 0:2), 30, T, prob = c(0.7, 0.1, 0.1, 0.1)), 6)
mat
#     [,1] [,2] [,3] [,4] [,5]
#[1,]   NA   NA    0   NA   NA
#[2,]   NA   NA    2   NA   NA
#[3,]   NA    2    1   NA   NA
#[4,]   NA   NA   NA   NA   NA
#[5,]   NA   NA   NA   NA   NA
#[6,]    0   NA   NA   NA   NA
ff(mat, 3)
#[1] 4 5 6    
mat[-ff(mat, 3),]      
#     [,1] [,2] [,3] [,4] [,5]
#[1,]   NA   NA    0   NA   NA
#[2,]   NA   NA    2   NA   NA
#[3,]   NA    2    1   NA   NA

And some benchmarking:

#library(Rcpp) ; sourceCpp("~/ffcpp.cpp")
identical(dat[!cons.missings(dat,3), ], dat[cons_missings(is.na(dat),3), ])
#[1] TRUE
identical(dat[!cons.missings(dat,3), ], dat[-ff(dat, 4), ])
#[1] TRUE
library(microbenchmark)
microbenchmark(dat[!cons.missings(dat,3), ], 
               dat[cons_missings(is.na(dat),3), ],
               dat[-ff(dat, 4), ], times = 10)
#Unit: milliseconds
                                expr         min          lq      median         uq        max neval
       #dat[!cons.missings(dat, 3), ] 3628.960362 3674.531704 3777.270890 3840.79075 3885.58709    10
 #dat[cons_missings(is.na(dat), 3), ] 5256.550903 5267.137257 5325.497516 5365.13947 5554.88023    10
                  #dat[-ff(dat, 4), ]    6.444897    7.749669    8.971304   11.35649   58.94499    10

#the rows that each function will remove
resff <- ff(dat, 4)
rescons.mis <- which(cons.missings(dat,3)) 
rescons_mis <- seq_len(nrow(dat))[-cons_missings(is.na(dat),3)]

sum(resff != rescons.mis)
#[1] 0
sum(resff != rescons_mis)
#[1] 0
sum(rescons_mis != rescons.mis)
#[1] 0
length(resff)
#[1] 5671
length(rescons.mis)
#[1] 5671
length(rescons_mis)
#[1] 5671

I add another answer using Rcpp since OP is using big matrix. I am not an Rcpp proficient so I can't get better solution even I think I tried to implement an efficient rle missings algorithms.

#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
bool maxMissingSequence(IntegerVector x,int n) {

  // Initialise first value
  int lmissings = 1;
  double prev = x[0];
  for(IntegerVector::iterator it = x.begin() + 1; it != x.end(); ++it) {
    if (prev == *it && prev==1)lmissings++;
    if(lmissings >n) break;   // we are OK
    if(*it==0) lmissings =1;  // reset counter
    prev = *it;
  }
  return lmissings >n;
}

// [[Rcpp::export]]
IntegerVector cons_missings(IntegerMatrix Im, int n ){
   IntegerVector res ; 
   int nrows = Im.nrow();
   for (int i = 0; i < nrows; i++)
      if(!maxMissingSequence(Im(i,_),n))
         res.push_back(i+1);
  return res;
}

Benchmarkings

set.seed(2)
N <- 3*1e5
dat <- matrix(sample(c(1,NA),N,replace=TRUE),ncol=5)

cons.missings <- 
function(dat,n)
apply(is.na(dat),1,function(x){
  yy <- rle(x)
  any(yy$lengths[yy$values]>n)
})


identical(dat[!cons.missings(dat,3),],dat[cons_missings(is.na(dat),3),])
[1] TRUE

system.time(dat[!cons.missings(dat,3),])
   user  system elapsed 
   4.24    0.02    4.35 

> system.time(dat[cons_missings(is.na(dat),3),])
   user  system elapsed 
   6.34    0.00    6.48 
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top