Although not "Rcpp" as you asked, here is an alternative using .Call
, that seems valid:
library(inline)
ff = cfunction(sig = c(R_mat = "matrix", R_n = "numeric"), body = '
SEXP mat, dims, ans;
PROTECT(mat = coerceVector(R_mat, REALSXP)); //or `as.numeric(.)` in R
PROTECT(dims = getAttrib(R_mat, R_DimSymbol));
R_len_t rows = INTEGER(dims)[0], cols = INTEGER(dims)[1];
R_len_t n = INTEGER(coerceVector(R_n, INTSXP))[0];
R_len_t *buf = (int *) R_alloc(rows, sizeof(int)), b = 0; //dynamic allocation
//of a pointer to store
//the indices of rows
//that match the criterion.
//a classic usage of this
//is in do_which (summary.c)
double *pmat = REAL(mat); //pointer to the matrix input
for(int ir = 0; ir < rows; ir++) {
R_len_t COUNT_CONS_NAS = 0;
for(int ic = 0; ic < cols; ic++) {
if(ISNAN(pmat[ir + ic*rows])) { //if NA is found
COUNT_CONS_NAS++; //start counting NAs
if(COUNT_CONS_NAS == n) break; //no need to search all columns
}
else {
COUNT_CONS_NAS = 0; //if not NA, counter back to zero
}
}
if(COUNT_CONS_NAS == n) { //if the specific row matched the criterion
buf[b] = ir + 1; //store its index
b++;
}
}
PROTECT(ans = allocVector(INTSXP, b)); //allocate a vector with
//length = No rows that matched criterion
memcpy(INTEGER(ans), buf, sizeof(int)*b); //copy rows indices to
//the pointer of ans
UNPROTECT(3);
return(ans);
')
set.seed(11);mat = matrix(sample(c(NA, 0:2), 30, T, prob = c(0.7, 0.1, 0.1, 0.1)), 6)
mat
# [,1] [,2] [,3] [,4] [,5]
#[1,] NA NA 0 NA NA
#[2,] NA NA 2 NA NA
#[3,] NA 2 1 NA NA
#[4,] NA NA NA NA NA
#[5,] NA NA NA NA NA
#[6,] 0 NA NA NA NA
ff(mat, 3)
#[1] 4 5 6
mat[-ff(mat, 3),]
# [,1] [,2] [,3] [,4] [,5]
#[1,] NA NA 0 NA NA
#[2,] NA NA 2 NA NA
#[3,] NA 2 1 NA NA
And some benchmarking:
#library(Rcpp) ; sourceCpp("~/ffcpp.cpp")
identical(dat[!cons.missings(dat,3), ], dat[cons_missings(is.na(dat),3), ])
#[1] TRUE
identical(dat[!cons.missings(dat,3), ], dat[-ff(dat, 4), ])
#[1] TRUE
library(microbenchmark)
microbenchmark(dat[!cons.missings(dat,3), ],
dat[cons_missings(is.na(dat),3), ],
dat[-ff(dat, 4), ], times = 10)
#Unit: milliseconds
expr min lq median uq max neval
#dat[!cons.missings(dat, 3), ] 3628.960362 3674.531704 3777.270890 3840.79075 3885.58709 10
#dat[cons_missings(is.na(dat), 3), ] 5256.550903 5267.137257 5325.497516 5365.13947 5554.88023 10
#dat[-ff(dat, 4), ] 6.444897 7.749669 8.971304 11.35649 58.94499 10
#the rows that each function will remove
resff <- ff(dat, 4)
rescons.mis <- which(cons.missings(dat,3))
rescons_mis <- seq_len(nrow(dat))[-cons_missings(is.na(dat),3)]
sum(resff != rescons.mis)
#[1] 0
sum(resff != rescons_mis)
#[1] 0
sum(rescons_mis != rescons.mis)
#[1] 0
length(resff)
#[1] 5671
length(rescons.mis)
#[1] 5671
length(rescons_mis)
#[1] 5671