A perfect use for `ddply` from `plyr`. Split the data.frame up into subsets (by `Gene_label`) and operate on each piece (find the `snp` which relates to the max of `minus_logpval`):
## Reproducible example data: every gene (a-c) paired with every SNP (e-g),
## each pair given a random score.
set.seed(1234)  # fixed seed so the values printed below can be regenerated
df <- data.frame(
  Gene_label = rep(letters[1:3], times = 3),  # a b c a b c a b c
  snp = rep(letters[5:7], each = 3),          # e e e f f f g g g
  minus_logpval = rnorm(n = 9)
)
df
# Gene_label snp minus_logpval
#1 a e -1.2070657
#2 b e 0.2774292
#3 c e 1.0844412
#4 a f -2.3456977
#5 b f 0.4291247
#6 c f 0.5060559
#7 a g -0.5747400
#8 b g -0.5466319
#9 c g -0.5644520
## And a single call using 'ddply'
## library() stops immediately if plyr is not installed; require() would only
## warn and return FALSE, deferring the failure to the ddply() call below.
library(plyr)
## Split df by Gene_label and, within each gene, keep the snp on the row with
## the largest minus_logpval. which.max() returns the index of the first
## maximum, so a tie resolves to the earliest row of that gene.
ddply(
  df,
  .(Gene_label),
  summarise,
  SNP = snp[which.max(minus_logpval)]
)
# Gene_label SNP
#1 a g
#2 b f
#3 c e