With your data
## Toy transcript coordinates. The header line has one field fewer than the
## data rows, so read.table() uses the first column (the transcript IDs) as
## row names and the remaining columns become chr / start / end.
## The literal is passed via `text =` so that read.table() opens and closes
## its own connection; wrapping it in textConnection() would leave that
## connection open after the call.
tx0 <- read.table(header = TRUE, text = "chr start end
NONHSAT000001 chr1 11868 14409
NONHSAT000002 chr1 11871 14412
NONHSAT000003 chr1 11873 14409
NONHSAT000004 chr1 12009 13670
NONHSAT000005 chr1 14777 16668
NONHSAT000006 chr1 15602 29370")
## Toy gene coordinates. The header line has one field fewer than the data
## rows, so read.table() uses the first column (the gene IDs) as row names
## and the remaining columns become chr / start / end.
## The literal is passed via `text =` so that read.table() opens and closes
## its own connection; wrapping it in textConnection() would leave that
## connection open after the call.
gene0 <- read.table(header = TRUE, text = "chr start end
NONHSAG000001 chr1 11869 14412
NONHSAG000002 chr1 14778 29370
NONHSAG000003 chr1 29554 31109
NONHSAG000004 chr1 34554 36081
NONHSAG000005 chr1 36273 50281
NONHSAG000006 chr1 62948 63887")
The GenomicRanges package in Bioconductor does this easily and efficiently (for millions of overlaps).
library(GenomicRanges)
## Turn each coordinate table into a GRanges object: the chromosome column
## supplies the sequence names, start/end supply the ranges.
tx <- GRanges(
  seqnames = tx0$chr,
  ranges = IRanges(start = tx0$start, end = tx0$end)
)
gene <- GRanges(
  seqnames = gene0$chr,
  ranges = IRanges(start = gene0$start, end = gene0$end)
)
## Make every gene 20 wider while keeping its center fixed, i.e. extend it
## by 10 on each side.
gene <- resize(gene, width = width(gene) + 20, fix = "center")
## Overlaps where the 'query' (tx) lies within the 'subject' (gene).
olaps <- findOverlaps(query = tx, subject = gene, type = "within")
Showing, e.g., that 'query' (tx) 1, 2, 3, and 4 are within 'subject' (gene) 1.
> findOverlaps(tx, gene, type="within")
Hits of length 6
queryLength: 6
subjectLength: 6
queryHits subjectHits
<integer> <integer>
1 1 1
2 2 1
3 3 1
4 4 1
5 5 2
6 6 2
and that gene 1 is overlapped by 4 transcripts, gene 2 by 2 transcripts.
> table(subjectHits(olaps))
1 2
4 2
See also this publication. Using the larger data set:
## Build the ranges for the larger data set. Each range is named after the
## row names of the table it is built from; the six-row toy tables tx0 /
## gene0 do not describe these rows, so their row names must not be used.
## NOTE(review): assumes the transcript / gene IDs are the row names of
## transcriptcoords / genecoords -- confirm against how they were read in.
tx <- with(
  transcriptcoords,
  GRanges(V1, IRanges(V2, V3, names = rownames(transcriptcoords)))
)
gene <- with(
  genecoords,
  GRanges(V1, IRanges(V2, V3, names = rownames(genecoords)))
)
with some timings:
## Time the padding step: every gene is widened by 20 around its center.
system.time({
  gene <- resize(gene, width = width(gene) + 20, fix = "center")
})
## user system elapsed
## 0.056 0.000 0.057
## Time the search for transcripts that lie within a gene.
system.time({
  findOverlaps(query = tx, subject = gene, type = "within")
})
## user system elapsed
## 2.248 0.000 2.250
I think this is approximately the time that the data.table solution from @danas.zuokos takes with just 1000 transcripts:
## Time the data.table approach: join, filter, then pick one gene per
## transcript.
system.time({
  ## Join genecoords with transcriptcoords; allow.cartesian = TRUE permits a
  ## join result that is larger than the two inputs combined.
  dt <- genecoords[transcriptcoords, allow.cartesian = TRUE]
  ## Keep the rows that pass the tolerance test at both ends, then, for each
  ## transcript, report the gene with the smallest `size`.
  ## NOTE(review): presumably start/end are the gene's coordinates and
  ## start.1/end.1 the transcript's -- confirm against the join's column names.
  res <- dt[
    start <= start.1 + tol & end >= end.1 - tol,
    list(gene = gene[which.min(size)]),
    by = transcript
  ]
})
## user system elapsed
## 2.148 0.244 2.400