Try this on for size:
# this code assumes Python 2.7
from itertools import groupby, izip
from operator import attrgetter
# Path of the input table read by main(); its first line is skipped as a header.
INPUT = "file.txt"
# Output path for rows that have at least one homolog.
HOMO_YES = "homologs.txt"
# Output path for rows that have no homolog.
HOMO_NO = "nonhomologs.txt"
# Default tolerance used by count_homologs() when comparing coordinates.
MAX_DIFF = 5
class Row(object):
    """One whitespace-separated record parsed from a line of text.

    Attributes (all assigned in ``__init__``):
        line: the original text with trailing whitespace removed.
        con: the first field, kept as a string.
        protein: the second field, kept as a string.
        start: the third field converted to ``int``.
        end: the fourth field converted to ``int``.
    """

    # Inheriting from ``object`` is required on Python 2: ``__slots__`` is
    # ignored by old-style classes, so without it every instance would
    # still allocate a per-instance ``__dict__``.
    __slots__ = ["line", "con", "protein", "start", "end"]

    def __init__(self, s):
        """Parse ``s``, a line holding at least four whitespace-separated fields.

        Raises:
            IndexError: if ``s`` has fewer than four fields.
            ValueError: if the third or fourth field is not an integer.
        """
        self.line = s.rstrip()
        data = s.split()
        self.con = data[0]
        self.protein = data[1]
        self.start = int(data[2])
        self.end = int(data[3])

    def __str__(self):
        """Return the original line text (without trailing whitespace)."""
        return self.line
def count_homologs(items, max_diff=MAX_DIFF):
    """Count, for every item, how many other items are its homologs.

    Two items are treated as homologs when their ``start`` values and their
    ``end`` values each differ by at most ``max_diff``.

    Args:
        items: Sequence of objects with integer ``start`` and ``end``
            attributes, sorted by ascending ``start`` (the early exit in
            the inner loop relies on that order).
        max_diff: Largest allowed absolute difference for each coordinate.

    Returns:
        A list of ints parallel to ``items``; element ``k`` is the number of
        homologs found for ``items[k]``.
    """
    num_items = len(items)
    counts = [0] * num_items
    for i, item_i in enumerate(items):
        max_start = item_i.start + max_diff
        # Only later items are inspected; a matching pair is credited to
        # both of its members, so earlier items need not be revisited.
        j = i + 1
        while j < num_items:
            item_j = items[j]
            if item_j.start > max_start:
                # Items are sorted by start, so nothing further can match.
                break
            # A later item may start slightly after item_i yet end far
            # *before* it, so the end must be compared in both directions.
            if abs(item_j.end - item_i.end) <= max_diff:
                counts[i] += 1
                counts[j] += 1
            j += 1
    return counts
def main():
    """Split the rows of INPUT into homolog and non-homolog output files.

    The first line of INPUT is skipped as a header. Remaining rows are
    processed one ``con`` group at a time: each group is sorted by
    (start, end), homolog counts are computed with ``count_homologs``, and
    every row is written to HOMO_YES when its count is non-zero, otherwise
    to HOMO_NO.
    """
    with open(INPUT) as inf, open(HOMO_YES, "w") as outhomo, \
            open(HOMO_NO, "w") as outnothomo:
        # skip header
        next(inf, '')
        # Blank lines (e.g. a trailing empty line at end of file) have no
        # fields and would make Row() raise IndexError, so drop them here.
        rows = (Row(line) for line in inf if line.strip())
        # groupby only merges *consecutive* rows with the same key, so the
        # input is expected to arrive already grouped by con.
        for _con, item_iter in groupby(rows, key=attrgetter("con")):
            # per-con list of Rows sorted by start,end
            items = sorted(item_iter, key=attrgetter("start", "end"))
            # get #homologs for each item
            counts = count_homologs(items)
            # do output
            for c, item in izip(counts, items):
                out = outhomo if c else outnothomo
                out.write(str(item) + "\n")
# Run the splitter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
On your given data, this produces:
=== homologs.txt ===
con1 P1 140 602
con1 P2 140 602
con3 P9 16 348
con3 P8 17 348
=== nonhomologs.txt ===
con1 P3 232 548
con2 P6 335 779
con2 P4 335 801
con2 P5 642 732
con2 P7 729 812