If you can be sure that the proteins file and the lengths file are in the same order, you might want to modify the code not to use a dict, to be more memory-efficient for large data sets: write a generator function that yields the second column converted to int, then pair it with SeqIO.parse() using the built-in zip() (itertools.izip() on Python 2).
def read_lengths(path):
    """Read a "length file" into a dict mapping sequence ID to integer length.

    Each non-blank line must contain exactly two whitespace-separated
    columns: a sequence ID and its expected length.

    Args:
        path: Path to the length file.

    Returns:
        dict mapping sequence ID (str) to length (int).

    Raises:
        ValueError: if a non-blank line does not have exactly two columns,
            or if the second column is not an integer.
    """
    lengths = {}
    with open(path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline in the file),
                # which would otherwise crash the two-value unpack below.
                continue
            seq_id, length = fields
            lengths[seq_id] = int(length)
    return lengths
def enclosed_substrings(s, start, stop):
    """Yield every substring of `s` that begins with `start` and ends with `stop`.

    Matches may overlap: the search for the next `start` resumes one
    character past the previous match's beginning, so nested occurrences
    (e.g. both "MM*" and "M*" in "MM*") are all yielded.
    """
    begin = s.find(start)
    while begin >= 0:
        end = s.find(stop, begin + 1)
        if end < 0:
            # No closing marker remains; nothing further can match.
            return
        yield s[begin:end + 1]
        begin = s.find(start, begin + 1)
def seq_check(record, expected_lens, len_factor=0.9, start='M', stop='*'):
    """Return True if `record.seq` holds a start..stop fragment long enough.

    "Long enough" means at least `len_factor` times the expected length
    looked up in `expected_lens` by `record.id` (KeyError if absent).
    """
    threshold = expected_lens[record.id] * len_factor
    return any(len(fragment) >= threshold
               for fragment in enclosed_substrings(record.seq, start, stop))
# --- Script driver: keep only records that pass the length check. ---
# NOTE(review): SeqIO (Biopython) must be imported elsewhere in the file —
# it is not visible in this chunk; confirm the import exists.
source_file = 'protein.fasta'        # input FASTA of protein sequences
out_file = 'filtered.fa'             # destination for passing records
length_file = 'protein_length.txt'   # two-column "id length" table
# Map each sequence ID to its expected length.
expected_lengths = read_lengths(length_file)
# Lazily parse records; the generator below filters them one at a time,
# so the whole FASTA is never held in memory.
seqs = SeqIO.parse(source_file, 'fasta')
filtered = (seq for seq in seqs if seq_check(seq, expected_lengths))
SeqIO.write(filtered, out_file, 'fasta')