regex python Fasta

Question 1

You could first collect the id numbers from file a to a set for fast lookup later:

seta = set()
regexa = re.compile(r'\*(\d+)') #matches asterisk followed by digits, captures digits
for line in a:
    m = regexa.match(line)      #looks for match at start of line
    if m:
        seta.add(m.group(1))

Then loop over b. Use b.next() inside the loop to get the second line where the sequence is.

regexb = re.compile(r'>OCTU(\d+)')  #matches ">OCTU" followed by digits, captures digits
for line in b:
    m = regexb.match(line)
    if m:
        sequence = b.next() 
        if m.group(1) in seta:
            c.write(line)
            c.write(sequence)

Question 2

You may want to use Biopython to parse the fasta file.

Then you can slice out the number and look it up in your list and access the sequence and sequence name more reliably...If a fasta file has line wrapping the above method may run into problems...

import collections
from Bio import SeqIO

infile = "yourfastafile.fasta"
outfile = "desired_outfilename.fasta"

dct = collections.OrderedDict()
for record in SeqIO.parse(open(infile), "fasta"):
    dct[record.description()] = str(record.seq).upper()

for k,v in dct.items():
    if int(k[4:]) in seta: #from answer above
        with open(outfile, "a") as handle:
            handle.write(">" + k + "\n" + str(v) + "\n")

Question 3

coding=utf8

the above tag defines encoding for this document and is for Python 2.x compatibility

import re

regex = r">.+\n[acgtnACGTN\n]+"

test_str = (">AB000263 |acc=AB000263|descr=Homo sapiens mRNA for prepro cortistatin like peptide, complete cds.|len=368\n"
    "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGGCCACGGCCACCGCTGCCCTGCC\n"
    "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGCGGCAGGAATAAGGAAAAGCAGC\n"
    "CTCCTGACTTTCCTCGCTTGGTGGTTTGAGTGGACCTCCCAGGCCAGTGCCGGGCCCCTCATAGGAGAGG\n"
    "AAGCTCGGGAGGTGGCCAGGCGGCAGGAAGGCGCACCCCCCCAGCAATCCGCGCGCCGGGACAGAATGCC\n"
    "CTGCAGGAACTTCTTCTGGAAGACCTTCTCCTCCTGCAAATAAAACCTCACCCATGAATGCTCACGCAAG\n"
    "TTTAATTACAGACCTGAA")

matches = re.finditer(regex, test_str)

for matchNum, match in enumerate(matches):
    matchNum = matchNum + 1

    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))

    for groupNum in range(0, len(match.groups())):
        groupNum = groupNum + 1

        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.