Question

I have 2 fasta files with sequence's in it.I want to align the sequences in second file to first file and report identity

For example:

File1:

>s1
aaccggactggacatccg
>s2
gtcgactctcggaattg
....

File2:

>a1
actg
>a2
tccg
.....

I want to take the file2 sequences and look in file1 and print the matching with mismatched base in uppercase and identity in csv format

Output

name,a1_alignment,a1_identity,a2_alignment,a2_identity
s1,actg,100,tccg,100
s2,aCtg,95,tcCg,95

Here what I did so far:

import sys
import os,csv
from Bio import SeqIO
from itertools import *
from optparse import OptionParser

parser = OptionParser()
parser.add_option("-m", "--mismatch_threshold", dest="mismatch_threshold", default = 2,
              help="This is the number of differences you'll allow between the   actualread and your sequence of interest.  Default is 2")
(options, args) = parser.parse_args()

if len(sys.argv) != 4:
    print "Usage : python search.py <file1> <file2> <fout>"
sys.exit()

f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
fout = open(sys.argv[3],'w')
writer = csv.writer(fout)

def long(f1):
    for record in SeqIO.parse(f1,'fasta'):
        header = record.name
        sequence = record.seq   
    yield [header, sequence]

def short(f2):
    for record in SeqIO.parse(f2,'fasta'):
            head = record.name
            seq = record.seq  
    return seq

def alignment(sequence,seq,mismatch_threshold):
    l1 = len(sequence)
    l2 = len(seq)
    alignment = []
    for i in range(0,min(l1,l2)):
        if sequence[i] == seq[i]:
            alignment.append(i)
        else:
            mismatch = sum( c1 != c2 for c1,c2 in zip(sequence,seq))                                    
            if mismatch <= mismatch_threshold:                               
                    alignment.append(i)
    k = 0
    l = 0
    for read in alignment:
        for letter in read:
            if letter == isupper():
                pass
            else:
                if letter == alignment[0].seq[j]:
                    l +=1
            k += 1
        k = 0
        length = seq
        percent = 100*l/len(seq)
        #print percent
        yield percent


longsequences = long(open(sys.argv[1],'r'))
shortsequences = short(open(sys.argv[2],'r'))
align = alignment(longsequences,shortsequences,options.mismatch_threshold)

for name in head:
    writer.writerow(( name +'_alignment' , name + '_identity'))
for s in align:
    # print to csv file    

I need help in looking the file2 sequences in file1 with mismatches and print the alignment and also in calculating the identity percentage

Error:

File "s.py", line 34, in alignment
l1 = len(sequence)
TypeError: object of type 'generator' has no len()

No correct solution

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top