search pattern in sequence and report identity
-
19-10-2022 - |
Question
I have 2 fasta files with sequence's in it.I want to align the sequences in second file to first file and report identity
For example:
File1:
>s1
aaccggactggacatccg
>s2
gtcgactctcggaattg
....
File2:
>a1
actg
>a2
tccg
.....
I want to take the file2 sequences and look in file1 and print the matching with mismatched base in uppercase and identity in csv format
Output
name,a1_alignment,a1_identity,a2_alignment,a2_identity
s1,actg,100,tccg,100
s2,aCtg,95,tcCg,95
Here what I did so far:
import sys
import os,csv
from Bio import SeqIO
from itertools import *
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-m", "--mismatch_threshold", dest="mismatch_threshold", default = 2,
help="This is the number of differences you'll allow between the actualread and your sequence of interest. Default is 2")
(options, args) = parser.parse_args()
if len(sys.argv) != 4:
print "Usage : python search.py <file1> <file2> <fout>"
sys.exit()
f1 = open(sys.argv[1],'r')
f2 = open(sys.argv[2],'r')
fout = open(sys.argv[3],'w')
writer = csv.writer(fout)
def long(f1):
for record in SeqIO.parse(f1,'fasta'):
header = record.name
sequence = record.seq
yield [header, sequence]
def short(f2):
for record in SeqIO.parse(f2,'fasta'):
head = record.name
seq = record.seq
return seq
def alignment(sequence,seq,mismatch_threshold):
l1 = len(sequence)
l2 = len(seq)
alignment = []
for i in range(0,min(l1,l2)):
if sequence[i] == seq[i]:
alignment.append(i)
else:
mismatch = sum( c1 != c2 for c1,c2 in zip(sequence,seq))
if mismatch <= mismatch_threshold:
alignment.append(i)
k = 0
l = 0
for read in alignment:
for letter in read:
if letter == isupper():
pass
else:
if letter == alignment[0].seq[j]:
l +=1
k += 1
k = 0
length = seq
percent = 100*l/len(seq)
#print percent
yield percent
longsequences = long(open(sys.argv[1],'r'))
shortsequences = short(open(sys.argv[2],'r'))
align = alignment(longsequences,shortsequences,options.mismatch_threshold)
for name in head:
writer.writerow(( name +'_alignment' , name + '_identity'))
for s in align:
# print to csv file
I need help in looking the file2 sequences in file1 with mismatches and print the alignment and also in calculating the identity percentage
Error:
File "s.py", line 34, in alignment
l1 = len(sequence)
TypeError: object of type 'generator' has no len()
No correct solution
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow