0
|
1
|
|
# This program checks whether each sequence in a query FASTA file is present in the
# reference FASTA file (as a substring of a reference sequence). Sequences that are present are removed.
# The updated database will contain only the sequences unique to the query FASTA file.
|
|
5
|
|
def main():
    """Write the query FASTA records whose sequence is absent from the reference.

    Command-line arguments (read from ``sys.argv``):
        1. path of the reference FASTA file
        2. path of the query FASTA file
        3. path of the output FASTA file

    A query record is dropped when its sequence occurs as a substring of a
    reference sequence (an empty query sequence therefore always counts as
    present).  Every other record is written to the output file as a
    two-line FASTA entry: ``>`` plus the record description, then the whole
    sequence on one line.  The number of dropped records is reported on
    stdout.

    Returns:
        None

    Raises:
        SystemExit: if fewer than three arguments are supplied.
        ValueError: raised by ``SeqIO.to_dict`` when the query file contains
            duplicate record identifiers.
    """
    import sys

    from Bio import SeqIO

    if len(sys.argv) < 4:
        sys.exit(
            "Usage: %s <reference.fasta> <query.fasta> <output.fasta>"
            % sys.argv[0]
        )

    ref_fastaFile = sys.argv[1].strip()
    query_fastaFile = sys.argv[2].strip()
    out_fastaFile = sys.argv[3].strip()

    # Only the reference sequences are needed, so stream the records instead
    # of building an id -> record dict.  Joining with "#" (presumably a
    # character that never occurs in sequence data) keeps a query from
    # matching across the boundary between two reference sequences.
    ref_fastaSeq = "#".join(
        str(record.seq) for record in SeqIO.parse(ref_fastaFile, "fasta")
    )

    # Load the query before opening the output so that a parse failure does
    # not leave a truncated output file behind.
    query_records = SeqIO.to_dict(SeqIO.parse(query_fastaFile, "fasta"))

    removed = 0
    # The context manager closes the output file even if a write fails.
    with open(out_fastaFile, "w") as outfh:
        for record in query_records.values():
            seq = str(record.seq)
            if seq in ref_fastaSeq:
                removed += 1
            else:
                outfh.write(">" + str(record.description) + "\n" + seq + "\n")

    # sys.stdout.write behaves the same on Python 2 and 3, unlike the
    # Python 2-only ``print >> sys.stdout`` statement.
    sys.stdout.write("Total Number of Sequences Removed: %d\n" % removed)
    return None
|
|
35
|
|
# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
38
|
|
39
|
|
40
|
|
41
|
|
42
|