annotate removeFastaSubSequence.py @ 1:d49328dfeceb draft default tip

planemo upload
author pravs
date Wed, 02 Aug 2017 18:20:55 -0400
parents 9ec27561593e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9ec27561593e planemo upload
pravs
parents:
diff changeset
1
9ec27561593e planemo upload
pravs
parents:
diff changeset
2 # This program checks whether any of the sequences in a query FASTA file is present in the
9ec27561593e planemo upload
pravs
parents:
diff changeset
3 # reference FASTA file (checked as a sub-string). Sequences that are present are removed.
9ec27561593e planemo upload
pravs
parents:
diff changeset
4 # The updated database will contain only the sequences unique to the query FASTA file.
9ec27561593e planemo upload
pravs
parents:
diff changeset
5
9ec27561593e planemo upload
pravs
parents:
diff changeset
def main():
    """Write the query FASTA records whose sequence is absent from the reference.

    Command-line arguments (read from ``sys.argv``):
        1. path of the reference FASTA file
        2. path of the query FASTA file
        3. path of the output FASTA file to create

    A query record is dropped when its sequence occurs as a sub-string of
    the joined reference sequences; every other record is written to the
    output as a two-line entry (``>description`` followed by the sequence).
    The number of dropped records is reported on standard output.

    Returns:
        None
    """
    import sys
    from Bio import SeqIO

    ref_fastaFile = sys.argv[1].strip()
    query_fastaFile = sys.argv[2].strip()
    out_fastaFile = sys.argv[3].strip()

    # Concatenate every reference sequence into one searchable string.  The
    # "#" separator keeps a query from matching text that spans two adjacent
    # reference records (assumes "#" never occurs in sequence data).
    # Records are streamed from SeqIO.parse rather than collected with
    # SeqIO.to_dict: record ids are never used here, and to_dict raises
    # ValueError when two records share an id.
    ref_fastaSeq = "#".join(
        str(record.seq) for record in SeqIO.parse(ref_fastaFile, "fasta")
    )

    count = 0
    # The context manager closes the output file even if parsing or writing
    # fails part-way through.
    with open(out_fastaFile, "w") as outfh:
        for record in SeqIO.parse(query_fastaFile, "fasta"):
            seq = str(record.seq)
            if seq in ref_fastaSeq:
                # Present in the reference: drop it and count the removal.
                count += 1
            else:
                outfh.write(">" + str(record.description) + "\n" + seq + "\n")

    # sys.stdout.write behaves the same on Python 2 and Python 3, unlike the
    # Python 2-only "print >> sys.stdout" statement.
    sys.stdout.write("Total Number of Sequences Removed: %d\n" % count)
    return None
9ec27561593e planemo upload
pravs
parents:
diff changeset
35
9ec27561593e planemo upload
pravs
parents:
diff changeset
# Run the filter only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
9ec27561593e planemo upload
pravs
parents:
diff changeset
38
9ec27561593e planemo upload
pravs
parents:
diff changeset
39
9ec27561593e planemo upload
pravs
parents:
diff changeset
40
9ec27561593e planemo upload
pravs
parents:
diff changeset
41
9ec27561593e planemo upload
pravs
parents:
diff changeset
42