Mercurial > repos > pravs > remove_fasta_subsequences
diff removeFastaSubSequence.py @ 0:9ec27561593e draft
planemo upload
author | pravs |
---|---|
date | Wed, 02 Aug 2017 18:09:53 -0400 |
parents | |
children |
line wrap: on
line diff
# This program checks whether any sequence in a query FASTA file is present in
# the reference FASTA file (as a sub-string of a reference sequence). Sequences
# that are present are removed; the output file contains only the sequences
# that are unique to the query FASTA file.
#
# Usage:
#   removeFastaSubSequence.py <reference.fasta> <query.fasta> <output.fasta>


def main():
    """Write the query sequences that do not occur in the reference.

    Command-line arguments (read from ``sys.argv``):
        1. path of the reference FASTA file
        2. path of the query FASTA file
        3. path of the output FASTA file (created or overwritten)

    A query record is dropped when its sequence is a sub-string of any
    reference sequence; every other record is written as
    ``>description`` followed by the sequence on a single line. The number
    of dropped records is reported on standard output.

    Returns:
        None

    Raises:
        SystemExit: if fewer than three arguments are supplied.
    """
    import sys
    from Bio import SeqIO

    if len(sys.argv) < 4:
        sys.exit(
            "Usage: %s <reference.fasta> <query.fasta> <output.fasta>"
            % sys.argv[0]
        )

    ref_fasta_file = sys.argv[1].strip()
    query_fasta_file = sys.argv[2].strip()
    output_file = sys.argv[3].strip()

    # Join every reference sequence into one searchable string. The "#"
    # separator keeps a query from matching across the boundary between two
    # neighbouring reference sequences (assuming sequences never contain "#").
    # Records are streamed straight from the parser: building a dict first
    # would fail on duplicate record IDs and is not needed for a linear scan.
    ref_fasta_seq = "#".join(
        str(record.seq) for record in SeqIO.parse(ref_fasta_file, "fasta")
    )

    removed = 0
    # The context manager guarantees the output is flushed and closed even if
    # parsing or writing fails part-way through.
    with open(output_file, "w") as outfh:
        for record in SeqIO.parse(query_fasta_file, "fasta"):
            seq = str(record.seq)
            # NOTE: an empty query sequence is a sub-string of anything and is
            # therefore counted as removed, same as before.
            if seq in ref_fasta_seq:
                removed += 1
            else:
                outfh.write(">" + str(record.description) + "\n" + seq + "\n")

    # sys.stdout.write works on both Python 2 and Python 3, unlike the
    # Python 2-only "print >>" statement.
    sys.stdout.write("Total Number of Sequences Removed: %d\n" % removed)
    return None


if __name__ == "__main__":
    main()