Mercurial > repos > pravs > remove_fasta_subsequences
changeset 0:9ec27561593e draft
planemo upload
author | pravs |
---|---|
date | Wed, 02 Aug 2017 18:09:53 -0400 |
parents | |
children | d49328dfeceb |
files | removeFastaSubSequence.py removeFastaSubSequence.xml test-data/test_query.fasta test-data/test_ref.fasta test-data/uniqSeq_test_query.fasta tool_dependencies.xml |
diffstat | 6 files changed, 162 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# removeFastaSubSequence.py
#
# This program checks whether any sequence in a query FASTA file is present in
# the reference FASTA file (as a sub-string). Query sequences that are present
# are removed, so the output holds only sequences unique to the query file.


def main():
    """Write the query records whose sequence occurs in no reference sequence.

    Command-line arguments (read from ``sys.argv``):
        1: path of the reference FASTA file
        2: path of the query FASTA file
        3: path of the output FASTA file (opened with mode ``"w"``)

    Each kept record is written as ``>description`` followed by its sequence
    on a single line. The number of removed query sequences is reported on
    standard output.

    Returns:
        None
    """
    # Imports stay local to main(), as in the original script.
    import sys
    from Bio import SeqIO

    ref_fastaFile = sys.argv[1].strip()
    query_fastaFile = sys.argv[2].strip()
    out_fastaFile = sys.argv[3].strip()

    # Only the reference *sequences* matter, so stream the records instead of
    # building a dict keyed by ID (which also rejected duplicate reference
    # IDs for no benefit). The "#" separator keeps a query from matching
    # across the boundary between two neighbouring reference sequences.
    ref_fastaSeq = "#".join(
        str(record.seq) for record in SeqIO.parse(ref_fastaFile, "fasta")
    )

    # The query is still loaded through SeqIO.to_dict, exactly as before, so
    # the order of the written records and the handling of duplicate query
    # IDs are unchanged.
    query_records = SeqIO.to_dict(SeqIO.parse(query_fastaFile, "fasta"))

    count = 0
    # The context manager closes the output file even if a write fails.
    with open(out_fastaFile, "w") as outfh:
        for record in query_records.values():
            seq = str(record.seq)
            if seq in ref_fastaSeq:
                count += 1
            else:
                outfh.write(">" + str(record.description) + "\n" + seq + "\n")

    # sys.stdout.write works on both Python 2 and Python 3, unlike the
    # Python-2-only "print >> sys.stdout, ..." statement.
    sys.stdout.write("Total Number of Sequences Removed: %d\n" % count)
    return None


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/removeFastaSubSequence.xml Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,88 @@ + +<tool id="removeFastaSubSequence" name="Remove Fasta Substring Sequence" version="1.0.0"> + <description>Removes sequences that are subsequence in a reference Fasta File.</description> + <requirements> + <requirement type="package" version="1.70">biopython</requirement> + </requirements> + <command interpreter="python"><![CDATA[removeFastaSubSequence.py $ref_fastafile $query_fastafile $output]]></command> + <inputs> + <param name="ref_fastafile" type="data" format="fasta"> + <label>Input Reference Fasta File</label> + </param> + <param name="query_fastafile" type="data" format="fasta"> + <label>Input Query Fasta File</label> + </param> + </inputs> + + <outputs> + <data format="fasta" name="output" label="uniqSeq_${query_fastafile.name.rsplit('.',1)[0]}.fasta" /> + </outputs> + + <tests> + <test> + <param name="ref_fastafile" value="test_ref.fasta" /> + <param name="query_fastafile" value="test_query.fasta" /> + <output name="output" file="uniqSeq_test_query.fasta"> + <assert_contents> + <has_text text="ENSMUST00000193003" /> + </assert_contents> + </output> + </test> + </tests> + + + <help> +This program removes the sequences from the query fasta file that are present as subsequence in a reference fasta file. + +EXAMPLE: + +---- + +Ref sequences: + +>reference_seq_1 + +TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP + +>reference_seq_2 + +RGLCISGLEKEVQVQSRQAEGPVHLWLRKGSTSAE + +---- + +Query Sequences: + +>query_seq_1 + +TKTILNYAVLSPCLSPGHVLGC + + +>query_seq_2 + +LDKDHLELCCTLSLPFSWACSWVLVL + + +>query_seq_3 + +LWGVPRGLCISG + +---- + +Output Sequences: + +>query_seq_1 + +TKTILNYAVLSPCLSPGHVLGC + + +>query_seq_3 + +LWGVPRGLCISG + +---- + +Output Sequence file will have only query_seq_1 and query_seq_3. 
query_seq_2 is removed because its sequence "LDKDHLELCCTLSLPFSWACSWVLVL" is +present as a substring of reference_seq_1's sequence "TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP". + + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_query.fasta Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,10 @@ +>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +EGPVHLWLRKGSTSAE +>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +RGLCISGLEKEVQVQSRQA +>generic|ENSMUST00000193003.1_3| [28 - 75] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +LCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPV +>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TKTILNYAVLSPCLSPGHVLGC +>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_ref.fasta Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,10 @@ +>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2 +MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ +>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2 +MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV +>sp|Q00896|A1AT3_MOUSE Alpha-1-antitrypsin 1-3 OS=Mus musculus GN=Serpina1c PE=1 SV=2 
+MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPVSIATAFAMLSLGSKGDTHTQILEGLQFNLTQTSEADIHKSFQHLLQTLNRPDSELQLSTGNGLFVNNDLKLVEKFLEEAKNHYQAEVFSVNFAESEEAKKVINDFVEKGTQGKIVEAVKKLDQDTVFALANYILFKGKWKKPFDPENTEEAEFHVDESTTVKVPMMTLSGMLDVHHCSTLSSWVLLMDYAGNATAVFLLPDDGKMQHLEQTLSKELISKFLLNRRRRLAQIHFPRLSISGEYNLKTLMSPLGITRIFNNGADLSGITEENAPLKLSQAVHKAVLTIDETGTEAAAVTVLLAVPYSMPPILRFDHPFLFIIFEEHTQSPLFVGKVVDPTH +>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1 +MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/uniqSeq_test_query.fasta Wed Aug 02 18:09:53 2017 -0400 @@ -0,0 +1,8 @@ +>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP +>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +TKTILNYAVLSPCLSPGHVLGC +>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +EGPVHLWLRKGSTSAE +>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3 +RGLCISGLEKEVQVQSRQA