# HG changeset patch
# User pravs
# Date 1501711793 14400
# Node ID 9ec27561593ec3f779ebf61fd0fe497bd73d48d0
planemo upload
diff -r 000000000000 -r 9ec27561593e removeFastaSubSequence.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/removeFastaSubSequence.py Wed Aug 02 18:09:53 2017 -0400
@@ -0,0 +1,42 @@
+
+# This program checks whether each sequence in a query FASTA file occurs as a
+# substring of a sequence in the reference FASTA file and, if so, removes it.
+# The updated database will contain only sequences unique to the query FASTA file.
+
+def main():
+    """Write query FASTA records whose sequence occurs in no reference record.
+
+    sys.argv: [1] reference FASTA, [2] query FASTA, [3] output FASTA path.
+    """
+    import sys
+    from Bio import SeqIO
+
+    ref_fastaFile = sys.argv[1].strip()
+    query_fastaFile = sys.argv[2].strip()
+
+    # "#" separator stops cross-record matches (assumes no "#" in the data).
+    ref_fastaSeq = "#".join(
+        str(rec.seq) for rec in SeqIO.parse(ref_fastaFile, "fasta"))
+
+    # Keep the dict: its iteration order decides the order of the output.
+    queries = SeqIO.to_dict(SeqIO.parse(query_fastaFile, "fasta"))
+    count = 0
+    # "with" closes the output file even when a write raises.
+    with open(sys.argv[3].strip(), "w") as outfh:
+        for rec in queries.values():
+            seq = str(rec.seq)
+            if seq in ref_fastaSeq:
+                count += 1
+            else:
+                outfh.write(">" + str(rec.description) + "\n" + seq + "\n")
+    # write() runs on Python 2 and 3; "print >>" is Python 2-only syntax.
+    sys.stdout.write("Total Number of Sequences Removed: %d\n" % count)
+    return None
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+ main()
+
+
+
+
diff -r 000000000000 -r 9ec27561593e removeFastaSubSequence.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/removeFastaSubSequence.xml Wed Aug 02 18:09:53 2017 -0400
@@ -0,0 +1,88 @@
+
+
+ Removes query sequences that occur as a subsequence of a sequence in a reference FASTA file.
+
+ biopython
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This program removes from the query FASTA file every sequence that is present as a subsequence of a sequence in the reference FASTA file.
+
+EXAMPLE:
+
+----
+
+Ref sequences:
+
+>reference_seq_1
+
+TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP
+
+>reference_seq_2
+
+RGLCISGLEKEVQVQSRQAEGPVHLWLRKGSTSAE
+
+----
+
+Query Sequences:
+
+>query_seq_1
+
+TKTILNYAVLSPCLSPGHVLGC
+
+
+>query_seq_2
+
+LDKDHLELCCTLSLPFSWACSWVLVL
+
+
+>query_seq_3
+
+LWGVPRGLCISG
+
+----
+
+Output Sequences:
+
+>query_seq_1
+
+TKTILNYAVLSPCLSPGHVLGC
+
+
+>query_seq_3
+
+LWGVPRGLCISG
+
+----
+
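+The output sequence file will contain only query_seq_1 and query_seq_3. query_seq_2 is removed because its sequence "LDKDHLELCCTLSLPFSWACSWVLVL" is
+present as a substring of reference_seq_1's sequence "TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP". query_seq_3 is kept because, although it matches the end of reference_seq_1 followed by the start of reference_seq_2, it is not contained in any single reference sequence.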
+
+
+
diff -r 000000000000 -r 9ec27561593e test-data/test_query.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_query.fasta Wed Aug 02 18:09:53 2017 -0400
@@ -0,0 +1,10 @@
+>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+EGPVHLWLRKGSTSAE
+>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+RGLCISGLEKEVQVQSRQA
+>generic|ENSMUST00000193003.1_3| [28 - 75] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+LCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPV
+>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TKTILNYAVLSPCLSPGHVLGC
+>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP
diff -r 000000000000 -r 9ec27561593e test-data/test_ref.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_ref.fasta Wed Aug 02 18:09:53 2017 -0400
@@ -0,0 +1,10 @@
+>sp|Q9EST3|4ET_MOUSE Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1 PE=1 SV=2
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSRNVESHLLAPAEIPGQPVSKNILQELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|Q9EST3-2|4ET_MOUSE Isoform 2 of Eukaryotic translation initiation factor 4E transporter OS=Mus musculus GN=Eif4enif1
+MEKSVAETENGDAFLELKKLPTSKSPHRYTKEELLDIKERPYSKQRPSCLSEKYDSDGVWDPEKWHASLYPASGRSSPVESLKKESESDRPSLVRRIADPRERVKEDDLDVVLSPQRRSFGGGCHVTAAVSSRRSGSPLEKDSDGLRLLGGRRIGSGRIISARAFEKDHRLSDKDLRDLRDRDRERDYKDKRFRREFGDSKRVFGERRRNDSYTEEEPEWFSAGPTSQSETIELTGFDDKILEEDHKGRKRTRRRTASVKEGIVECNGGVAEEDEVEVILAQEPSADQEVPRDVILPEQSPGEFDFNEFFNLDKVPCLASMIEDVLGEGSVSASRFSRWFSNPSRSGSRSSSLGSTPHEELERLAGLEQAVLSPGQNSGNYFAPIPSEDHAENKVDILEMLQKAKVDLKPLLSSLSANKEKLKESSHSGVVLSVEEVEAGLKGLKVDQQMKNSTPFMAEHLEETLSAASSNRQLKKDGDMTAFNKLVNTMKASGTLPTQPKVSELLGQPVQRPASSNLLSGLMGSLEATASLLSQRAPSPPMSQVFRTQAASADYLHPRIPSPIGFPSGPQQLLGDPFQGMRKPMSPVSAQMSQLELQQAALEGLALPHDLAVQTAPFYQPGFSKPQVDRTRDGLRNRQQRMSKSPAPMHGGNSSSPAPAASITSMLSPSFTPTSVIRKMYESREKTKEEMAPGMVVPGDGKEDTQKTSEENLLSSNPIPNTDQDSSTTNPKLSTLQRSSCSTPLSQTSRYTKEQDYRPKTAGRKTPTLASPVPGTPFLRPTHQVPLVPHVPIVRPAHQLHPGLVQRLIAQGVHPQHLPSLLQAGVLPPGIDMAPLQGLSGPLLGQPLYPLVSAASHPLLNPRPGTPLHLAVMQQQLQRSVLHPPGSSSQAAAISVQTPQNVPSRSGMPHMHSQLEHRTSQRSSSPVGLAKWFGSDVLQQPLPSMPTKVISVDELEYRQ
+>sp|P34968|5HT2C_MOUSE 5-hydroxytryptamine receptor 2C OS=Mus musculus GN=Htr2c PE=2 SV=2
+MVNLGTAVRSLLVHLIGLLVWQFDISISPVAAIVTDTFNSSDGGRLFQFPDGVQNWPALSIVVIIIMTIGGNILVIMAVSMEKKLHNATNYFLMSLAIADMLVGLLVMPLSLLAILYDYVWPLPRYLCPVWISLDVLFSTASIMHLCAISLDRYVAIRNPIEHSRFNSRTKAIMKIAIVWAISIGVSVPIPVIGLRDESKVFVNNTTCVLNDPNFVLIGSFVAFFIPLTIMVITYFLTIYVLRRQTLMLLRGHTEEELRNISLNFLKCCCKKGDEEENAPNPNPDQKPRRKKKEKRPRGTMQAINNEKKASKVLGIVFFVFLIMWCPFFITNILSVLCGKACNQKLMEKLLNVFVWIGYVCSGINPLVYTLFNKIYRRAFSKYLRCDYKPDKKPPVRQIPRVAATALSGRELNVNIYRHTNERVVRKANDTEPGIEMQVENLELPVNPSNVVSERISSV
+>sp|Q00896|A1AT3_MOUSE Alpha-1-antitrypsin 1-3 OS=Mus musculus GN=Serpina1c PE=1 SV=2
+MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIATNLGDFAISLYRELVHQSNTSNIFFSPVSIATAFAMLSLGSKGDTHTQILEGLQFNLTQTSEADIHKSFQHLLQTLNRPDSELQLSTGNGLFVNNDLKLVEKFLEEAKNHYQAEVFSVNFAESEEAKKVINDFVEKGTQGKIVEAVKKLDQDTVFALANYILFKGKWKKPFDPENTEEAEFHVDESTTVKVPMMTLSGMLDVHHCSTLSSWVLLMDYAGNATAVFLLPDDGKMQHLEQTLSKELISKFLLNRRRRLAQIHFPRLSISGEYNLKTLMSPLGITRIFNNGADLSGITEENAPLKLSQAVHKAVLTIDETGTEAAAVTVLLAVPYSMPPILRFDHPFLFIIFEEHTQSPLFVGKVVDPTH
+>sp|Q9D2R0|AACS_MOUSE Acetoacetyl-CoA synthetase OS=Mus musculus GN=Aacs PE=1 SV=1
+MSKLARLEREEIMECQVMWEPDSKKDTQMDRFRAAVGTACGLALGNYNDLYHWSVRSYMDFWAEFWKFSGIVYSRMYDEVVDTSKGIADVPEWFRGSRLNYAENLLRHKENDRVALYVAREGREEIVKVTFEELRQQVALFAAAMRKMGVKKGDRVVGYLPNSAHAVEAMLAAASIGAIWSSTSPDFGVNGVLDRFSQIQPKLIFSVEAVVYNGKEHGHLEKLQRVVKGLPDLQRVVLIPYVLPREKIDISKIPNSVFLDDFLASGTGAQAPQLEFEQLPFSHPLFIMFSSGTTGAPKCMVHSAGGTLIQHLKEHMLHGNMTSSDILLYYTTVGWMMWNWMVSALATGASLVLYDGSPLVPTPNVLWDLVDRIGITILGTGAKWLSVLEEKDMKPVETHNLHTLHTILSTGSPLKAQSYEYVYRCIKSSVLLGSISGGTDIISCFMGQNSSIPVYKGEIQARNLGMAVEAWDEEGKAVWGASGELVCTKPIPCQPTHFWNDENGSKYRKAYFSKFPGVWAHGDYCRINPKTGGIIMLGRSDGTLNPNGVRFGSSEIYNIVEAFDEVEDSLCVPQYNRDGEERVVLFLKMASGHTFQPDLVKRIRDAIRLGLSARHVPSLILETRGIPYTLNGKKVEVAVKQVMAGRTVEHRGAFSNPETLDLYRDIPELQDF
diff -r 000000000000 -r 9ec27561593e test-data/uniqSeq_test_query.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/uniqSeq_test_query.fasta Wed Aug 02 18:09:53 2017 -0400
@@ -0,0 +1,8 @@
+>generic|ENSMUST00000193003.1_5| [53 - 205] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TSLDKDHLELCCTLSLPFSWACSWVLVLRLSINGQLPRSRLWAAHCLWGVP
+>generic|ENSMUST00000193003.1_4| [63 - 128] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+TKTILNYAVLSPCLSPGHVLGC
+>generic|ENSMUST00000193003.1_1| [2 - 49] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+EGPVHLWLRKGSTSAE
+>generic|ENSMUST00000193003.1_2| [3 - 59] cdna chromosome:GRCm38:6:41557693:41558452:-1 gene:ENSMUSG00000076499.3
+RGLCISGLEKEVQVQSRQA
diff -r 000000000000 -r 9ec27561593e tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Aug 02 18:09:53 2017 -0400
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file