Mercurial > repos > peterjc > nlstradamus
changeset 1:f93ad4882338 draft
Uploaded v0.0.6, adds unit tests and minor documentation changes.
author | peterjc |
---|---|
date | Wed, 17 Apr 2013 08:26:25 -0400 |
parents | 0ad90e5eb390 |
children | 9ec94203d895 |
files | test-data/empty.fasta test-data/empty_nlstradamus.tabular test-data/four_human_proteins.fasta test-data/four_human_proteins.nlstradamus.tabular tools/protein_analysis/nlstradamus.txt tools/protein_analysis/nlstradamus.xml |
diffstat | 6 files changed, 128 insertions(+), 18 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/empty.fasta Wed Apr 17 08:26:25 2013 -0400 @@ -0,0 +1,2 @@ + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/empty_nlstradamus.tabular Wed Apr 17 08:26:25 2013 -0400 @@ -0,0 +1,1 @@ +#ID algorithm score start stop sequence
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta Wed Apr 17 08:26:25 2013 -0400 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.nlstradamus.tabular Wed Apr 17 08:26:25 2013 -0400 @@ -0,0 +1,2 @@ +#ID algorithm score start stop sequence +sp|Q9NSY1|BMP2K_HUMAN posterior 0.945 983 1027 RRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARRHKKVGRR
--- a/tools/protein_analysis/nlstradamus.txt Tue Jun 07 17:39:58 2011 -0400 +++ b/tools/protein_analysis/nlstradamus.txt Wed Apr 17 08:26:25 2013 -0400 @@ -1,7 +1,7 @@ -Galaxy wrapper for NLStradamus v1.7 (C++ version) -================================================= +Galaxy wrapper for NLStradamus v1.7 or v1.8 (C++ version) +========================================================= -This wrapper is copyright 2011 by Peter Cock, The James Hutton Institute +This wrapper is copyright 2011-2013 by Peter Cock, The James Hutton Institute (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. See the licence text below. @@ -11,15 +11,24 @@ A. N. Nguyen Ba, A. Pogoutse, N. Provart, A. M. Moses. NLStradamus: a simple Hidden Markov Model for nuclear localization signal prediction. BMC Bioinformatics. 2009 Jun 29;10(1):202. +http://dx.doi.org/10.1186/1471-2105-10-202 http://www.moseslab.csb.utoronto.ca/NLStradamus Early versions of NLStradamus did not have a native tabular output format, this was added in version 1.7. Additionally a fast C++ implementation was added at -this point (early versions of NLStradamus came as a perl script only). This -wrapper expects the compiled C++ binary "NLStradamus" to be on the system PATH. +this point (early versions of NLStradamus came as a perl script only). + +Version 1.8 fixed a C++ compilation issue on modern compilers, but is otherwise +unchanged. + -To install the wrapper installed the following files under the Galaxy tools +Installation +============ +This wrapper expects the compiled C++ binary "NLStradamus" to be on the system +PATH. + +To install the wrapper copy or move the following files under the Galaxy tools folder, e.g. in a tools/protein_analysis folder: * nlstradamus.xml (the Galaxy tool definition) @@ -31,6 +40,9 @@ <tool file="protein_analysis/nlstradamus.xml" /> +If you wish to run the unit tests, also add this to tools_conf.xml.sample +and move/copy the test-data files under Galaxy's test-data folder. + That's it. @@ -38,6 +50,11 @@ ======= v0.0.3 - Initial public release +v0.0.4 - Adding DOI link to reference + (Documentation change only) +v0.0.5 - Assume non-zero return codes are errors +v0.0.6 - Show output help text using a table + - Added unit tests Developers @@ -46,17 +63,20 @@ This script and related tools are being developed on the following hg branch: http://bitbucket.org/peterjc/galaxy-central/src/tools -For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use +For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use the following command from the Galaxy root folder: -tar -czf nlstradmus.tar.gz tools/protein_analysis/nlstradum.xml tools/protein_analysis/nlstradum.txt +$ tar -czf nlstradmus.tar.gz tools/protein_analysis/nlstradamus.xml tools/protein_analysis/nlstradamus.txt test-data/four_human_proteins.fasta test-data/four_human_proteins.nlstradamus.tabular test-data/empty.fasta test-data/empty_nlstradamus.tabular Check this worked: $ tar -tzf nlstradmus.tar.gz -filter/seq_filter_by_id.py -filter/seq_filter_by_id.txt -filter/seq_filter_by_id.xml +tools/protein_analysis/nlstradamus.xml +tools/protein_analysis/nlstradamus.txt +test-data/four_human_proteins.fasta +test-data/four_human_proteins.nlstradamus.tabular +test-data/empty.fasta +test-data/empty_nlstradamus.tabular Licence (MIT/BSD style)
--- a/tools/protein_analysis/nlstradamus.xml Tue Jun 07 17:39:58 2011 -0400 +++ b/tools/protein_analysis/nlstradamus.xml Wed Apr 17 08:26:25 2013 -0400 @@ -1,8 +1,13 @@ -<tool id="nlstradamus" name="NLStradamus" version="0.0.3"> +<tool id="nlstradamus" name="NLStradamus" version="0.0.6"> <description>Find nuclear localization signals (NLSs) in protein sequences</description> <command> NLStradamus -i $fasta_file -t $threshold -m $model -a $algorithm -tab > $tabular_file </command> + <stdio> + <!-- Assume anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <param name="model" type="select" display="radio" label="Model"> @@ -25,6 +30,20 @@ <requirement type="binary">NLStradamus</requirement> </requirements> <tests> + <test> + <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta" /> + <param name="model" value="1" /> + <param name="algorithm" value="1" /> + <param name="threshold" value="0.6" /> + <output name="tabular_file" file="four_human_proteins.nlstradamus.tabular" ftype="tabular" /> + </test> + <test> + <param name="fasta_file" value="empty.fasta" ftype="fasta" /> + <param name="model" value="2" /> + <param name="algorithm" value="2" /> + <param name="threshold" value="0.125"/> + <output name="tabular_file" file="empty_nlstradamus.tabular" ftype="tabular" /> + </test> </tests> <help> @@ -36,12 +55,16 @@ The input is a FASTA file of protein sequences, and the output is tabular with six columns (one row per NLS): - * Sequence identifier - * Algorithm (posterior or Viterbi) - * Score (probability between threshold and 1 for posterior algorithm) - * Start - * End - * Sequence of NLS +====== =================================================================== +Column Description +------ ------------------------------------------------------------------- + c1 Sequence identifier + c2 Algorithm (posterior or Viterbi) + c3 Score (probability between threshold and 1 for posterior algorithm) + c4 Start + c5 End + c6 Sequence of NLS +====== =================================================================== ----- @@ -50,6 +73,7 @@ A. N. Nguyen Ba, A. Pogoutse, N. Provart, A. M. Moses. NLStradamus: a simple Hidden Markov Model for nuclear localization signal prediction. BMC Bioinformatics. 2009 Jun 29;10(1):202. +http://dx.doi.org/10.1186/1471-2105-10-202 http://www.moseslab.csb.utoronto.ca/NLStradamus