# HG changeset patch # User peterjc # Date 1525786545 14400 # Node ID c323e29a8248660821d3eff51537e6a6dfe2350b Initial release v0.0.1 diff -r 000000000000 -r c323e29a8248 test-data/SRR639755_sample_strict.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SRR639755_sample_strict.fastq Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,8 @@ +@SRR639755.6451003/1 +ATATCTGCAGTTAACATAAAAATATAGCACGAAAGTAACTTTAATATCTCCGACCACACGATAGCTAAGACCCAAACTGGGATTAGATACCCCGCTATGCT ++ +HBHGGEFH@?DDDDFGFHGHGIGB;CEH>A>DEEC?B;;=@CC9;;?CCCCCCC@<9>5<<@A4 +@SRR639755.6451003/2 +CTCATGGGCTACACCTTGACCTAACTTTTTTGTGTTAAGGCACTTGTGCTTACTTTTCTTCCTTTTTAGGGTTTGCTGAAGATGGCGGTATGTAGGCTGAA ++ +@@@F=DDDHGFHHBHEH>HDHIEH8CECCFAGDBHH@DFBHGG@DHBFCHFH@FHIIIGDD@CHIJJF>EECBFFFEEE>AC>CC@@B89?C:B3::AB?> diff -r 000000000000 -r c323e29a8248 test-data/SRR639755_sample_strict.length.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/SRR639755_sample_strict.length.tabular Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,3 @@ +#Identifier Length +SRR639755.6451003/1 101 +SRR639755.6451003/2 101 diff -r 000000000000 -r c323e29a8248 test-data/four_human_proteins.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG 
+GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL 
+KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA diff -r 000000000000 -r c323e29a8248 test-data/four_human_proteins.length.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.length.tabular Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,5 @@ +#Identifier Length +sp|Q9BS26|ERP44_HUMAN 406 +sp|Q9NSY1|BMP2K_HUMAN 1161 +sp|P06213|INSR_HUMAN 1382 +sp|P08100|OPSD_HUMAN 348 diff -r 000000000000 -r c323e29a8248 tools/seq_length/README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_length/README.rst Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,119 @@ +Galaxy tool to rename FASTA, QUAL, FASTQ or SFF sequences +========================================================= + +This tool is copyright 2011-2017 by Peter Cock, The James Hutton Institute +(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved. +See the licence text below. 
+ +This tool is a short Python script (using Biopython library functions) to report +the length of each sequence in a FASTA, QUAL, FASTQ, or SFF file, written as a +two-column tabular file (identifier, length). The output order follows that of the sequence file, +and if there are duplicates in the input sequence file, there will be duplicates +in the output tabular file. + +This tool is available from the Galaxy Tool Shed, + +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_length + +See also the sister tools to filter or select sequence files according to IDs +from column(s) of tabular file: + +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id +* http://toolshed.g2.bx.psu.edu/view/peterjc/seq_select_by_id + + +Automated Installation +====================== + +This should be straightforward using the Galaxy Tool Shed, which should be +able to automatically install the dependency on Biopython, and then install +this tool and run its unit tests. + + +Manual Installation +=================== + +There are just two files to install to use this tool from within Galaxy: + +* ``seq_length.py`` (the Python script) +* ``seq_length.xml`` (the Galaxy tool definition) + +The suggested location is in a dedicated ``tools/seq_length`` folder. + +You will also need to modify the ``tool_conf.xml`` file to tell Galaxy to offer the +tool. One suggested location is in the filters section. Simply add the line:: + + <tool file="seq_length/seq_length.xml" /> + +If you wish to run the unit tests, also move/copy the ``test-data/`` files +under Galaxy's ``test-data/`` folder. Then:: + + $ ./run_tests.sh -id seq_length + +You will also need to install Biopython 1.54 or later. That's it. + + +History +======= + +======= ====================================================================== +Version Changes +------- ---------------------------------------------------------------------- +v0.0.1 - Initial version. 
+======= ====================================================================== + + +Developers +========== + +Development is here: + +https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_length + +For pushing a release to the test or main "Galaxy Tool Shed", use the following +Planemo commands (which requires you have set your Tool Shed access details in +``~/.planemo.yml`` and that you have access rights on the Tool Shed):: + + $ planemo shed_update -t testtoolshed --check_diff tools/seq_length/ + ... + +or:: + + $ planemo shed_update -t toolshed --check_diff tools/seq_length/ + ... + +To just build and check the tar ball, use:: + + $ planemo shed_upload --tar_only tools/seq_length/ + ... + $ tar -tzf shed_upload.tar.gz + test-data/SRR639755_sample_strict.fastq + test-data/SRR639755_sample_strict.length.tabular + test-data/four_human_proteins.fasta + test-data/four_human_proteins.length.tabular + tools/seq_length/README.rst + tools/seq_length/seq_length.py + tools/seq_length/seq_length.xml + tools/seq_length/tool_dependencies.xml + + +Licence (MIT) +============= + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff -r 000000000000 -r c323e29a8248 tools/seq_length/seq_length.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_length/seq_length.py Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,69 @@ +#!/usr/bin/env python +"""Compute length of FASTA, QUAL, FASTQ or SFF sequences. + +Takes three command line options: input sequence filename, input type +(e.g. FASTA or SFF) and the output filename (tabular). + +This tool is a short Python script which requires Biopython 1.54 or later +for SFF file support. If you use this tool in scientific work leading to a +publication, please cite the Biopython application note: + +Cock et al 2009. Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This script is copyright 2018 by Peter Cock, The James Hutton Institute UK. +All rights reserved. See accompanying text file for licence details (MIT +license). 
+""" + +from __future__ import print_function + +import sys + +if "-v" in sys.argv or "--version" in sys.argv: + print("v0.0.1") + sys.exit(0) + +try: + from Bio import SeqIO +except ImportError: + sys.exit("Missing required Python library Biopython.") + + +# Parse Command Line +try: + in_file, seq_format, out_file = sys.argv[1:] +except ValueError: + sys.exit("Expected three arguments (input file, format, output file), " + "got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv))) + + +if seq_format.lower().startswith("fastq"): + # We don't care about the quality score encoding, just + # need to translate Galaxy format name into something + # Biopython will accept: + format = "fastq" +elif seq_format.lower() == "csfasta": + # I have not tested with colour space FASTA + format = "fasta" +elif seq_format.lower() == "sff": + # The masked/trimmed numbers are more interesting + format = "sff-trim" +elif seq_format.lower() in ["fasta", "qual"]: + format = seq_format.lower() +else: + # TODO: Does Galaxy understand GenBank, EMBL, etc yet? 
+ sys.exit("Unexpected format argument: %r" % seq_format) + + +count = 0 +total = 0 +with open(out_file, "w") as out_handle: + out_handle.write("#Identifier\tLength\n") + for record in SeqIO.parse(in_file, format): + count += 1 + length = len(record) + total += length + out_handle.write("%s\t%i\n" % (record.id, length)) +print("%i sequences, total length %i" % (count, total)) diff -r 000000000000 -r c323e29a8248 tools/seq_length/seq_length.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_length/seq_length.xml Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,54 @@ + + with ID mapping from a tabular file + + + biopython + + +python $__tool_directory__/seq_length.py --version + + +python $__tool_directory__/seq_length.py '$input_file' '$input_file.ext' '$output_file' + + + + + + + + + + + + + + + + + + +**What it does** + +Takes a FASTA, QUAL, FASTQ or Standard Flowgram Format (SFF) file and produces a +two-column tabular file containing one line per sequence giving the sequence +identifier and the associated sequence's length. + +WARNING: If there are any duplicate sequence identifiers, these will all appear +in the tabular output. + +**References** + +This tool uses Biopython's ``SeqIO`` library to read sequences, so please cite +the Biopython application note (and Galaxy too of course): + +Cock et al (2009). Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. + +This tool is available to install into other Galaxy Instances via the Galaxy +Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_length + + + 10.1093/bioinformatics/btp163 + + diff -r 000000000000 -r c323e29a8248 tools/seq_length/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/seq_length/tool_dependencies.xml Tue May 08 09:35:45 2018 -0400 @@ -0,0 +1,6 @@ + + + + + +