Mercurial > repos > peterjc > tmhmm_and_signalp
changeset 9:e52220a9ddad draft
Uploaded v0.1.2
Use the new <stdio> settings in the XML wrappers to catch errors.
Obeys SGE style $NSLOTS environment variable for thread count (otherwise default to 4).
author | peterjc |
---|---|
date | Fri, 25 Jan 2013 06:08:31 -0500 |
parents | 976a5f2833cd |
children | 09ff180d1615 |
files | test-data/four_human_proteins.fasta.orig tools/protein_analysis/README tools/protein_analysis/promoter2.py tools/protein_analysis/promoter2.xml tools/protein_analysis/rxlr_motifs.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.py tools/protein_analysis/signalp3.xml tools/protein_analysis/tmhmm2.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/wolf_psort.py tools/protein_analysis/wolf_psort.xml |
diffstat | 12 files changed, 160 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta.orig Fri Jan 25 06:08:31 2013 -0500 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE +NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo 
sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 +MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA
--- a/tools/protein_analysis/README Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/README Fri Jan 25 06:08:31 2013 -0500 @@ -127,6 +127,9 @@ v0.1.0 - Added Promoter 2.0 wrapper (similar to SignalP & TMHMM wrappers) - Support Galaxy's <parallelism> tag for SignalP, TMHMM & Promoter v0.1.1 - Fixed an error in the header of the tabular output from Promoter +v0.1.2 - Use the new <stdio> settings in the XML wrappers to catch errors + - Use SGE style $NSLOTS for thread count (otherwise default to 4) + Developers ==========
--- a/tools/protein_analysis/promoter2.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/promoter2.py Fri Jan 25 06:08:31 2013 -0500 @@ -30,20 +30,15 @@ import os import commands import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " "Got %i arguments." % (len(sys.argv)-1)) -try: - num_threads = int(sys.argv[1]) -except: - num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) +num_threads = thread_count(sys.argv[3],default=4) fasta_file = os.path.abspath(sys.argv[2]) tabular_file = os.path.abspath(sys.argv[3])
--- a/tools/protein_analysis/promoter2.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/promoter2.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,4 +1,4 @@ -<tool id="promoter2" name="Promoter 2.0" version="0.0.2"> +<tool id="promoter2" name="Promoter 2.0" version="0.0.3"> <description>Find eukaryotic PolII promoters in DNA sequences</description> <!-- If job splitting is enabled, break up the query file into parts --> <!-- Using 2000 per chunk so 4 threads each doing 500 is ideal --> @@ -9,6 +9,11 @@ ##which (on SGE at least) will set the $NSLOTS environment variable. ##If the environment variable isn't set, get "", and defaults to one. </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of DNA sequences"/> </inputs>
--- a/tools/protein_analysis/rxlr_motifs.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/rxlr_motifs.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,9 +1,14 @@ -<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.5"> +<tool id="rxlr_motifs" name="RXLR Motifs" version="0.0.6"> <description>Find RXLR Effectors of Plant Pathogenic Oomycetes</description> <command interpreter="python"> rxlr_motifs.py $fasta_file 8 $model $tabular_file ##I want the number of threads to be a Galaxy config option... </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences" /> <param name="model" type="select" label="Which RXLR model?">
--- a/tools/protein_analysis/seq_analysis_utils.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/seq_analysis_utils.py Fri Jan 25 06:08:31 2013 -0500 @@ -19,6 +19,56 @@ sys.stderr.write("%s\n" % msg) sys.exit(error_level) +try: + from multiprocessing import cpu_count +except ImportError: + #Must be under Python 2.5, this is copied from multiprocessing: + def cpu_count(): + """Returns the number of CPUs in the system.""" + if sys.platform == 'win32': + try: + num = int(os.environ['NUMBER_OF_PROCESSORS']) + except (ValueError, KeyError): + num = 0 + elif 'bsd' in sys.platform or sys.platform == 'darwin': + comm = '/sbin/sysctl -n hw.ncpu' + if sys.platform == 'darwin': + comm = '/usr' + comm + try: + with os.popen(comm) as p: + num = int(p.read()) + except ValueError: + num = 0 + else: + try: + num = os.sysconf('SC_NPROCESSORS_ONLN') + except (ValueError, OSError, AttributeError): + num = 0 + + if num >= 1: + return num + else: + raise NotImplementedError('cannot determine number of cpus') + + +def thread_count(command_line_arg, default=1): + try: + num = int(command_line_arg) + except: + num = default + if num < 1: + stop_err("Threads argument %r is not a positive integer" % command_line_arg) + #Cap this with the pysical limit of the machine, + try: + num = min(num, cpu_count()) + except NotImplementedError: + pass + #For debugging, + #hostname = os.environ.get("HOSTNAME", "this machine") + #sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) + return num + + def fasta_iterator(filename, max_len=None, truncate=None): """Simple FASTA parser yielding tuples of (title, sequence) strings.""" handle = open(filename) @@ -109,6 +159,11 @@ pending = jobs[:] running = [] results = {} + if threads == 1: + #Special case this for speed, don't need the waits + for cmd in jobs: + results[cmd] = subprocess.call(cmd, shell=True) + return results while pending or running: #See if any have finished for (cmd, process) in running:
--- a/tools/protein_analysis/signalp3.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/signalp3.py Fri Jan 25 06:08:31 2013 -0500 @@ -56,7 +56,8 @@ import sys import os import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator +from seq_analysis_utils import stop_err, split_fasta, fasta_iterator +from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 MAX_LEN = 6000 #Found by trial and error @@ -78,15 +79,8 @@ if truncate < 0: stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) -try: - num_threads = int(sys.argv[3]) -except: - num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) - +num_threads = thread_count(sys.argv[3], default=4) fasta_file = sys.argv[4] - tabular_file = sys.argv[5] if len(sys.argv) == 8:
--- a/tools/protein_analysis/signalp3.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/signalp3.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,4 +1,4 @@ -<tool id="signalp3" name="SignalP 3.0" version="0.0.9"> +<tool id="signalp3" name="SignalP 3.0" version="0.0.10"> <description>Find signal peptides in protein sequences</description> <!-- If job splitting is enabled, break up the query file into parts --> <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> @@ -9,6 +9,11 @@ ##which (on SGE at least) will set the $NSLOTS environment variable. ##If the environment variable isn't set, get "", and defaults to one. </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <param name="organism" type="select" display="radio" label="Organism">
--- a/tools/protein_analysis/tmhmm2.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/tmhmm2.py Fri Jan 25 06:08:31 2013 -0500 @@ -43,18 +43,14 @@ import sys import os import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") -try: - num_threads = int(sys.argv[1]) -except: - num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) + +num_threads = thread_count(sys.argv[1], default=4) fasta_file = sys.argv[2] tabular_file = sys.argv[3]
--- a/tools/protein_analysis/tmhmm2.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/tmhmm2.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,4 +1,4 @@ -<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.8"> +<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.9"> <description>Find transmembrane domains in protein sequences</description> <!-- If job splitting is enabled, break up the query file into parts --> <!-- Using 2000 chunks meaning 4 threads doing 500 each is ideal --> @@ -9,6 +9,11 @@ ##which (on SGE at least) will set the $NSLOTS environment variable. ##If the environment variable isn't set, get "", and defaults to one. </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <!--
--- a/tools/protein_analysis/wolf_psort.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/wolf_psort.py Fri Jan 25 06:08:31 2013 -0500 @@ -35,13 +35,13 @@ """ import sys import os -from seq_analysis_utils import stop_err, split_fasta, run_jobs +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 exe = "runWolfPsortSummary" """ -Note: I had trouble getting runWolfPsortSummary on the path (via a link, other +Note: I had trouble getting runWolfPsortSummary on the path (via a link), other than by including all of /opt/WoLFPSORT_package_v0.2/bin , so used a wrapper python script called runWolfPsortSummary as follows: @@ -65,15 +65,8 @@ if organism not in ["animal", "plant", "fungi"]: stop_err("Organism argument %s is not one of animal, plant, fungi" % organism) -try: - num_threads = int(sys.argv[2]) -except: - num_threads = 0 -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[2]) - +num_threads = thread_count(sys.argv[2], default=4) fasta_file = sys.argv[3] - tabular_file = sys.argv[4] def clean_tabular(raw_handle, out_handle):
--- a/tools/protein_analysis/wolf_psort.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/wolf_psort.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,9 +1,14 @@ -<tool id="wolf_psort" name="WoLF PSORT" version="0.0.1"> +<tool id="wolf_psort" name="WoLF PSORT" version="0.0.2"> <description>Eukaryote protein subcellular localization prediction</description> <command interpreter="python"> wolf_psort.py $organism 8 $fasta_file $tabular_file ##I want the number of threads to be a Galaxy config option... </command> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + </stdio> <inputs> <param name="fasta_file" type="data" format="fasta" label="FASTA file of protein sequences"/> <param name="organism" type="select" display="radio" label="Organism">