# HG changeset patch # User peterjc # Date 1359112111 18000 # Node ID e52220a9ddada323daa22eb866198f62952bec02 # Parent 976a5f2833cd7c5d3dc840617124fbc1cccd5d2b Uploaded v0.1.2 Use the new settings in the XML wrappers to catch errors. Obeys SGE style XNSLOTS environment variable for thread count (otherwise default to 4). diff -r 976a5f2833cd -r e52220a9ddad test-data/four_human_proteins.fasta.orig --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/four_human_proteins.fasta.orig Fri Jan 25 06:08:31 2013 -0500 @@ -0,0 +1,61 @@ +>sp|Q9BS26|ERP44_HUMAN Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1 +MHPAVFLSLPDLRCSLLLLVTWVFTPVTTEITSLDTENIDEILNNADVALVNFYADWCRF +SQMLHPIFEEASDVIKEEFPNENQVVFARVDCDQHSDIAQRYRISKYPTLKLFRNGMMMK +REYRGQRSVKALADYIRQQKSDPIQEIRDLAEITTLDRSKRNIIGYFEQKDSDNYRVFER +VANILHDDCAFLSAFGDVSKPERYSGDNIIYKPPGHSAPDMVYLGAMTNFDVTYNWIQDK +CVPLVREITFENGEELTEEGLPFLILFHMKEDTESLEIFQNEVARQLISEKGTINFLHAD +CDKFRHPLLHIQKTPADCPVIAIDSFRHMYVFGDFKDVLIPGKLKQFVFDLHSGKLHREF +HHGPDPTDTAPGEQAQDVASSPPESSFQKLAPSEYRYTLLRDRDEL +>sp|Q9NSY1|BMP2K_HUMAN BMP-2-inducible protein kinase OS=Homo sapiens GN=BMP2K PE=1 SV=2 +MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAVGRHQVTLEESLAEG +GFSTVFLVRTHGGIRCALKRMYVNNMPDLNVCKREITIMKELSGHKNIVGYLDCAVNSIS +DNVWEVLILMEYCRAGQVVNQMNKKLQTGFTEPEVLQIFCDTCEAVARLHQCKTPIIHRD +LKVENILLNDGGNYVLCDFGSATNKFLNPQKDGVNVVEEEIKKYTTLSYRAPEMINLYGG +KPITTKADIWALGCLLYKLCFFTLPFGESQVAICDGNFTIPDNSRYSRNIHCLIRFMLEP +DPEHRPDIFQVSYFAFKFAKKDCPVSNINNSSIPSALPEPMTASEAAARKSQIKARITDT +IGPTETSIAPRQRPKANSATTATPSVLTIQSSATPVKVLAPGEFGNHRPKGALRPGNGPE +ILLGQGPPQQPPQQHRVLQQLQQGDWRLQQLHLQHRHPHQQQQQQQQQQQQQQQQQQQQQ +QQQQQQHHHHHHHHLLQDAYMQQYQHATQQQQMLQQQFLMHSVYQPQPSASQYPTMMPQY +QQAFFQQQMLAQHQPSQQQASPEYLTSPQEFSPALVSYTSSLPAQVGTIMDSSYSANRSV +ADKEAIANFTNQKNISNPPDMSGWNPFGEDNFSKLTEEELLDREFDLLRSNRLEERASSD +KNVDSLSAPHNHPPEDPFGSVPFISHSGSPEKKAEHSSINQENGTANPIKNGKTSPASKD +QRTGKKTSVQGQVQKGNDESESDFESDPPSPKSSEEEEQDDEEVLQGEQGDFNDDDTEPE 
+NLGHRPLLMDSEDEEEEEKHSSDSDYEQAKAKYSDMSSVYRDRSGSGPTQDLNTILLTSA +QLSSDVAVETPKQEFDVFGAVPFFAVRAQQPQQEKNEKNLPQHRFPAAGLEQEEFDVFTK +APFSKKVNVQECHAVGPEAHTIPGYPKSVDVFGSTPFQPFLTSTSKSESNEDLFGLVPFD +EITGSQQQKVKQRSLQKLSSRQRRTKQDMSKSNGKRHHGTPTSTKKTLKPTYRTPERARR +HKKVGRRDSQSSNEFLTISDSKENISVALTDGKDRGNVLQPEESLLDPFGAKPFHSPDLS +WHPPHQGLSDIRADHNTVLPGRPRQNSLHGSFHSADVLKMDDFGAVPFTELVVQSITPHQ +SQQSQPVELDPFGAAPFPSKQ +>sp|P06213|INSR_HUMAN Insulin receptor OS=Homo sapiens GN=INSR PE=1 SV=4 +MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTRLHELENCSVIEGHL +QILLMFKTRPEDFRDLSFPKLIMITDYLLLFRVYGLESLKDLFPNLTVIRGSRLFFNYAL +VIFEMVHLKELGLYNLMNITRGSVRIEKNNELCYLATIDWSRILDSVEDNYIVLNKDDNE +ECGDICPGTAKGKTNCPATVINGQFVERCWTHSHCQKVCPTICKSHGCTAEGLCCHSECL +GNCSQPDDPTKCVACRNFYLDGRCVETCPPPYYHFQDWRCVNFSFCQDLHHKCKNSRRQG +CHQYVIHNNKCIPECPSGYTMNSSNLLCTPCLGPCPKVCHLLEGEKTIDSVTSAQELRGC +TVINGSLIINIRGGNNLAAELEANLGLIEEISGYLKIRRSYALVSLSFFRKLRLIRGETL +EIGNYSFYALDNQNLRQLWDWSKHNLTITQGKLFFHYNPKLCLSEIHKMEEVSGTKGRQE +RNDIALKTNGDQASCENELLKFSYIRTSFDKILLRWEPYWPPDFRDLLGFMLFYKEAPYQ +NVTEFDGQDACGSNSWTVVDIDPPLRSNDPKSQNHPGWLMRGLKPWTQYAIFVKTLVTFS +DERRTYGAKSDIIYVQTDATNPSVPLDPISVSNSSSQIILKWKPPSDPNGNITHYLVFWE +RQAEDSELFELDYCLKGLKLPSRTWSPPFESEDSQKHNQSEYEDSAGECCSCPKTDSQIL +KELEESSFRKTFEDYLHNVVFVPRKTSSGTGAEDPRPSRKRRSLGDVGNVTVAVPTVAAF +PNTSSTSVPTSPEEHRPFEKVVNKESLVISGLRHFTGYRIELQACNQDTPEERCSVAAYV +SARTMPEAKADDIVGPVTHEIFENNVVHLMWQEPKEPNGLIVLYEVSYRRYGDEELHLCV +SRKHFALERGCRLRGLSPGNYSVRIRATSLAGNGSWTEPTYFYVTDYLDVPSNIAKIIIG +PLIFVFLFSVVIGSIYLFLRKRQPDGPLGPLYASSNPEYLSASDVFPCSVYVPDEWEVSR +EKITLLRELGQGSFGMVYEGNARDIIKGEAETRVAVKTVNESASLRERIEFLNEASVMKG +FTCHHVVRLLGVVSKGQPTLVVMELMAHGDLKSYLRSLRPEAENNPGRPPPTLQEMIQMA +AEIADGMAYLNAKKFVHRDLAARNCMVAHDFTVKIGDFGMTRDIYETDYYRKGGKGLLPV +RWMAPESLKDGVFTTSSDMWSFGVVLWEITSLAEQPYQGLSNEQVLKFVMDGGYLDQPDN +CPERVTDLMRMCWQFNPKMRPTFLEIVNLLKDDLHPSFPEVSFFHSEENKAPESEELEME +FEDMENVPLDRSSHCQREEAGGRDGGSSLGFKRSYEEHIPYTHMNGGKKNGRILTLPRSN +PS +>sp|P08100|OPSD_HUMAN Rhodopsin OS=Homo sapiens GN=RHO PE=1 SV=1 
+MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLY +VTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLG +GEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIP +EGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQES +ATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAI +YNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/README --- a/tools/protein_analysis/README Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/README Fri Jan 25 06:08:31 2013 -0500 @@ -127,6 +127,9 @@ v0.1.0 - Added Promoter 2.0 wrapper (similar to SignalP & TMHMM wrappers) - Support Galaxy's tag for SignalP, TMHMM & Promoter v0.1.1 - Fixed an error in the header of the tabular output from Promoter +v0.1.2 - Use the new settings in the XML wrappers to catch errors + - Use SGE style $NSLOTS for thread count (otherwise default to 4) + Developers ========== diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/promoter2.py --- a/tools/protein_analysis/promoter2.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/promoter2.py Fri Jan 25 06:08:31 2013 -0500 @@ -30,20 +30,15 @@ import os import commands import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " "Got %i arguments." % (len(sys.argv)-1)) -try: - num_threads = int(sys.argv[1]) -except: - num_threads = 1 #Default, e.g. 
used "$NSLOTS" and environment variable not defined -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) +num_threads = thread_count(sys.argv[1],default=4) fasta_file = os.path.abspath(sys.argv[2]) tabular_file = os.path.abspath(sys.argv[3]) diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/promoter2.xml --- a/tools/protein_analysis/promoter2.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/promoter2.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,4 +1,4 @@ - + Find eukaryotic PolII promoters in DNA sequences @@ -9,6 +9,11 @@ ##which (on SGE at least) will set the $NSLOTS environment variable. ##If the environment variable isn't set, get "", and defaults to one. + + + + + diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/rxlr_motifs.xml --- a/tools/protein_analysis/rxlr_motifs.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/rxlr_motifs.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,9 +1,14 @@ - + Find RXLR Effectors of Plant Pathogenic Oomycetes rxlr_motifs.py $fasta_file 8 $model $tabular_file ##I want the number of threads to be a Galaxy config option... 
+ + + + + diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/seq_analysis_utils.py --- a/tools/protein_analysis/seq_analysis_utils.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/seq_analysis_utils.py Fri Jan 25 06:08:31 2013 -0500 @@ -19,6 +19,56 @@ sys.stderr.write("%s\n" % msg) sys.exit(error_level) +try: + from multiprocessing import cpu_count +except ImportError: + #Must be under Python 2.5, this is copied from multiprocessing: + def cpu_count(): + """Returns the number of CPUs in the system.""" + if sys.platform == 'win32': + try: + num = int(os.environ['NUMBER_OF_PROCESSORS']) + except (ValueError, KeyError): + num = 0 + elif 'bsd' in sys.platform or sys.platform == 'darwin': + comm = '/sbin/sysctl -n hw.ncpu' + if sys.platform == 'darwin': + comm = '/usr' + comm + try: + with os.popen(comm) as p: + num = int(p.read()) + except ValueError: + num = 0 + else: + try: + num = os.sysconf('SC_NPROCESSORS_ONLN') + except (ValueError, OSError, AttributeError): + num = 0 + + if num >= 1: + return num + else: + raise NotImplementedError('cannot determine number of cpus') + + +def thread_count(command_line_arg, default=1): + try: + num = int(command_line_arg) + except: + num = default + if num < 1: + stop_err("Threads argument %r is not a positive integer" % command_line_arg) + #Cap this with the physical limit of the machine, + try: + num = min(num, cpu_count()) + except NotImplementedError: + pass + #For debugging, + #hostname = os.environ.get("HOSTNAME", "this machine") + #sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) + return num + + def fasta_iterator(filename, max_len=None, truncate=None): """Simple FASTA parser yielding tuples of (title, sequence) strings.""" handle = open(filename) @@ -109,6 +159,11 @@ pending = jobs[:] running = [] results = {} + if threads == 1: + #Special case this for speed, don't need the waits + for cmd in jobs: + results[cmd] = subprocess.call(cmd, shell=True) + return results while pending or 
running: #See if any have finished for (cmd, process) in running: diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/signalp3.py --- a/tools/protein_analysis/signalp3.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/signalp3.py Fri Jan 25 06:08:31 2013 -0500 @@ -56,7 +56,8 @@ import sys import os import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator +from seq_analysis_utils import stop_err, split_fasta, fasta_iterator +from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 MAX_LEN = 6000 #Found by trial and error @@ -78,15 +79,8 @@ if truncate < 0: stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) -try: - num_threads = int(sys.argv[3]) -except: - num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) - +num_threads = thread_count(sys.argv[3], default=4) fasta_file = sys.argv[4] - tabular_file = sys.argv[5] if len(sys.argv) == 8: diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/signalp3.xml --- a/tools/protein_analysis/signalp3.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/signalp3.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,4 +1,4 @@ - + Find signal peptides in protein sequences @@ -9,6 +9,11 @@ ##which (on SGE at least) will set the $NSLOTS environment variable. ##If the environment variable isn't set, get "", and defaults to one. 
+ + + + + diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/tmhmm2.py --- a/tools/protein_analysis/tmhmm2.py Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/tmhmm2.py Fri Jan 25 06:08:31 2013 -0500 @@ -43,18 +43,14 @@ import sys import os import tempfile -from seq_analysis_utils import stop_err, split_fasta, run_jobs +from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count FASTA_CHUNK = 500 if len(sys.argv) != 4: stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") -try: - num_threads = int(sys.argv[1]) -except: - num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined -if num_threads < 1: - stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) + +num_threads = thread_count(sys.argv[1], default=4) fasta_file = sys.argv[2] tabular_file = sys.argv[3] diff -r 976a5f2833cd -r e52220a9ddad tools/protein_analysis/tmhmm2.xml --- a/tools/protein_analysis/tmhmm2.xml Mon Jul 30 12:56:54 2012 -0400 +++ b/tools/protein_analysis/tmhmm2.xml Fri Jan 25 06:08:31 2013 -0500 @@ -1,4 +1,4 @@ - + Find transmembrane domains in protein sequences @@ -9,6 +9,11 @@ ##which (on SGE at least) will set the $NSLOTS environment variable. ##If the environment variable isn't set, get "", and defaults to one. + + + + + + + +