Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/tmhmm2.py @ 1:3ff1dcbb9440
Migrated tool version 0.0.3 from old tool shed archive to new tool shed repository
| author | peterjc |
|---|---|
| date | Tue, 07 Jun 2011 18:04:05 -0400 |
| parents | bca9bc7fdaef |
| children | 6901298ac16c |
comparison
Legend: equal | deleted | inserted | replaced
| 0:bca9bc7fdaef | 1:3ff1dcbb9440 |
|---|---|
| 27 The second major potential feature is taking advantage of multiple cores | 27 The second major potential feature is taking advantage of multiple cores |
| 28 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file | 28 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file |
| 29 into chunks and running multiple copies of TMHMM in parallel. I would normally | 29 into chunks and running multiple copies of TMHMM in parallel. I would normally |
| 30 use Python's multiprocessing library in this situation but it requires at | 30 use Python's multiprocessing library in this situation but it requires at |
| 31 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. | 31 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. |
| 32 | |
| 33 Also tmhmm2 can fail without returning an error code, for example if run on a | |
| 34 64 bit machine with only the 32 bit binaries installed. This script will spot | |
| 35 when there is no output from tmhmm2, and raise an error. | |
| 32 """ | 36 """ |
| 33 import sys | 37 import sys |
| 34 import os | 38 import os |
| 35 from seq_analysis_utils import stop_err, split_fasta, run_jobs | 39 from seq_analysis_utils import stop_err, split_fasta, run_jobs |
| 36 | 40 |
| 46 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) | 50 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) |
| 47 fasta_file = sys.argv[2] | 51 fasta_file = sys.argv[2] |
| 48 tabular_file = sys.argv[3] | 52 tabular_file = sys.argv[3] |
| 49 | 53 |
| 50 def clean_tabular(raw_handle, out_handle): | 54 def clean_tabular(raw_handle, out_handle): |
| 51 """Clean up tabular TMHMM output.""" | 55 """Clean up tabular TMHMM output, returns output line count.""" |
| 56 count = 0 | |
| 52 for line in raw_handle: | 57 for line in raw_handle: |
| 53 if not line: | 58 if not line: |
| 54 continue | 59 continue |
| 55 parts = line.rstrip("\r\n").split("\t") | 60 parts = line.rstrip("\r\n").split("\t") |
| 56 try: | 61 try: |
| 66 first60 = first60[8:] | 71 first60 = first60[8:] |
| 67 assert predhel.startswith("PredHel="), line | 72 assert predhel.startswith("PredHel="), line |
| 68 predhel = predhel[8:] | 73 predhel = predhel[8:] |
| 69 assert topology.startswith("Topology="), line | 74 assert topology.startswith("Topology="), line |
| 70 topology = topology[9:] | 75 topology = topology[9:] |
| 71 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ | 76 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \ |
| 72 % (identifier, length, expAA, first60, predhel, topology)) | 77 % (identifier, length, expAA, first60, predhel, topology)) |
| 78 count += 1 | |
| 79 return count | |
| 73 | 80 |
| 81 #Note that if the input FASTA file contains no sequences, | |
| 82 #split_fasta returns an empty list (i.e. zero temp files). | |
| 74 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) | 83 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) |
| 75 temp_files = [f+".out" for f in fasta_files] | 84 temp_files = [f+".out" for f in fasta_files] |
| 76 jobs = ["tmhmm %s > %s" % (fasta, temp) | 85 jobs = ["tmhmm %s > %s" % (fasta, temp) |
| 77 for fasta, temp in zip(fasta_files, temp_files)] | 86 for fasta, temp in zip(fasta_files, temp_files)] |
| 78 | 87 |
| 101 | 110 |
| 102 out_handle = open(tabular_file, "w") | 111 out_handle = open(tabular_file, "w") |
| 103 out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n") | 112 out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n") |
| 104 for temp in temp_files: | 113 for temp in temp_files: |
| 105 data_handle = open(temp) | 114 data_handle = open(temp) |
| 106 clean_tabular(data_handle, out_handle) | 115 count = clean_tabular(data_handle, out_handle) |
| 107 data_handle.close() | 116 data_handle.close() |
| 117 if not count: | |
| 118 clean_up(fasta_files) | |
| 119 clean_up(temp_files) | |
| 120 stop_err("No output from tmhmm2") | |
| 108 out_handle.close() | 121 out_handle.close() |
| 109 | 122 |
| 110 clean_up(fasta_files) | 123 clean_up(fasta_files) |
| 111 clean_up(temp_files) | 124 clean_up(temp_files) |
