diff tools/protein_analysis/wolf_psort.py @ 5:0f1c61998b22

Migrated tool version 0.0.8 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:06:27 -0400
parents
children a290c6d4e658
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/wolf_psort.py	Tue Jun 07 18:06:27 2011 -0400
@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+"""Wrapper for WoLF PSORT v0.2 for use in Galaxy.
+
+This script takes exactly four command line arguments:
+ * the organism type (animal, plant or fungi)
+ * number of threads to use (integer)
+ * an input protein FASTA filename
+ * output tabular filename.
+
+It then calls the standalone WoLF PSORT v0.2 program runWolfPsortSummary
+(not the webservice), and converts the output from something like this:
+
+# k used for kNN is: 27
+gi|301087619|ref|XP_002894699.1| extr 12, mito 4, E.R. 3, golg 3, mito_nucl 3
+gi|301087623|ref|XP_002894700.1| extr 21, mito 2, cyto 2, cyto_mito 2
+
+In order to make it easier to use in Galaxy, this wrapper script reformats
+this to use tab separators, with one line per compartment prediction:
+
+#ID	Compartment	Score	Rank
+gi|301087619|ref|XP_002894699.1|	extr	12	1
+gi|301087619|ref|XP_002894699.1|	mito	4	2
+gi|301087619|ref|XP_002894699.1|	E.R.	3	3
+gi|301087619|ref|XP_002894699.1|	golg	3	4
+gi|301087619|ref|XP_002894699.1|	mito_nucl	3	5
+gi|301087623|ref|XP_002894700.1|	extr	21	1
+gi|301087623|ref|XP_002894700.1|	mito	2	2
+gi|301087623|ref|XP_002894700.1|	cyto	2	3
+gi|301087623|ref|XP_002894700.1|	cyto_mito	2	4
+
+Additionally, in order to take full advantage of multiple cores, the input FASTA
+file is subdivided so that multiple copies of WoLF PSORT run in parallel. I would
+normally use Python's multiprocessing library in this situation but it requires
+at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
+"""
+import sys
+import os
+from seq_analysis_utils import stop_err, split_fasta, run_jobs
+
+FASTA_CHUNK = 500  # passed as n= to split_fasta; presumably the number of sequences per chunk file -- confirm
+exe = "runWolfPsortSummary"  # command name placed at the start of every job command string
+
+"""
+Note: I had trouble getting runWolfPsortSummary on the path, so used a wrapper
+python script called runWolfPsortSummary as follows:
+
+#!/usr/bin/env python
+#Wrapper script to call WoLF PSORT from its own directory.
+import os
+import sys
+import subprocess
+saved_dir = os.path.abspath(os.curdir)
+os.chdir("/opt/WoLFPSORT_package_v0.2/bin")
+args = ["./runWolfPsortSummary"] + sys.argv[1:]
+return_code = subprocess.call(args)
+os.chdir(saved_dir)
+sys.exit(return_code)
+"""
+
+if len(sys.argv) != 5:
+    stop_err("Require four arguments, organism, threads, input protein FASTA file & output tabular file")
+# The organism name is later interpolated into each WoLF PSORT command, so only known values pass.
+organism = sys.argv[1]
+if organism not in ["animal", "plant", "fungi"]:
+    stop_err("Organism argument %s is not one of animal, plant, fungi" % organism)
+# A non-integer thread count is mapped to 0 so it shares the error below, which echoes sys.argv[2].
+try:
+    num_threads = int(sys.argv[2])
+except ValueError:
+    num_threads = 0
+if num_threads < 1:
+    stop_err("Threads argument %s is not a positive integer" % sys.argv[2])
+# The remaining two arguments are the input FASTA path and the output tabular path.
+fasta_file = sys.argv[3]
+
+tabular_file = sys.argv[4]
+
+def clean_tabular(raw_handle, out_handle):
+    """Write one ID/compartment/score/rank row per prediction read from raw_handle."""
+    for line in raw_handle:
+        # Lines read from a handle keep their newline, so test the stripped text for blanks.
+        if not line.strip() or line.startswith("#"):
+            continue
+        name, data = line.rstrip("\r\n").split(None, 1)
+        for rank, comp_data in enumerate(data.split(",")):
+            comp, score = comp_data.split()
+            out_handle.write("%s\t%s\t%s\t%i\n" % (name, comp, score, rank + 1))
+
+# One job per chunk: each chunk file is fed to the program on stdin, with stdout sent to "<chunk>.out".
+fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK)
+temp_files = [chunk + ".out" for chunk in fasta_files]
+assert len(temp_files) == len(fasta_files)
+jobs = ["%s %s < %s > %s" % (exe, organism, src, dst) for src, dst in zip(fasta_files, temp_files)]
+assert len(jobs) == len(temp_files) == len(fasta_files)
+
+def clean_up(file_list):
+    """Delete each file_list entry that is a regular file when its turn comes (lazy check)."""
+    for path in (entry for entry in file_list if os.path.isfile(entry)):
+        os.remove(path)
+
+if len(jobs) > 1 and num_threads > 1:
+    # A small "info" message for Galaxy to show the user (only when work is actually spread out).
+    print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
+results = run_jobs(jobs, num_threads)  # indexed below by command string to obtain each job's error level
+assert len(fasta_files) == len(temp_files) == len(jobs)
+for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
+    error_level = results[cmd]
+    try:
+        output = open(temp).readline()  # only the first line of the job's output is inspected
+    except IOError:
+        output = ""  # output file could not be read: the decision rests on error_level alone
+    if error_level or output.lower().startswith("error running"):
+        clean_up(fasta_files)  # remove the intermediate files before reporting the failure
+        clean_up(temp_files)
+        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
+                 error_level)  # NOTE(review): error_level can be 0 here if only the text matched -- confirm stop_err's handling
+del results
+
+# Merge the per-chunk results, in chunk order, under a single header row.
+merged = open(tabular_file, "w")
+merged.write("#ID\tCompartment\tScore\tRank\n")
+for chunk_output in temp_files:
+    raw_results = open(chunk_output)
+    clean_tabular(raw_results, merged)
+    raw_results.close()
+merged.close()
+for leftovers in (fasta_files, temp_files):  # chunk inputs first, then their raw outputs
+    clean_up(leftovers)