Mercurial > repos > peterjc > tmhmm_and_signalp

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/empty.fasta	Tue Jun 07 18:04:05 2011 -0400
@@ -0,0 +1,2 @@
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/empty_signalp3.tabular	Tue Jun 07 18:04:05 2011 -0400
@@ -0,0 +1,1 @@
+#ID	NN_Cmax_score	NN_Cmax_pos	NN_Cmax_pred	NN_Ymax_score	NN_Ymax_pos	NN_Ymax_pred	NN_Smax_score	NN_Smax_pos	NN_Smax_pred	NN_Smean_score	NN_Smean_pred	NN_D_score	NN_D_pred	HMM_type	HMM_Cmax_score	HMM_Cmax_pos	HMM_Cmax_pred	HMM_Sprob_score	HMM_Sprob_pred
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/empty_tmhmm2.tabular	Tue Jun 07 18:04:05 2011 -0400
@@ -0,0 +1,1 @@
+#ID	len	ExpAA	First60	PredHel	Topology
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.signalp3.tabular	Tue Jun 07 18:04:05 2011 -0400
@@ -0,0 +1,5 @@
+#ID	NN_Cmax_score	NN_Cmax_pos	NN_Cmax_pred	NN_Ymax_score	NN_Ymax_pos	NN_Ymax_pred	NN_Smax_score	NN_Smax_pos	NN_Smax_pred	NN_Smean_score	NN_Smean_pred	NN_D_score	NN_D_pred	HMM_type	HMM_Cmax_score	HMM_Cmax_pos	HMM_Cmax_pred	HMM_Sprob_score	HMM_Sprob_pred
+sp|Q9BS26|ERP44_HUMAN	0.565	30	Y	0.686	30	Y	0.986	12	Y	0.818	Y	0.752	Y	S	0.945	30	Y	0.966	Y
+sp|Q9NSY1|BMP2K_HUMAN	0.153	869	N	0.050	270	N	0.229	12	N	0.008	N	0.029	N	Q	0.000	0	N	0.000	N
+sp|P06213|INSR_HUMAN	0.396	28	Y	0.561	28	Y	0.993	19	Y	0.902	Y	0.731	Y	Q	0.205	28	N	0.341	N
+sp|P08100|OPSD_HUMAN	0.211	52	N	0.344	52	Y	0.945	50	Y	0.245	N	0.295	N	Q	0.000	52	N	0.000	N
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/four_human_proteins.tmhmm2.tabular	Tue Jun 07 18:04:05 2011 -0400
@@ -0,0 +1,5 @@
+#ID	len	ExpAA	First60	PredHel	Topology
+sp|Q9BS26|ERP44_HUMAN	406	0.23	0.23	0	o
+sp|Q9NSY1|BMP2K_HUMAN	1161	0.35	0.26	0	o
+sp|P06213|INSR_HUMAN	1382	50.22	20.76	2	i9-31o957-979i
+sp|P08100|OPSD_HUMAN	348	157.99	21.69	7	o39-61i74-96o111-133i153-175o202-224i254-276o286-308i
--- a/tools/protein_analysis/README	Tue Jun 07 18:03:34 2011 -0400
+++ b/tools/protein_analysis/README	Tue Jun 07 18:04:05 2011 -0400
@@ -43,8 +43,11 @@
    subfolder test-data:

 four_human_proteins.fasta
-four_human_proteins.signalp3.tsv
-four_human_proteins.tmhmm2.tsv
+four_human_proteins.signalp3.tabular
+four_human_proteins.tmhmm2.tabular
+empty.fasta
+empty_tmhmm2.tabular
+empty_signalp3.tabular

 7. Run the Galaxy functional tests for these new wrappers with:

@@ -57,3 +60,42 @@
 ./run_functional_tests.sh -sid Protein_sequence_analysis-protein_analysis

 8. Restart Galaxy and check the new tools are shown and work.
+
+
+History
+=======
+
+v0.0.1 - Initial release
+v0.0.2 - Corrected some typos in the help text
+       - Renamed test output file to use Galaxy convention of *.tabular
+v0.0.3 - Check for tmhmm2 silent failures (no output)
+       - Additional unit tests
+
+Developers
+==========
+
+These wrappers are currently being developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/seq_analysis
+
+For making the "Galaxy Tool Shed" http://community.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder:
+
+tar -czf tmhmm_and_signalp.tar.gz tools/protein_analysis/LICENSE tools/protein_analysis/README tools/protein_analysis/suite_config.xml tools/protein_analysis/seq_analysis_utils.py tools/protein_analysis/signalp3.xml tools/protein_analysis/signalp3.py tools/protein_analysis/tmhmm2.xml tools/protein_analysis/tmhmm2.py test-data/four_human_proteins.* test-data/empty.fasta test-data/empty_tmhmm2.tabular test-data/empty_signalp3.tabular
+
+Check this worked:
+
+$ tar -tzf tmhmm_and_signalp.tar.gz
+tools/protein_analysis/LICENSE
+tools/protein_analysis/README
+tools/protein_analysis/suite_config.xml
+tools/protein_analysis/seq_analysis_utils.py
+tools/protein_analysis/signalp3.xml
+tools/protein_analysis/signalp3.py
+tools/protein_analysis/tmhmm2.xml
+tools/protein_analysis/tmhmm2.py
+test-data/four_human_proteins.fasta
+test-data/four_human_proteins.signalp3.tabular
+test-data/four_human_proteins.tmhmm2.tabular
+test-data/empty.fasta
+test-data/empty_tmhmm2.tabular
+test-data/empty_signalp3.tabular
--- a/tools/protein_analysis/signalp3.xml	Tue Jun 07 18:03:34 2011 -0400
+++ b/tools/protein_analysis/signalp3.xml	Tue Jun 07 18:04:05 2011 -0400
@@ -1,4 +1,4 @@
-<tool id="signalp3" name="SignalP 3.0" version="0.0.1">
+<tool id="signalp3" name="SignalP 3.0" version="0.0.3">
     <description>Find signal peptides in protein sequences</description>
     <command interpreter="python">
       signalp3.py $organism $truncate 8 $fasta_file $tabular_file
@@ -26,14 +26,32 @@
             <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta"/>
             <param name="organism" value="euk"/>
             <param name="truncate" value="0"/>
-            <output name="tabular_file" file="four_human_proteins.signalp3.tsv" ftype="tabular"/>
+            <output name="tabular_file" file="four_human_proteins.signalp3.tabular" ftype="tabular"/>
+        </test>
+        <test>
+            <param name="fasta_file" value="empty.fasta" ftype="fasta"/>
+            <param name="organism" value="euk"/>
+            <param name="truncate" value="60"/>
+            <output name="tabular_file" file="empty_signalp3.tabular" ftype="tabular"/>
+        </test>
+        <test>
+            <param name="fasta_file" value="empty.fasta" ftype="fasta"/>
+            <param name="organism" value="gram+"/>
+            <param name="truncate" value="80"/>
+            <output name="tabular_file" file="empty_signalp3.tabular" ftype="tabular"/>
+        </test>
+        <test>
+            <param name="fasta_file" value="empty.fasta" ftype="fasta"/>
+            <param name="organism" value="gram-"/>
+            <param name="truncate" value="0"/>
+            <output name="tabular_file" file="empty_signalp3.tabular" ftype="tabular"/>
         </test>
     </tests>
     <help>

 **What it does**

-This calls the SignalP v3.0 tool for prediction of signal peptides, which uses both a neural network (NN) and Hidden Markmov Model (HMM) to produce two sets of scores.
+This calls the SignalP v3.0 tool for prediction of signal peptides, which uses both a Neural Network (NN) and Hidden Markov Model (HMM) to produce two sets of scores.

 The input is a FASTA file of protein sequences, and the output is tabular with twenty columns (one row per protein):

@@ -57,7 +75,7 @@

 The S-mean is the average of the S-score, ranging from the N-terminal amino acid to the amino acid assigned with the highest Y-max score, thus the S-mean score is calculated for the length of the predicted signal peptide. The S-mean score was in SignalP version 2.0 used as the criteria for discrimination of secretory and non-secretory proteins.

-The D-score is introduced in SignalP version 3.0 and is a simple average of the S-mean and Y-max score. The score shows superior discrimination performance of secretory and non-secretory proteins to that of the S-mean score which was used in SignalP version 1 and 2.
+The D-score was introduced in SignalP version 3.0 and is a simple average of the S-mean and Y-max score. The score shows superior discrimination performance of secretory and non-secretory proteins to that of the S-mean score which was used in SignalP version 1 and 2.

 For non-secretory proteins all the scores represented in the SignalP3-NN output should ideally be very low.
--- a/tools/protein_analysis/suite_config.xml	Tue Jun 07 18:03:34 2011 -0400
+++ b/tools/protein_analysis/suite_config.xml	Tue Jun 07 18:04:05 2011 -0400
@@ -1,9 +1,9 @@
-    <suite id="tmhmm_and_signalp" name="TMHMM and SignalP" version="0.0.1">
+    <suite id="tmhmm_and_signalp" name="TMHMM and SignalP" version="0.0.3">
         <description>Wrappers for TMHMM and SignalP</description>
-        <tool id="tmhmm2" name="TMHMM 2.0" version="0.0.1">
+        <tool id="tmhmm2" name="TMHMM 2.0" version="0.0.3">
             <description>Find transmembrane domains in protein sequences</description>
         </tool>
-        <tool id="signalp3" name="SignalP 3.0" version="0.0.1">
+        <tool id="signalp3" name="SignalP 3.0" version="0.0.3">
             <description>Find signal peptides in protein sequences</description>
         </tool>
     </suite>
--- a/tools/protein_analysis/tmhmm2.py	Tue Jun 07 18:03:34 2011 -0400
+++ b/tools/protein_analysis/tmhmm2.py	Tue Jun 07 18:04:05 2011 -0400
@@ -29,6 +29,10 @@
 into chunks and running multiple copies of TMHMM in parallel. I would normally
 use Python's multiprocessing library in this situation but it requires at
 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
+
+Also tmhmm2 can fail without returning an error code, for example if run on a
+64 bit machine with only the 32 bit binaries installed. This script will spot
+when there is no output from tmhmm2, and raise an error.
 """
 import sys
 import os
@@ -48,7 +52,8 @@
 tabular_file = sys.argv[3]

 def clean_tabular(raw_handle, out_handle):
-    """Clean up tabular TMHMM output."""
+    """Clean up tabular TMHMM output, returns output line count."""
+    count = 0
     for line in raw_handle:
         if not line:
             continue
@@ -68,9 +73,13 @@
         predhel = predhel[8:]
         assert topology.startswith("Topology="), line
         topology = topology[9:]
-	out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \
+        out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \
                    % (identifier, length, expAA, first60, predhel, topology))
+        count += 1
+    return count

+#Note that if the input FASTA file contains no sequences,
+#split_fasta returns an empty list (i.e. zero temp files).
 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK)
 temp_files = [f+".out" for f in fasta_files]
 jobs = ["tmhmm %s > %s" % (fasta, temp)
@@ -103,8 +112,12 @@
 out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n")
 for temp in temp_files:
     data_handle = open(temp)
-    clean_tabular(data_handle, out_handle)
+    count = clean_tabular(data_handle, out_handle)
     data_handle.close()
+    if not count:
+        clean_up(fasta_files)
+        clean_up(temp_files)
+        stop_err("No output from tmhmm2")
 out_handle.close()

 clean_up(fasta_files)
--- a/tools/protein_analysis/tmhmm2.xml	Tue Jun 07 18:03:34 2011 -0400
+++ b/tools/protein_analysis/tmhmm2.xml	Tue Jun 07 18:04:05 2011 -0400
@@ -1,4 +1,4 @@
-<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.1">
+<tool id="tmhmm2" name="TMHMM 2.0" version="0.0.3">
     <description>Find transmembrane domains in protein sequences</description>
     <command interpreter="python">
       tmhmm2.py 8 $fasta_file $tabular_file
@@ -22,7 +22,11 @@
     <tests>
         <test>
             <param name="fasta_file" value="four_human_proteins.fasta" ftype="fasta"/>
-            <output name="tabular_file" file="four_human_proteins.tmhmm2.tsv" ftype="tabular"/>
+            <output name="tabular_file" file="four_human_proteins.tmhmm2.tabular" ftype="tabular"/>
+        </test>
+        <test>
+            <param name="fasta_file" value="empty.fasta" ftype="fasta"/>
+            <output name="tabular_file" file="empty_tmhmm2.tabular" ftype="tabular"/>
         </test>
     </tests>
     <help>
@@ -40,9 +44,9 @@
  5. Number of transmembrane helices predicted by N-best.
  6. Topology predicted by N-best (encoded as a string using o for outside and i for inside)

-Predicted TM segments in the n-terminal region sometime turn out to be signal peptides.
+Predicted TM segments in the n-terminal region sometimes turn out to be signal peptides.

-One of the most common mistakes by the program is to reverse the direction of proteins with one TM segment.
+One of the most common mistakes by the program is to reverse the direction of proteins with one TM segment (i.e. mixing up which end of the protein is outside and inside the membrane).

 Do not use the program to predict whether a non-membrane protein is cytoplasmic or not.