diff tools/protein_analysis/rxlr_motifs.py @ 18:eb6ac44d4b8e draft

Suite v0.2.8, record Promoter 2 verion + misc internal updates
author peterjc
date Tue, 01 Sep 2015 09:56:36 -0400
parents 7de64c8b258d
children f3ecd80850e2
line wrap: on
line diff
--- a/tools/protein_analysis/rxlr_motifs.py	Fri Nov 21 08:19:09 2014 -0500
+++ b/tools/protein_analysis/rxlr_motifs.py	Tue Sep 01 09:56:36 2015 -0400
@@ -40,10 +40,14 @@
 import sys
 import re
 import subprocess
-from seq_analysis_utils import stop_err, fasta_iterator
+from seq_analysis_utils import sys_exit, fasta_iterator
+
+if "-v" in sys.argv:
+    print("RXLR Motifs v0.0.10")
+    sys.exit(0)
 
 if len(sys.argv) != 5:
-   stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename")
+    sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename")
 
 fasta_file, threads, model, tabular_file = sys.argv[1:]
 hmm_output_file = tabular_file + ".hmm.tmp"
@@ -53,36 +57,36 @@
 hmmer_search = "hmmsearch2"
 
 if model == "Bhattacharjee2006":
-   signalp_trunc = 70
-   re_rxlr = re.compile("R.LR")
-   min_sp = 10
-   max_sp = 40
-   max_sp_rxlr = 100
-   min_rxlr_start = 1
-   #Allow signal peptide to be at most 40aa, and want RXLR to be
-   #within 100aa, therefore for the prescreen the max start is 140:
-   max_rxlr_start = max_sp + max_sp_rxlr
+    signalp_trunc = 70
+    re_rxlr = re.compile("R.LR")
+    min_sp = 10
+    max_sp = 40
+    max_sp_rxlr = 100
+    min_rxlr_start = 1
+    # Allow signal peptide to be at most 40aa, and want RXLR to be
+    # within 100aa, therefore for the prescreen the max start is 140:
+    max_rxlr_start = max_sp + max_sp_rxlr
 elif model == "Win2007":
-   signalp_trunc = 70
-   re_rxlr = re.compile("R.LR")
-   min_sp = 10
-   max_sp = 40
-   min_rxlr_start = 30
-   max_rxlr_start = 60
-   #No explicit limit on separation of signal peptide clevage
-   #and RXLR, but shortest signal peptide is 10, and furthest
-   #away RXLR is 60, so effectively limit is 50.
-   max_sp_rxlr = max_rxlr_start - min_sp + 1
+    signalp_trunc = 70
+    re_rxlr = re.compile("R.LR")
+    min_sp = 10
+    max_sp = 40
+    min_rxlr_start = 30
+    max_rxlr_start = 60
+    # No explicit limit on separation of signal peptide clevage
+    # and RXLR, but shortest signal peptide is 10, and furthest
+    # away RXLR is 60, so effectively limit is 50.
+    max_sp_rxlr = max_rxlr_start - min_sp + 1
 elif model == "Whisson2007":
-   signalp_trunc = 0 #zero for no truncation
-   re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]")
-   min_sp = 10
-   max_sp = 40
-   max_sp_rxlr = 100
-   min_rxlr_start = 1
-   max_rxlr_start = max_sp + max_sp_rxlr
+    signalp_trunc = 0  # zero for no truncation
+    re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]")
+    min_sp = 10
+    max_sp = 40
+    max_sp_rxlr = 100
+    min_rxlr_start = 1
+    max_rxlr_start = max_sp + max_sp_rxlr
 else:
-   stop_err("Did not recognise the model name %r\n"
+   sys_exit("Did not recognise the model name %r\n"
             "Use Bhattacharjee2006, Win2007, or Whisson2007" % model)
 
 
@@ -108,49 +112,49 @@
     hmm_file = os.path.join(os.path.split(sys.argv[0])[0],
                        "whisson_et_al_rxlr_eer_cropped.hmm")
     if not os.path.isfile(hmm_file):
-        stop_err("Missing HMM file for Whisson et al. (2007)")
+        sys_exit("Missing HMM file for Whisson et al. (2007)")
     if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
-        stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher)
+        sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search)
 
     hmm_hits = set()
     valid_ids = set()
     for title, seq in fasta_iterator(fasta_file):
         name = title.split(None,1)[0]
         if name in valid_ids:
-            stop_err("Duplicated identifier %r" % name)
+            sys_exit("Duplicated identifier %r" % name)
         else:
             valid_ids.add(name)
     if not valid_ids:
-        #Special case, don't need to run HMMER if there are no sequences
+        # Special case, don't need to run HMMER if there are no sequences
         pass
     else:
-        #I've left the code to handle HMMER 3 in situ, in case
-        #we revisit the choice to insist on HMMER 2.
+        # I've left the code to handle HMMER 3 in situ, in case
+        # we revisit the choice to insist on HMMER 2.
         hmmer3 = (3 == get_hmmer_version(hmmer_search))
-        #Using zero (or 5.6?) for bitscore threshold
+        # Using zero (or 5.6?) for bitscore threshold
         if hmmer3:
-            #The HMMER3 table output is easy to parse
-            #In HMMER3 can't use both -T and -E
+            # The HMMER3 table output is easy to parse
+            # In HMMER3 can't use both -T and -E
             cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \
                   % (hmmer_search, hmm_output_file, hmm_file, fasta_file)
         else:
-            #For HMMER2 we are stuck with parsing stdout
-            #Put 1e6 to effectively have no expectation threshold (otherwise
-            #HMMER defaults to 10 and the calculated e-value depends on the
-            #input FASTA file, and we can loose hits of interest).
+            # For HMMER2 we are stuck with parsing stdout
+            # Put 1e6 to effectively have no expectation threshold (otherwise
+            # HMMER defaults to 10 and the calculated e-value depends on the
+            # input FASTA file, and we can loose hits of interest).
             cmd = "%s -T 0 -E 1e6 %s %s > %s" \
                   % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
         return_code = os.system(cmd)
         if return_code:
-            stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code)
+            sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code)
 
         handle = open(hmm_output_file)
         for line in handle:
             if not line.strip():
-                #We expect blank lines in the HMMER2 stdout
+                # We expect blank lines in the HMMER2 stdout
                 continue
             elif line.startswith("#"):
-                #Header
+                # Header
                 continue
             else:
                 name = line.split(None,1)[0]
@@ -159,18 +163,18 @@
                 if name in valid_ids:
                     hmm_hits.add(name)
                 elif hmmer3:
-                    stop_err("Unexpected identifer %r in hmmsearch output" % name)
+                    sys_exit("Unexpected identifer %r in hmmsearch output" % name)
         handle.close()
-        #if hmmer3:
-        #    print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
-        #else:
-        #    print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids))  
-        #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids))
+        # if hmmer3:
+        #     print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
+        # else:
+        #     print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids))  
+        # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids))
         os.remove(hmm_output_file)
     del valid_ids
 
 
-#Prepare short list of candidates containing RXLR to pass to SignalP
+# Prepare short list of candidates containing RXLR to pass to SignalP
 assert min_rxlr_start > 0, "Min value one, since zero based counting"
 count = 0
 total = 0
@@ -180,27 +184,28 @@
     name = title.split(None,1)[0]
     match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
     if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
-        #This is a potential RXLR, depending on the SignalP results.
-        #Might as well truncate the sequence now, makes the temp file smaller
+        # This is a potential RXLR, depending on the SignalP results.
+        # Might as well truncate the sequence now, makes the temp file smaller
         if signalp_trunc:
             handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc]))
         else:
-            #Does it matter we don't line wrap?
+            # Does it matter we don't line wrap?
             handle.write(">%s\n%s\n" % (name, seq))
         count += 1
 handle.close()
-#print "Running SignalP on %i/%i potentials." % (count, total)
+# print "Running SignalP on %i/%i potentials." % (count, total)
 
 
-#Run SignalP (using our wrapper script to get multi-core support etc)
+# Run SignalP (using our wrapper script to get multi-core support etc)
 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py")
 if not os.path.isfile(signalp_script):
-    stop_err("Error - missing signalp3.py script")
+    sys_exit("Error - missing signalp3.py script")
 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file)
 return_code = os.system(cmd)
 if return_code:
-    stop_err("Error %i from SignalP:\n%s" % (return_code, cmd))
-#print "SignalP done"
+    sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd))
+# print "SignalP done"
+
 
 def parse_signalp(filename):
     """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length.
@@ -217,7 +222,7 @@
     handle.close()
 
 
-#Parse SignalP results and apply the strict RXLR criteria
+# Parse SignalP results and apply the strict RXLR criteria
 total = 0
 tally = dict()
 handle = open(tabular_file, "w")
@@ -230,26 +235,26 @@
     match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
     if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
         del match
-        #This was the criteria for calling SignalP,
+        # This was the criteria for calling SignalP,
         #so it will be in the SignalP results.
         sp_id, sp_hmm_score, sp_nn_len = signalp_results.next()
         assert name == sp_id, "%s vs %s" % (name, sp_id)
         if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
             match = re_rxlr.search(seq[sp_nn_len:].upper())
-            if match and match.start() + 1 <= max_sp_rxlr: #1-based counting
+            if match and match.start() + 1 <= max_sp_rxlr:  # 1-based counting
                 rxlr_start = sp_nn_len + match.start() + 1
                 if min_rxlr_start <= rxlr_start <= max_rxlr_start:
                     rxlr = "Y"
     if model == "Whisson2007":
-        #Combine the signalp with regular expression heuristic and the HMM
+        # Combine the signalp with regular expression heuristic and the HMM
         if name in hmm_hits and rxlr == "N":
-            rxlr = "hmm" #HMM only
+            rxlr = "hmm"  # HMM only
         elif rxlr == "N":
-            rxlr = "neither" #Don't use N (no)
+            rxlr = "neither"  # Don't use N (no)
         elif name not in hmm_hits and rxlr == "Y":
-            rxlr = "re" #Heuristic only
-        #Now have a four way classifier: Y, hmm, re, neither
-        #and count is the number of Y results (both HMM and heuristic)
+            rxlr = "re"  # Heuristic only
+        # Now have a four way classifier: Y, hmm, re, neither
+        # and count is the number of Y results (both HMM and heuristic)
     handle.write("%s\t%s\n" % (name, rxlr))
     try:
         tally[rxlr] += 1
@@ -258,17 +263,17 @@
 handle.close()
 assert sum(tally.values()) == total
 
-#Check the iterator is finished
+# Check the iterator is finished
 try:
     signalp_results.next()
     assert False, "Unexpected data in SignalP output"
 except StopIteration:
     pass
 
-#Cleanup
+# Cleanup
 os.remove(signalp_input_file)
 os.remove(signalp_output_file)
 
-#Short summary to stdout for Galaxy's info display
+# Short summary to stdout for Galaxy's info display
 print "%s for %i sequences:" % (model, total)
 print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))