diff tools/protein_analysis/signalp3.py @ 21:238eae32483c draft

"Check this is up to date with all 2020 changes (black etc)"
author peterjc
date Thu, 17 Jun 2021 08:21:06 +0000
parents a19b3ded8f33
children e1996f0f4e85
line wrap: on
line diff
--- a/tools/protein_analysis/signalp3.py	Thu Sep 21 11:35:20 2017 -0400
+++ b/tools/protein_analysis/signalp3.py	Thu Jun 17 08:21:06 2021 +0000
@@ -71,10 +71,12 @@
     sys.exit(os.system("signalp -version"))
 
 if len(sys.argv) not in [6, 8]:
-    sys.exit("Require five (or 7) arguments, organism, truncate, threads, "
-             "input protein FASTA file & output tabular file (plus "
-             "optionally cut method and GFF3 output file). "
-             "Got %i arguments." % (len(sys.argv) - 1))
+    sys.exit(
+        "Require five (or 7) arguments, organism, truncate, threads, "
+        "input protein FASTA file & output tabular file (plus "
+        "optionally cut method and GFF3 output file). "
+        "Got %i arguments." % (len(sys.argv) - 1)
+    )
 
 organism = sys.argv[1]
 if organism not in ["euk", "gram+", "gram-"]:
@@ -111,8 +113,9 @@
             continue
         parts = line.rstrip("\r\n").split()
         assert len(parts) == 21, repr(line)
-        assert parts[14].startswith(parts[0]), \
+        assert parts[14].startswith(parts[0]), (
             "Bad entry in SignalP output, ID miss-match:\n%r" % line
+        )
         # Remove redundant truncated name column (col 0)
         # and put full name at start (col 14)
         parts = parts[14:15] + parts[1:14] + parts[15:]
@@ -121,11 +124,12 @@
 
 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
     """Make a GFF file."""
-    cut_col, score_col = {"NN_Cmax": (2, 1),
-                          "NN_Ymax": (5, 4),
-                          "NN_Smax": (8, 7),
-                          "HMM_Cmax": (16, 15),
-                          }[cut_method]
+    cut_col, score_col = {
+        "NN_Cmax": (2, 1),
+        "NN_Ymax": (5, 4),
+        "NN_Smax": (8, 7),
+        "HMM_Cmax": (16, 15),
+    }[cut_method]
 
     source = "SignalP"
     strand = "."  # not stranded
@@ -153,30 +157,62 @@
             cut = 1
         assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
         score = parts[score_col]
-        gff_handle.write("##sequence-region %s %i %i\n"
-                         % (seqid, 1, len(seq)))
+        gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, len(seq)))
         # If the cut is at the very beginning, there is no signal peptide!
         if cut > 1:
             # signal_peptide = SO:0000418
-            gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
-                             % (seqid, source,
-                                "signal_peptide", 1, cut - 1,
-                                score, strand, phase, tags))
+            gff_handle.write(
+                "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
+                % (
+                    seqid,
+                    source,
+                    "signal_peptide",
+                    1,
+                    cut - 1,
+                    score,
+                    strand,
+                    phase,
+                    tags,
+                )
+            )
         # mature_protein_region = SO:0000419
-        gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
-                         % (seqid, source,
-                            "mature_protein_region", cut, len(seq),
-                            score, strand, phase, tags))
+        gff_handle.write(
+            "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
+            % (
+                seqid,
+                source,
+                "mature_protein_region",
+                cut,
+                len(seq),
+                score,
+                strand,
+                phase,
+                tags,
+            )
+        )
     tab_handle.close()
     gff_handle.close()
 
 
-fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
-                          n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
+if num_threads == 1:
+    # Still want to call split_fasta to apply truncation, but
+    # no reason to make multiple files - and more chance of
+    # hitting file system glitches if we do. So,
+    FASTA_CHUNK = sys.maxsize
+
+fasta_files = split_fasta(
+    fasta_file,
+    os.path.join(tmp_dir, "signalp"),
+    n=FASTA_CHUNK,
+    truncate=truncate,
+    max_len=MAX_LEN,
+)
 temp_files = [f + ".out" for f in fasta_files]
 assert len(fasta_files) == len(temp_files)
-jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
-        for (fasta, temp) in zip(fasta_files, temp_files)]
+jobs = [
+    "signalp -short -t %s %s > %s" % (organism, fasta, temp)
+    for (fasta, temp) in zip(fasta_files, temp_files)
+]
 assert len(fasta_files) == len(temp_files) == len(jobs)
 
 
@@ -205,9 +241,15 @@
     if error_level or output.lower().startswith("error running"):
         clean_up(fasta_files + temp_files)
         if output:
-            sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output))
+            sys.stderr.write(
+                "One or more tasks failed, e.g. %i from %r gave:\n%s"
+                % (error_level, cmd, output)
+            )
         else:
-            sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd))
+            sys.stderr.write(
+                "One or more tasks failed, e.g. %i from %r with no output\n"
+                % (error_level, cmd)
+            )
         sys.exit(error_level)
 del results
 
@@ -218,8 +260,16 @@
     fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name])
 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"])
 # HMM results:
-fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred",
-               "HMM_Sprob_score", "HMM_Sprob_pred"])
+fields.extend(
+    [
+        "HMM_type",
+        "HMM_Cmax_score",
+        "HMM_Cmax_pos",
+        "HMM_Cmax_pred",
+        "HMM_Sprob_score",
+        "HMM_Sprob_pred",
+    ]
+)
 out_handle.write("#" + "\t".join(fields) + "\n")
 for temp in temp_files:
     data_handle = open(temp)