diff tools/protein_analysis/psortb.py @ 21:238eae32483c draft

"Check this is up to date with all 2020 changes (black etc)"
author peterjc
date Thu, 17 Jun 2021 08:21:06 +0000
parents a19b3ded8f33
children
line wrap: on
line diff
--- a/tools/protein_analysis/psortb.py	Thu Sep 21 11:35:20 2017 -0400
+++ b/tools/protein_analysis/psortb.py	Thu Jun 17 08:21:06 2021 +0000
@@ -37,9 +37,11 @@
     sys.exit(os.system("psort --version"))
 
 if len(sys.argv) != 8:
-    sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), "
-             "output (e.g. terse/normal/long), cutoff, divergent, input protein "
-             "FASTA file & output tabular file")
+    sys.exit(
+        "Require 7 arguments, number of threads (int), type (e.g. archaea), "
+        "output (e.g. terse/normal/long), cutoff, divergent, input protein "
+        "FASTA file & output tabular file"
+    )
 
 num_threads = thread_count(sys.argv[1], default=4)
 org_type = sys.argv[2]
@@ -58,47 +60,117 @@
 tabular_file = sys.argv[7]
 
 if out_type == "terse":
-    header = ['SeqID', 'Localization', 'Score']
+    header = ["SeqID", "Localization", "Score"]
 elif out_type == "normal":
     sys.exit("Normal output not implemented yet, sorry.")
 elif out_type == "long":
     if org_type == "-n":
         # Gram negative bacteria
-        header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details',
-                  'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details',
-                  'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details',
-                  'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details',
-                  'Profile-_Localization', 'Profile-_Details',
-                  'SCL-BLAST-_Localization', 'SCL-BLAST-_Details',
-                  'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details',
-                  'Signal-_Localization', 'Signal-_Details',
-                  'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score',
-                  'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
-                  'Secondary_Localization', 'PSortb_Version']
+        header = [
+            "SeqID",
+            "CMSVM-_Localization",
+            "CMSVM-_Details",
+            "CytoSVM-_Localization",
+            "CytoSVM-_Details",
+            "ECSVM-_Localization",
+            "ECSVM-_Details",
+            "ModHMM-_Localization",
+            "ModHMM-_Details",
+            "Motif-_Localization",
+            "Motif-_Details",
+            "OMPMotif-_Localization",
+            "OMPMotif-_Details",
+            "OMSVM-_Localization",
+            "OMSVM-_Details",
+            "PPSVM-_Localization",
+            "PPSVM-_Details",
+            "Profile-_Localization",
+            "Profile-_Details",
+            "SCL-BLAST-_Localization",
+            "SCL-BLAST-_Details",
+            "SCL-BLASTe-_Localization",
+            "SCL-BLASTe-_Details",
+            "Signal-_Localization",
+            "Signal-_Details",
+            "Cytoplasmic_Score",
+            "CytoplasmicMembrane_Score",
+            "Periplasmic_Score",
+            "OuterMembrane_Score",
+            "Extracellular_Score",
+            "Final_Localization",
+            "Final_Localization_Details",
+            "Final_Score",
+            "Secondary_Localization",
+            "PSortb_Version",
+        ]
     elif org_type == "-p":
         # Gram positive bacteria
-        header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details',
-                  'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details',
-                  'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details',
-                  'Profile+_Localization', 'Profile+_Details',
-                  'SCL-BLAST+_Localization', 'SCL-BLAST+_Details',
-                  'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details',
-                  'Signal+_Localization', 'Signal+_Details',
-                  'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
-                  'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
-                  'Secondary_Localization', 'PSortb_Version']
+        header = [
+            "SeqID",
+            "CMSVM+_Localization",
+            "CMSVM+_Details",
+            "CWSVM+_Localization",
+            "CWSVM+_Details",
+            "CytoSVM+_Localization",
+            "CytoSVM+_Details",
+            "ECSVM+_Localization",
+            "ECSVM+_Details",
+            "ModHMM+_Localization",
+            "ModHMM+_Details",
+            "Motif+_Localization",
+            "Motif+_Details",
+            "Profile+_Localization",
+            "Profile+_Details",
+            "SCL-BLAST+_Localization",
+            "SCL-BLAST+_Details",
+            "SCL-BLASTe+_Localization",
+            "SCL-BLASTe+_Details",
+            "Signal+_Localization",
+            "Signal+_Details",
+            "Cytoplasmic_Score",
+            "CytoplasmicMembrane_Score",
+            "Cellwall_Score",
+            "Extracellular_Score",
+            "Final_Localization",
+            "Final_Localization_Details",
+            "Final_Score",
+            "Secondary_Localization",
+            "PSortb_Version",
+        ]
     elif org_type == "-a":
         # Archaea
-        header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details',
-                  'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details',
-                  'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details',
-                  'Profile_a_Localization', 'Profile_a_Details',
-                  'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details',
-                  'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details',
-                  'Signal_a_Localization', 'Signal_a_Details',
-                  'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
-                  'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
-                  'Secondary_Localization', 'PSortb_Version']
+        header = [
+            "SeqID",
+            "CMSVM_a_Localization",
+            "CMSVM_a_Details",
+            "CWSVM_a_Localization",
+            "CWSVM_a_Details",
+            "CytoSVM_a_Localization",
+            "CytoSVM_a_Details",
+            "ECSVM_a_Localization",
+            "ECSVM_a_Details",
+            "ModHMM_a_Localization",
+            "ModHMM_a_Details",
+            "Motif_a_Localization",
+            "Motif_a_Details",
+            "Profile_a_Localization",
+            "Profile_a_Details",
+            "SCL-BLAST_a_Localization",
+            "SCL-BLAST_a_Details",
+            "SCL-BLASTe_a_Localization",
+            "SCL-BLASTe_a_Details",
+            "Signal_a_Localization",
+            "Signal_a_Details",
+            "Cytoplasmic_Score",
+            "CytoplasmicMembrane_Score",
+            "Cellwall_Score",
+            "Extracellular_Score",
+            "Final_Localization",
+            "Final_Localization_Details",
+            "Final_Score",
+            "Secondary_Localization",
+            "PSortb_Version",
+        ]
     else:
         sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type)
 else:
@@ -123,8 +195,11 @@
             # Ignore dummy blank extra column, e.g.
             # "...2.0\t\tPSORTb version 3.0\t\n"
             parts = parts[:-1]
-        assert len(parts) == len(header), \
-            "%i fields, not %i, in line:\n%r" % (len(line), len(header), line)
+        assert len(parts) == len(header), "%i fields, not %i, in line:\n%r" % (
+            len(line),
+            len(header),
+            line,
+        )
         out_handle.write(line)
         count += 1
     return count
@@ -134,8 +209,11 @@
 # split_fasta returns an empty list (i.e. zero temp files).
 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
 temp_files = [f + ".out" for f in fasta_files]
-jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp)
-        for fasta, temp in zip(fasta_files, temp_files)]
+jobs = [
+    "psort %s %s %s -o %s %s > %s"
+    % (org_type, cutoff, divergent, out_type, fasta, temp)
+    for fasta, temp in zip(fasta_files, temp_files)
+]
 
 
 def clean_up(file_list):
@@ -160,8 +238,11 @@
         except IOError:
             output = ""
         clean_up(fasta_files + temp_files)
-        sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
-                 error_level)
+        sys.exit(
+            "One or more tasks failed, e.g. %i from %r gave:\n%s"
+            % (error_level, cmd, output),
+            error_level,
+        )
 del results
 del jobs