Mercurial > repos > galaxyp > mqppep_preproc

diff mqppep_mrgfltr.py @ 1:b76c75521d91 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author: galaxyp
date: Fri, 28 Oct 2022 18:26:42 +0000
parents: 8dfd5d2b5903
--- a/mqppep_mrgfltr.py	Mon Jul 11 19:22:54 2022 +0000
+++ b/mqppep_mrgfltr.py	Fri Oct 28 18:26:42 2022 +0000
@@ -87,7 +87,10 @@
         nargs=1,
         required=True,
         dest="phosphopeptides",
-        help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
+        help=" ".join([
+            "Phosphopeptide data for experimental results, including the",
+            "intensities and the mapping to kinase domains, in tabular format"
+        ]),
     )
     #   UniProtKB/SwissProt DB input, SQLite
     parser.add_argument(
@@ -106,7 +109,10 @@
         required=False,
         default=[],
         dest="species",
-        help="limit PhosphoSitePlus records to indicated species (field may be empty)",
+        help=" ".join([
+            "limit PhosphoSitePlus records to indicated species",
+            "(field may be empty)"
+        ]),
     )
 
     # outputs:
@@ -174,7 +180,7 @@
     # determine species to limit records from PSP_Regulatory_Sites
     if options.species is None:
         exit(
-            'Argument "species" is required (and may be empty) but not supplied'
+            'Argument "species" is required (& may be empty) but not supplied'
         )
     try:
         if len(options.species) > 0:
@@ -216,20 +222,25 @@
         FUNCTION_PHOSPHORESIDUE = (
             "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
         )
-        GENE_NAME = "Gene_Name"  # Gene Name from UniProtKB
-        ON_FUNCTION = (
-            "ON_FUNCTION"  # ON_FUNCTION column from PSP_Regulatory_Sites
-        )
-        ON_NOTES = "NOTES"  # NOTES column from PSP_Regulatory_Sites
-        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"  # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
-        ON_PROCESS = (
-            "ON_PROCESS"  # ON_PROCESS column from PSP_Regulatory_Sites
-        )
-        ON_PROT_INTERACT = "ON_PROT_INTERACT"  # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+        # Gene Name from UniProtKB
+        GENE_NAME = "Gene_Name"
+        # ON_FUNCTION column from PSP_Regulatory_Sites
+        ON_FUNCTION = ("ON_FUNCTION")
+        # NOTES column from PSP_Regulatory_Sites
+        ON_NOTES = "NOTES"
+        # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
+        ON_OTHER_INTERACT = "ON_OTHER_INTERACT"
+        # ON_PROCESS column from PSP_Regulatory_Sites
+        ON_PROCESS = ("ON_PROCESS")
+        # ON_PROT_INTERACT column from PSP_Regulatory_Sites
+        ON_PROT_INTERACT = "ON_PROT_INTERACT"
         PHOSPHOPEPTIDE = "Phosphopeptide"
         PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
         PHOSPHORESIDUE = "Phosphoresidue"
-        PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"
+        PUTATIVE_UPSTREAM_DOMAINS = " ".join([
+            "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/",
+            "Phosphatases/Binding Domains"
+        ])
         SEQUENCE = "Sequence"
         SEQUENCE10 = "Sequence10"
         SEQUENCE7 = "Sequence7"
@@ -328,8 +339,26 @@
             CitationData
           ) VALUES (?,?)
           """
-        CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
-        CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
+        CITATION_INSERT_PSP = " ".join([
+            "PhosphoSitePlus(R) (PSP) was created by Cell Signaling",
+            "Technology Inc. It is licensed under a Creative Commons",
+            "Attribution-NonCommercial-ShareAlike 3.0 Unported License.",
+            "When using PSP data or analyses in printed publications or",
+            "in online resources, the following acknowledgements must be",
+            "included: (a) the words",
+            '"PhosphoSitePlus(R), www.phosphosite.org" must',
+            "be included at appropriate places in the text or webpage,",
+            "and (b) the following citation must be included in the",
+            'bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser',
+            "JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,",
+            "PTMs and recalibrations. Nucleic Acids Res. 2015",
+            '43:D512-20. PMID: 25514926."'
+        ])
+        CITATION_INSERT_PSP_REF = " ".join([
+            'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and',
+            'recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298,',
+            "https://doi.org/10.1093/nar/gkr1122"
+        ])
 
         MRGFLTR_METADATA_COLUMNS = [
             "ppep_id",
@@ -388,7 +417,8 @@
         file1.close()
         file1_encoded.close()
 
-        # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
+        # Get the list of phosphopeptides with the p's that represent
+        #     the phosphorylation sites removed
         re_phos = re.compile("p")
 
         end_time = time.process_time()  # timer
@@ -397,7 +427,7 @@
             file=sys.stderr,
         )
 
-        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # -------- Get SwissProt data from SQLite database (start) -----------
         # build UniProt sequence LUT and list of unique SwissProt sequences
 
         # Open SwissProt SQLite database
@@ -465,7 +495,7 @@
         Description = ""
         Gene_Name = ""
 
-        # ----------- Get SwissProt data from SQLite database (finish) -----------
+        # -------- Get SwissProt data from SQLite database (finish) -----------
 
         end_time = time.process_time()  # timer
         print(
@@ -473,12 +503,13 @@
             file=sys.stderr,
         )
 
-        # ----------- Get SwissProt data from SQLite database (start) -----------
+        # -------- Get SwissProt data from SQLite database (start) -----------
         # Open SwissProt SQLite database
         conn = sql.connect(uniprot_sqlite)
         cur = conn.cursor()
 
-        # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
+        # Set up dictionary to aggregate results for phosphopeptides
+        #     corresponding to dephosphoeptide
         DephosphoPep_UniProtSeq_LUT = {}
 
         # Set up dictionary to accumulate results
@@ -548,7 +579,7 @@
                     # Get one tuple for each `phospho_pep`
                     #   in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
                     for (upid, gn, desc) in r:
-                        # Append pseudo-tuple per UniProt_ID but only when it is not present
+                        # Append pseudo-tuple per UniProt_ID iff not present
                         if (
                             upid
                             not in DephosphoPep_UniProtSeq_LUT[
@@ -571,7 +602,7 @@
         phospho_pep = dephospho_pep = sequence = 0
         upid = gn = desc = r = ""
 
-        # ----------- Get SwissProt data from SQLite database (finish) -----------
+        # -------- Get SwissProt data from SQLite database (finish) -----------
 
         end_time = time.process_time()  # timer
         print(
@@ -608,33 +639,6 @@
             file=sys.stderr,
         )  # timer
 
-        # ########################################################################
-        # # trim upstream_data to include only the upstream map columns
-        # old_cols = upstream_data.columns.tolist()
-        # i = 0
-        # first_intensity = -1
-        # last_intensity = -1
-        # intensity_re = re.compile("Intensity.*")
-        # for col_name in old_cols:
-        #     m = intensity_re.match(col_name)
-        #     if m:
-        #         last_intensity = i
-        #         if first_intensity == -1:
-        #             first_intensity = i
-        #     i += 1
-        # # print('last intensity = %d' % last_intensity)
-        # col_PKCalpha = last_intensity + 2
-        #
-        # data_in_cols = [old_cols[0]] + old_cols[
-        #     first_intensity: last_intensity + 1
-        # ]
-        #
-        # if upstream_data.empty:
-        #     print("upstream_data is empty")
-        #     exit(0)
-        #
-        # data_in = upstream_data.copy(deep=True)[data_in_cols]
-        ########################################################################
         # trim upstream_data to include only the upstream map columns
         old_cols = upstream_data.columns.tolist()
         i = 0
@@ -666,7 +670,7 @@
         data_in.columns = data_col_names
         print("data_in")
         print(data_in)
-        ########################################################################
+        ######################################################################
 
         # Convert floating-point integers to int64 integers
         #   ref: https://stackoverflow.com/a/68497603/15509512
@@ -689,7 +693,7 @@
         )  # timer
 
         # Produce a dictionary of metadata for a single phosphopeptide.
-        #   This is a replacement of `UniProtInfo_subdict` in the original code.
+        #   This is replaces `UniProtInfo_subdict` from the original code.
         def pseq_to_subdict(phospho_pep):
             # Strip "p" from phosphopeptide sequence
             dephospho_pep = re_phos.sub("", phospho_pep)
@@ -733,19 +737,28 @@
             if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                 raise PreconditionError(
                     phospho_pep,
-                    "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
+                    " ".join([
+                        "no matching phosphopeptide found in",
+                        "PhosphoPep_UniProtSeq_LUT"
+                    ])
                 )
             if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                 raise PreconditionError(
                     dephospho_pep,
-                    "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
+                    " ".join([
+                        "dephosphorylated phosphopeptide not found",
+                        "in DephosphoPep_UniProtSeq_LUT"
+                    ])
                 )
             if (
-                dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+                dephospho_pep != PhosphoPep_UniProtSeq_LUT[
+                    (phospho_pep, DEPHOSPHOPEP)]
             ):
                 my_err_msg = "dephosphorylated phosphopeptide does not match "
-                my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
-                my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
+                my_err_msg += "PhosphoPep_UniProtSeq_LUT"
+                my_err_msg += "[(phospho_pep,DEPHOSPHOPEP)] = "
+                my_err_msg += PhosphoPep_UniProtSeq_LUT[
+                    (phospho_pep, DEPHOSPHOPEP)]
                 raise PreconditionError(dephospho_pep, my_err_msg)
 
             result[SEQUENCE] = [dephospho_pep]
@@ -761,20 +774,18 @@
             if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
                 raise PreconditionError(
                     dephospho_pep,
-                    "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
+                    "".join(
+                        "no matching phosphopeptide found",
+                        "in DephosphoPep_UniProtSeq_LUT")
                 )
             UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
                 (dephospho_pep, SEQUENCE)
             ]
             if len(UniProtSeqList) < 1:
                 print(
-                    "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
-                    % dephospho_pep
+                    "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] %s"
+                    % (dephospho_pep, "because value has zero length")
                 )
-                # raise PreconditionError(
-                #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
-                #      'value has zero length'
-                #      )
             for UniProtSeq in UniProtSeqList:
                 i = 0
                 phosphoresidues = []
@@ -854,13 +865,15 @@
                     # add Sequence10
                     if psite < 10:  # phospho_pep at N terminus
                         seq10 = (
-                            str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11]
+                            str(UniProtSeq)[:psite] + "p" + str(
+                                UniProtSeq)[psite: psite + 11]
                         )
                     elif (
                         len(UniProtSeq) - psite < 11
                     ):  # phospho_pep at C terminus
                         seq10 = (
-                            str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:]
+                            str(UniProtSeq)[psite - 10: psite] + "p" + str(
+                                UniProtSeq)[psite:]
                         )
                     else:
                         seq10 = str(UniProtSeq)[psite - 10: psite + 11]
@@ -922,23 +935,6 @@
             newstring = "; ".join(
                 [", ".join(prez) for prez in result[PHOSPHORESIDUE]]
             )
-            # #separate the isoforms in PHOSPHORESIDUE column with ";"
-            # oldstring = result[PHOSPHORESIDUE]
-            # oldlist = list(oldstring)
-            # newstring = ""
-            # i = 0
-            # for e in oldlist:
-            #     if e == ";":
-            #         if numps > 1:
-            #             if i%numps:
-            #                 newstring = newstring + ";"
-            #             else:
-            #                 newstring = newstring + ","
-            #         else:
-            #             newstring = newstring + ";"
-            #         i +=1
-            #     else:
-            #         newstring = newstring + e
             result[PHOSPHORESIDUE] = newstring
 
             # separate sequence7's by |
@@ -973,7 +969,8 @@
         )  # timer
 
         # Construct dictionary from list of lists
-        #   ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
+        #   ref: https://www.8bitavenue.com/\
+        #        how-to-convert-list-of-lists-to-dictionary-in-python/
         UniProt_Info = {
             result[0]: result[1]
             for result in result_list
@@ -982,8 +979,8 @@
 
         end_time = time.process_time()  # timer
         print(
-            "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
-            % (end_time - start_time,),
+            "%0.6f create dictionary mapping phosphopeptide %s"
+            % (end_time - start_time, "to metadata dictionary [C]"),
             file=sys.stderr,
         )  # timer
 
@@ -1042,14 +1039,13 @@
         ).reset_index()
         seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
 
-        # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
-        # read in PhosphoSitePlus Regulatory Sites dataset
-        # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
+        # read in PhosphoSitePlus Regulatory Sites dataset from SQLite
+        # --- -------------- begin read PSP_Regulatory_sites -----
         conn = sql.connect(uniprot_sqlite)
         regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
         # Close SwissProt SQLite database
         conn.close()
-        # ... -------------- end read PSP_Regulatory_sites ------------------------------------
+        # ... -------------- end read PSP_Regulatory_sites -------
 
         # keep only the human entries in dataframe
         if len(species) > 0:
@@ -1127,7 +1123,8 @@
             fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
             fp_series[i] = fp_series[i].replace("; ; ", "; ")
 
-            # turn blanks into N_A to signify the info was searched for but cannot be found
+            # turn blanks into N_A to signify the info
+            #     that was searched for but cannot be found
             if fp_series[i] == "":
                 fp_series[i] = N_A
 
@@ -1203,7 +1200,6 @@
             inplace=True,
         )
 
-        # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
         res2 = sorted(
             data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
         )
@@ -1229,21 +1225,6 @@
             file=sys.stderr,
         )  # timer
 
-        # #rename upstream columns in new list
-        # new_cols = []
-        # for name in cols:
-        #     if "_NetworKIN" in name:
-        #         name = name.split("_")[0]
-        #     if " motif" in name:
-        #         name = name.split(" motif")[0]
-        #     if " sequence " in name:
-        #         name = name.split(" sequence")[0]
-        #     if "_Phosida" in name:
-        #         name = name.split("_")[0]
-        #     if "_PhosphoSite" in name:
-        #         name = name.split("_")[0]
-        #     new_cols.append(name)
-
         # rename upstream columns in new list
         def col_rename(name):
             if "_NetworKIN" in name:
@@ -1276,8 +1257,8 @@
         upstream_data_cast.columns = new_cols_cast
         upstream_data_cast["p_peptide"] = upstream_data.index
 
-        # --- -------------- begin read upstream_data_melt ------------------------------------
-        # ----------- Get melted kinase mapping data from SQLite database (start) -----------
+        # Get melted kinase mapping data from SQLite database
+        # --- begin read upstream_data_melt ---------------------------------
         conn = sql.connect(uniprot_sqlite)
         upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
         # Close SwissProt SQLite database
@@ -1303,8 +1284,7 @@
             % (end_time - start_time, len(upstream_data_melt.axes[0])),
             file=sys.stderr,
         )
-        # ----------- Get melted kinase mapping data from SQLite database (finish) -----------
-        # ... -------------- end read upstream_data_melt --------------------------------------
+        # ... end read upstream_data_melt ---------------------------------
 
         end_time = time.process_time()  # timer
         print(
@@ -1332,10 +1312,13 @@
             if p_peptide in melt_dict:
                 melt_dict[p_peptide].append(characterization)
             else:
-                exit(
-                    'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
-                    % (p_peptide)
-                )
+                los = [
+                    "Phosphopeptide %s" % p_peptide,
+                    "not found in ppep_mapping_db:",
+                    '"phopsphopeptides" and "ppep_mapping_db" must both',
+                    "originate from the same run of mqppep_kinase_mapping"
+                ]
+                exit(" ".join(los))
 
         end_time = time.process_time()  # timer
         print(
@@ -1397,29 +1380,12 @@
             ]
         ]
 
-        # cols_output_prelim = output_df.columns.tolist()
-        #
-        # print("cols_output_prelim")
-        # print(cols_output_prelim)
-        #
-        # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
-        #
-        # print("cols_output with p-peptide")
-        # print(cols_output)
-        #
-        # cols_output = [col for col in cols_output if not col == "p-peptide"]
-        #
-        # print("cols_output")
-        # print(cols_output)
-        #
-        # output_df = output_df[cols_output]
-
         # join output_df back to quantitative columns in data_in df
         quant_cols = data_in.columns.tolist()
         quant_cols = quant_cols[1:]
         quant_data = data_in[quant_cols]
 
-        # ----------- Write merge/filter metadata to SQLite database (start) -----------
+        # ---- Write merge/filter metadata to SQLite database (start) ----
         # Open SwissProt SQLite database
         conn = sql.connect(output_sqlite)
         cur = conn.cursor()
@@ -1467,7 +1433,7 @@
 
         # Close SwissProt SQLite database
         conn.close()
-        # ----------- Write merge/filter metadata to SQLite database (finish) -----------
+        # ---- Write merge/filter metadata to SQLite database (finish) ----
 
         output_df = output_df.merge(
             quant_data,
@@ -1480,15 +1446,18 @@
         output_df = output_df[output_cols]
 
         # cosmetic changes to Upstream column
+        # fill the NaN with "" for those Phosphopeptides that got a
+        #   "WARNING: Failed match for " in the upstream mapping
         output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
             PUTATIVE_UPSTREAM_DOMAINS
         ].fillna(
             ""
-        )  # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
+        )
         us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
         i = 0
         while i < len(us_series):
-            # turn blanks into N_A to signify the info was searched for but cannot be found
+            # turn blanks into N_A to signify the info
+            #   that was searched for but cannot be found
             if us_series[i] == "":
                 us_series[i] = N_A
             i += 1
@@ -1530,8 +1499,9 @@
         # Rev. 7/1/2016
         # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
         # Rev. 7/3/2016:  renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
-        # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
-        #                read from SwissProt SQLite database
+        # Rev. 12/2/2021: Converted to Python from ipynb; use fast \
+        #                 Aho-Corasick searching; \
+        #                 read from SwissProt SQLite database
         # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
 
         #
author	galaxyp
date	Fri, 28 Oct 2022 18:26:42 +0000
parents	8dfd5d2b5903
children