Mercurial > repos > galaxyp > mqppep_preproc
view mqppep_mrgfltr.py @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
| author   | galaxyp |
|----------|---------|
| date     | Mon, 11 Jul 2022 19:22:54 +0000 |
| parents  | |
| children | b76c75521d91 |
line wrap: on
line source
#!/usr/bin/env python
"""Phosphoproteomic Enrichment Pipeline, merge-and-filter step.

Helper functions and command-line argument setup.  (The remainder of
``__main__`` — argument validation, SwissProt/PSP merging, and output
writing — continues beyond this excerpt.)
"""

# Import the packages needed
import argparse
import operator  # for operator.itemgetter
import os.path
import re
import shutil  # for shutil.copyfile(src, dest)
import sqlite3 as sql
import sys  # import the sys module for exc_info
import time
import traceback  # for formatting stack-trace
from codecs import getreader as cx_getreader

import numpy as np
import pandas

# global constants
N_A = "N/A"


# ref: https://stackoverflow.com/a/8915613/15509512
#   answers: "How to handle exceptions in a list comprehensions"
#   usage:
#       from math import log
#       eggs = [1,3,0,3,2]
#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
#   producing:
#       for <built-in function log>
#         with args (0,)
#         exception: math domain error
#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``; on any exception, print diagnostics and abort.

    Fatal counterpart of ``whine``.  The ``handle`` keyword is accepted for
    interface compatibility but is unused.

    Returns:
        Whatever ``func`` returns, when no exception is raised.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print("  with args %s" % str(args))
        print("  caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
        # Fatal: terminate the whole program.  sys.exit (raising SystemExit)
        # replaces the builtin exit(); the unreachable `return None` that
        # followed it in the original has been removed.
        sys.exit(-1)


def whine(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``; on any exception, warn and return None.

    Non-fatal counterpart of ``catch``.  The ``handle`` keyword is accepted
    for interface compatibility but is unused.

    Returns:
        Whatever ``func`` returns, or None when an exception was caught.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("Warning: For %s" % str(func))
        print("  with args %s" % str(args))
        print("  caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print("  stack trace: " + str(traceback.format_exception(ty, va, tb)))
        return None


def ppep_join(x):
    """Join the non-N/A strings of ``x`` with " | "; return ``N_A`` when none remain."""
    x = [i for i in x if N_A != i]
    result = "%s" % " | ".join(x)
    if result != "":
        return result
    else:
        return N_A


def melt_join(x):
    """Join the strings of ``x`` with " | ", de-duplicated case-insensitively.

    For case-insensitive duplicates the LAST spelling seen wins, but the
    position of the FIRST occurrence is kept (dicts preserve insertion order).
    """
    tmp = {key.lower(): key for key in x}
    result = "%s" % " | ".join([tmp[key] for key in tmp])
    return result


def __main__():
    # Parse Command Line
    parser = argparse.ArgumentParser(
        description="Phosphoproteomic Enrichment Pipeline Merge and Filter."
    )
    # inputs:
    #   Phosphopeptide data for experimental results, including the
    #   intensities and the mapping to kinase domains, in tabular format.
    parser.add_argument(
        "--phosphopeptides",
        "-p",
        nargs=1,
        required=True,
        dest="phosphopeptides",
        help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
    )
    # UniProtKB/SwissProt DB input, SQLite
    parser.add_argument(
        "--ppep_mapping_db",
        "-d",
        nargs=1,
        required=True,
        dest="ppep_mapping_db",
        help="UniProtKB/SwissProt SQLite Database",
    )
    # species to limit records chosen from PhosphoSitePlus
    parser.add_argument(
        "--species",
        "-x",
        nargs=1,
        required=False,
        default=[],
        dest="species",
        help="limit PhosphoSitePlus records to indicated species (field may be empty)",
    )
    # outputs:
    #   tabular output
    parser.add_argument(
        "--mrgfltr_tab",
        "-o",
        nargs=1,
        required=True,
        dest="mrgfltr_tab",
        help="Tabular output file for results",
    )
    #   CSV output
    parser.add_argument(
        "--mrgfltr_csv",
        "-c",
        nargs=1,
        required=True,
        dest="mrgfltr_csv",
        help="CSV output file for results",
    )
    #   SQLite output
    parser.add_argument(
        "--mrgfltr_sqlite",
        "-S",
        nargs=1,
        required=True,
        dest="mrgfltr_sqlite",
        help="SQLite output file for results",
    )
    # "Make it so!"
(parse the arguments) options = parser.parse_args() print("options: " + str(options)) # determine phosphopeptide ("upstream map") input tabular file access if options.phosphopeptides is None: exit('Argument "phosphopeptides" is required but not supplied') try: upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) input_file = open(upstream_map_filename_tab, "r") input_file.close() except Exception as e: exit("Error parsing phosphopeptides argument: %s" % str(e)) # determine input SQLite access if options.ppep_mapping_db is None: exit('Argument "ppep_mapping_db" is required but not supplied') try: uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) input_file = open(uniprot_sqlite, "rb") input_file.close() except Exception as e: exit("Error parsing ppep_mapping_db argument: %s" % str(e)) # copy input SQLite dataset to output SQLite dataset if options.mrgfltr_sqlite is None: exit('Argument "mrgfltr_sqlite" is required but not supplied') try: output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) shutil.copyfile(uniprot_sqlite, output_sqlite) except Exception as e: exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) # determine species to limit records from PSP_Regulatory_Sites if options.species is None: exit( 'Argument "species" is required (and may be empty) but not supplied' ) try: if len(options.species) > 0: species = options.species[0] else: species = "" except Exception as e: exit("Error parsing species argument: %s" % str(e)) # determine tabular output destination if options.mrgfltr_tab is None: exit('Argument "mrgfltr_tab" is required but not supplied') try: output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) output_file = open(output_filename_tab, "w") output_file.close() except Exception as e: exit("Error parsing mrgfltr_tab argument: %s" % str(e)) # determine CSV output destination if options.mrgfltr_csv is None: exit('Argument "mrgfltr_csv" is required but not supplied') try: output_filename_csv = 
os.path.abspath(options.mrgfltr_csv[0]) output_file = open(output_filename_csv, "w") output_file.close() except Exception as e: exit("Error parsing mrgfltr_csv argument: %s" % str(e)) def mqpep_getswissprot(): # # copied from Excel Output Script.ipynb BEGIN # # # String Constants ################# DEPHOSPHOPEP = "DephosphoPep" DESCRIPTION = "Description" FUNCTION_PHOSPHORESIDUE = ( "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" ) GENE_NAME = "Gene_Name" # Gene Name from UniProtKB ON_FUNCTION = ( "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites ) ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites ON_PROCESS = ( "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites ) ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites PHOSPHOPEPTIDE = "Phosphopeptide" PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" PHOSPHORESIDUE = "Phosphoresidue" PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" SEQUENCE = "Sequence" SEQUENCE10 = "Sequence10" SEQUENCE7 = "Sequence7" SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" UNIPROT_ID = "UniProt_ID" UNIPROT_SEQ_AND_META_SQL = """ select Uniprot_ID, Description, Gene_Name, Sequence, Organism_Name, Organism_ID, PE, SV from UniProtKB order by Sequence, UniProt_ID """ UNIPROT_UNIQUE_SEQ_SQL = """ select distinct Sequence from UniProtKB group by Sequence """ PPEP_PEP_UNIPROTSEQ_SQL = """ select distinct phosphopeptide, peptide, sequence from uniprotkb_pep_ppep_view order by sequence """ PPEP_MELT_SQL = """ SELECT DISTINCT phospho_peptide AS 'p_peptide', kinase_map AS 'characterization', 'X' AS 'X' FROM ppep_gene_site_view """ # CREATE TABLE PSP_Regulatory_site ( # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, # domain TEXT, # ON_FUNCTION TEXT, # ON_PROCESS TEXT, # ON_PROT_INTERACT TEXT, # ON_OTHER_INTERACT TEXT, # notes 
TEXT, # organism TEXT # ); PSP_REGSITE_SQL = """ SELECT DISTINCT SITE_PLUSMINUS_7AA , DOMAIN , ON_FUNCTION , ON_PROCESS , ON_PROT_INTERACT , ON_OTHER_INTERACT , NOTES , ORGANISM FROM PSP_Regulatory_site """ PPEP_ID_SQL = """ SELECT id AS 'ppep_id', seq AS 'ppep_seq' FROM ppep """ MRGFLTR_DDL = """ DROP VIEW IF EXISTS mrgfltr_metadata_view; DROP TABLE IF EXISTS mrgfltr_metadata; CREATE TABLE mrgfltr_metadata ( ppep_id INTEGER REFERENCES ppep(id) , Sequence10 TEXT , Sequence7 TEXT , GeneName TEXT , Phosphoresidue TEXT , UniProtID TEXT , Description TEXT , FunctionPhosphoresidue TEXT , PutativeUpstreamDomains TEXT , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE ) ; CREATE VIEW mrgfltr_metadata_view AS SELECT DISTINCT ppep.seq AS phospho_peptide , Sequence10 , Sequence7 , GeneName , Phosphoresidue , UniProtID , Description , FunctionPhosphoresidue , PutativeUpstreamDomains FROM ppep, mrgfltr_metadata WHERE mrgfltr_metadata.ppep_id = ppep.id ORDER BY ppep.seq ; """ CITATION_INSERT_STMT = """ INSERT INTO Citation ( ObjectName, CitationData ) VALUES (?,?) """ CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. 
PMID: 25514926."' CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' MRGFLTR_METADATA_COLUMNS = [ "ppep_id", "Sequence10", "Sequence7", "GeneName", "Phosphoresidue", "UniProtID", "Description", "FunctionPhosphoresidue", "PutativeUpstreamDomains", ] # String Constants (end) ############ class Error(Exception): """Base class for exceptions in this module.""" pass class PreconditionError(Error): """Exception raised for errors in the input. Attributes: expression -- input expression in which the error occurred message -- explanation of the error """ def __init__(self, expression, message): self.expression = expression self.message = message # start_time = time.clock() #timer start_time = time.process_time() # timer # get keys from upstream tabular file using readline() # ref: https://stackoverflow.com/a/16713581/15509512 # answer to "Use codecs to read file with correct encoding" file1_encoded = open(upstream_map_filename_tab, "rb") file1 = cx_getreader("latin-1")(file1_encoded) count = 0 upstream_map_p_peptide_list = [] re_tab = re.compile("^[^\t]*") while True: count += 1 # Get next line from file line = file1.readline() # if line is empty # end of file is reached if not line: break if count > 1: m = re_tab.match(line) upstream_map_p_peptide_list.append(m[0]) file1.close() file1_encoded.close() # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed re_phos = re.compile("p") end_time = time.process_time() # timer print( "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), file=sys.stderr, ) # ----------- Get SwissProt data from SQLite database (start) ----------- # build UniProt sequence LUT and list of unique SwissProt sequences # Open SwissProt SQLite database conn = sql.connect(uniprot_sqlite) cur = conn.cursor() # Set up structures to hold SwissProt data uniprot_Sequence_List = [] 
UniProtSeqLUT = {} # Execute query for unique seqs without fetching the results yet uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) while 1: batch = uniprot_unique_seq_cur.fetchmany(size=50) if not batch: # handle case where no records are returned break for row in batch: Sequence = row[0] UniProtSeqLUT[(Sequence, DESCRIPTION)] = [] UniProtSeqLUT[(Sequence, GENE_NAME)] = [] UniProtSeqLUT[(Sequence, UNIPROT_ID)] = [] UniProtSeqLUT[Sequence] = [] # Execute query for seqs and metadata without fetching the results yet uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) while 1: batch = uniprot_seq_and_meta.fetchmany(size=50) if not batch: # handle case where no records are returned break for ( UniProt_ID, Description, Gene_Name, Sequence, OS, OX, PE, SV, ) in batch: uniprot_Sequence_List.append(Sequence) UniProtSeqLUT[Sequence] = Sequence UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID) UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name) if OS != N_A: Description += " OS=" + OS if OX != -1: Description += " OX=" + str(OX) if Gene_Name != N_A: Description += " GN=" + Gene_Name if PE != N_A: Description += " PE=" + PE if SV != N_A: Description += " SV=" + SV UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description) # Close SwissProt SQLite database; clean up local variables conn.close() Sequence = "" UniProt_ID = "" Description = "" Gene_Name = "" # ----------- Get SwissProt data from SQLite database (finish) ----------- end_time = time.process_time() # timer print( "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), file=sys.stderr, ) # ----------- Get SwissProt data from SQLite database (start) ----------- # Open SwissProt SQLite database conn = sql.connect(uniprot_sqlite) cur = conn.cursor() # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide DephosphoPep_UniProtSeq_LUT = {} # Set up dictionary to accumulate results PhosphoPep_UniProtSeq_LUT = {} # Execute query for tuples without 
fetching the results yet ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) while 1: batch = ppep_pep_uniprotseq_cur.fetchmany(size=50) if not batch: # handle case where no records are returned break for (phospho_pep, dephospho_pep, sequence) in batch: # do interesting stuff here... PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep PhosphoPep_UniProtSeq_LUT[ (phospho_pep, DEPHOSPHOPEP) ] = dephospho_pep if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, DESCRIPTION) ] = [] DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, GENE_NAME) ] = [] DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, UNIPROT_ID) ] = [] DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = [] DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) if ( sequence not in DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, SEQUENCE) ] ): DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, SEQUENCE) ].append(sequence) for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: if phospho_pep != phospho_pep: print( "phospho_pep:'%s' phospho_pep:'%s'" % (phospho_pep, phospho_pep) ) if phospho_pep not in PhosphoPep_UniProtSeq_LUT: PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep PhosphoPep_UniProtSeq_LUT[ (phospho_pep, DEPHOSPHOPEP) ] = dephospho_pep r = list( zip( [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]], [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]], [ s for s in UniProtSeqLUT[(sequence, DESCRIPTION)] ], ) ) # Sort by `UniProt_ID` # ref: https://stackoverflow.com/a/4174955/15509512 r = sorted(r, key=operator.itemgetter(0)) # Get one tuple for each `phospho_pep` # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] for (upid, gn, desc) in r: # Append pseudo-tuple per UniProt_ID but only when it is not present if ( upid not in DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, UNIPROT_ID) ] ): DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, UNIPROT_ID) ].append(upid) DephosphoPep_UniProtSeq_LUT[ 
(dephospho_pep, DESCRIPTION) ].append(desc) DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, GENE_NAME) ].append(gn) # Close SwissProt SQLite database; clean up local variables conn.close() # wipe local variables phospho_pep = dephospho_pep = sequence = 0 upid = gn = desc = r = "" # ----------- Get SwissProt data from SQLite database (finish) ----------- end_time = time.process_time() # timer print( "%0.6f finished reading and decoding '%s' [0.4]" % (end_time - start_time, upstream_map_filename_tab), file=sys.stderr, ) print( "{:>10} unique upstream phosphopeptides tested".format( str(len(upstream_map_p_peptide_list)) ) ) # Read in Upstream tabular file # We are discarding the intensity data; so read it as text upstream_data = pandas.read_table( upstream_map_filename_tab, dtype="str", index_col=0 ) end_time = time.process_time() # timer print( "%0.6f read Upstream Map from file [1g_1]" % (end_time - start_time,), file=sys.stderr, ) # timer upstream_data.index = upstream_map_p_peptide_list end_time = time.process_time() # timer print( "%0.6f added index to Upstream Map [1g_2]" % (end_time - start_time,), file=sys.stderr, ) # timer # ######################################################################## # # trim upstream_data to include only the upstream map columns # old_cols = upstream_data.columns.tolist() # i = 0 # first_intensity = -1 # last_intensity = -1 # intensity_re = re.compile("Intensity.*") # for col_name in old_cols: # m = intensity_re.match(col_name) # if m: # last_intensity = i # if first_intensity == -1: # first_intensity = i # i += 1 # # print('last intensity = %d' % last_intensity) # col_PKCalpha = last_intensity + 2 # # data_in_cols = [old_cols[0]] + old_cols[ # first_intensity: last_intensity + 1 # ] # # if upstream_data.empty: # print("upstream_data is empty") # exit(0) # # data_in = upstream_data.copy(deep=True)[data_in_cols] ######################################################################## # trim upstream_data to include only the upstream 
map columns old_cols = upstream_data.columns.tolist() i = 0 first_intensity = -1 last_intensity = -1 intensity_re = re.compile("Intensity.*") for col_name in old_cols: m = intensity_re.match(col_name) if m: last_intensity = i if first_intensity == -1: first_intensity = i i += 1 # print('last intensity = %d' % last_intensity) col_PKCalpha = last_intensity + 2 data_in_cols = [old_cols[0]] + old_cols[ first_intensity - 1: last_intensity ] data_col_names = [old_cols[0]] + old_cols[ first_intensity: last_intensity + 1 ] if upstream_data.empty: print("upstream_data is empty") exit(0) data_in = upstream_data.copy(deep=True)[data_in_cols] data_in.columns = data_col_names print("data_in") print(data_in) ######################################################################## # Convert floating-point integers to int64 integers # ref: https://stackoverflow.com/a/68497603/15509512 data_in[list(data_in.columns[1:])] = ( data_in[list(data_in.columns[1:])] .astype("float64") .apply(np.int64) ) # create another phosphopeptide column that will be used to join later; # MAY need to change depending on Phosphopeptide column position # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index end_time = time.process_time() # timer print( "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" % (end_time - start_time,), file=sys.stderr, ) # timer # Produce a dictionary of metadata for a single phosphopeptide. # This is a replacement of `UniProtInfo_subdict` in the original code. 
def pseq_to_subdict(phospho_pep): # Strip "p" from phosphopeptide sequence dephospho_pep = re_phos.sub("", phospho_pep) # Determine number of phosphoresidues in phosphopeptide numps = len(phospho_pep) - len(dephospho_pep) # Determine location(s) of phosphoresidue(s) in phosphopeptide # (used later for Phosphoresidue, Sequence7, and Sequence10) ploc = [] # list of p locations i = 0 p = phospho_pep while i < numps: ploc.append(p.find("p")) p = p[: p.find("p")] + p[p.find("p") + 1:] i += 1 # Establish nested dictionary result = {} result[SEQUENCE] = [] result[UNIPROT_ID] = [] result[DESCRIPTION] = [] result[GENE_NAME] = [] result[PHOSPHORESIDUE] = [] result[SEQUENCE7] = [] result[SEQUENCE10] = [] # Add stripped sequence to dictionary result[SEQUENCE].append(dephospho_pep) # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT # Caller may elect to: # try: # ... # except PreconditionError as pe: # print("'{expression}': {message}".format( # expression = pe.expression, # message = pe.message)) # ) # ) if phospho_pep not in PhosphoPep_UniProtSeq_LUT: raise PreconditionError( phospho_pep, "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", ) if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", ) if ( dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] ): my_err_msg = "dephosphorylated phosphopeptide does not match " my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] raise PreconditionError(dephospho_pep, my_err_msg) result[SEQUENCE] = [dephospho_pep] result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, UNIPROT_ID) ] result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, DESCRIPTION) ] result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, GENE_NAME) ] if (dephospho_pep, SEQUENCE) not in 
DephosphoPep_UniProtSeq_LUT: raise PreconditionError( dephospho_pep, "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", ) UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ (dephospho_pep, SEQUENCE) ] if len(UniProtSeqList) < 1: print( "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" % dephospho_pep ) # raise PreconditionError( # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", # 'value has zero length' # ) for UniProtSeq in UniProtSeqList: i = 0 phosphoresidues = [] seq7s_set = set() seq7s = [] seq10s_set = set() seq10s = [] while i < len(ploc): start = UniProtSeq.find(dephospho_pep) # handle case where no sequence was found for dep-pep if start < 0: i += 1 continue psite = ( start + ploc[i] ) # location of phosphoresidue on protein sequence # add Phosphoresidue phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1) phosphoresidues.append(phosphosite) # Add Sequence7 if psite < 7: # phospho_pep at N terminus seq7 = str(UniProtSeq)[: psite + 8] if seq7[psite] == "S": # if phosphosresidue is serine pres = "s" elif ( seq7[psite] == "T" ): # if phosphosresidue is threonine pres = "t" elif ( seq7[psite] == "Y" ): # if phosphoresidue is tyrosine pres = "y" else: # if not pSTY pres = "?" seq7 = ( seq7[:psite] + pres + seq7[psite + 1: psite + 8] ) while ( len(seq7) < 15 ): # add appropriate number of "_" to the front seq7 = "_" + seq7 elif ( len(UniProtSeq) - psite < 8 ): # phospho_pep at C terminus seq7 = str(UniProtSeq)[psite - 7:] if seq7[7] == "S": pres = "s" elif seq7[7] == "T": pres = "t" elif seq7[7] == "Y": pres = "y" else: pres = "?" 
seq7 = seq7[:7] + pres + seq7[8:] while ( len(seq7) < 15 ): # add appropriate number of "_" to the back seq7 = seq7 + "_" else: seq7 = str(UniProtSeq)[psite - 7: psite + 8] pres = "" # phosphoresidue if seq7[7] == "S": # if phosphosresidue is serine pres = "s" elif seq7[7] == "T": # if phosphosresidue is threonine pres = "t" elif seq7[7] == "Y": # if phosphoresidue is tyrosine pres = "y" else: # if not pSTY pres = "?" seq7 = seq7[:7] + pres + seq7[8:] if seq7 not in seq7s_set: seq7s.append(seq7) seq7s_set.add(seq7) # add Sequence10 if psite < 10: # phospho_pep at N terminus seq10 = ( str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11] ) elif ( len(UniProtSeq) - psite < 11 ): # phospho_pep at C terminus seq10 = ( str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:] ) else: seq10 = str(UniProtSeq)[psite - 10: psite + 11] seq10 = seq10[:10] + "p" + seq10[10:] if seq10 not in seq10s_set: seq10s.append(seq10) seq10s_set.add(seq10) i += 1 result[PHOSPHORESIDUE].append(phosphoresidues) result[SEQUENCE7].append(seq7s) # result[SEQUENCE10] is a list of lists of strings result[SEQUENCE10].append(seq10s) r = list( zip( result[UNIPROT_ID], result[GENE_NAME], result[DESCRIPTION], result[PHOSPHORESIDUE], ) ) # Sort by `UniProt_ID` # ref: https://stackoverflow.com//4174955/15509512 s = sorted(r, key=operator.itemgetter(0)) result[UNIPROT_ID] = [] result[GENE_NAME] = [] result[DESCRIPTION] = [] result[PHOSPHORESIDUE] = [] for r in s: result[UNIPROT_ID].append(r[0]) result[GENE_NAME].append(r[1]) result[DESCRIPTION].append(r[2]) result[PHOSPHORESIDUE].append(r[3]) # convert lists to strings in the dictionary for key, value in result.items(): if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: result[key] = "; ".join(map(str, value)) elif key in [SEQUENCE10]: # result[SEQUENCE10] is a list of lists of strings joined_value = "" joined_set = set() sep = "" for valL in value: # valL is a list of strings for val in valL: # val is a string if val not in 
joined_set: joined_set.add(val) joined_value += sep + val sep = "; " # joined_value is a string result[key] = joined_value newstring = "; ".join( [", ".join(prez) for prez in result[PHOSPHORESIDUE]] ) # #separate the isoforms in PHOSPHORESIDUE column with ";" # oldstring = result[PHOSPHORESIDUE] # oldlist = list(oldstring) # newstring = "" # i = 0 # for e in oldlist: # if e == ";": # if numps > 1: # if i%numps: # newstring = newstring + ";" # else: # newstring = newstring + "," # else: # newstring = newstring + ";" # i +=1 # else: # newstring = newstring + e result[PHOSPHORESIDUE] = newstring # separate sequence7's by | oldstring = result[SEQUENCE7] oldlist = oldstring newstring = "" for ol in oldlist: for e in ol: if e == ";": newstring = newstring + " |" elif len(newstring) > 0 and 1 > newstring.count(e): newstring = newstring + " | " + e elif 1 > newstring.count(e): newstring = newstring + e result[SEQUENCE7] = newstring return [phospho_pep, result] # Construct list of [string, dictionary] lists # where the dictionary provides the SwissProt metadata # for a phosphopeptide result_list = [ whine(pseq_to_subdict, psequence) for psequence in data_in[PHOSPHOPEPTIDE_MATCH] ] end_time = time.process_time() # timer print( "%0.6f added SwissProt annotations to phosphopeptides [B]" % (end_time - start_time,), file=sys.stderr, ) # timer # Construct dictionary from list of lists # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ UniProt_Info = { result[0]: result[1] for result in result_list if result is not None } end_time = time.process_time() # timer print( "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" % (end_time - start_time,), file=sys.stderr, ) # timer # cosmetic: add N_A to phosphopeptide rows with no hits p_peptide_list = [] for key in UniProt_Info: p_peptide_list.append(key) for nestedKey in UniProt_Info[key]: if UniProt_Info[key][nestedKey] == "": UniProt_Info[key][nestedKey] = N_A end_time = 
time.process_time() # timer print( "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), file=sys.stderr, ) # timer # convert UniProt_Info dictionary to dataframe uniprot_df = pandas.DataFrame.transpose( pandas.DataFrame.from_dict(UniProt_Info) ) # reorder columns to match expected output file uniprot_df[ PHOSPHOPEPTIDE ] = uniprot_df.index # make index a column too cols = uniprot_df.columns.tolist() # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] # uniprot_df = uniprot_df[cols] uniprot_df = uniprot_df[ [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, UNIPROT_ID, DESCRIPTION, ] ] end_time = time.process_time() # timer print( "%0.6f reordered columns to match expected output file [1]" % (end_time - start_time,), file=sys.stderr, ) # timer # concat to split then groupby to collapse seq7_df = pandas.concat( [ pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | ")) for _, row in uniprot_df.iterrows() ] ).reset_index() seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] # --- -------------- begin read PSP_Regulatory_sites --------------------------------- # read in PhosphoSitePlus Regulatory Sites dataset # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- conn = sql.connect(uniprot_sqlite) regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) # Close SwissProt SQLite database conn.close() # ... 
-------------- end read PSP_Regulatory_sites ------------------------------------ # keep only the human entries in dataframe if len(species) > 0: print( 'Limit PhosphoSitesPlus records to species "' + species + '"' ) regsites_df = regsites_df[regsites_df.ORGANISM == species] # merge the seq7 df with the regsites df based off of the sequence7 merge_df = seq7_df.merge( regsites_df, left_on=SEQUENCE7, right_on=SITE_PLUSMINUS_7AA_SQL, how="left", ) # after merging df, select only the columns of interest; # note that PROTEIN is absent here merge_df = merge_df[ [ PHOSPHOPEPTIDE, SEQUENCE7, ON_FUNCTION, ON_PROCESS, ON_PROT_INTERACT, ON_OTHER_INTERACT, ON_NOTES, ] ] # combine column values of interest # into one FUNCTION_PHOSPHORESIDUE column" merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat( merge_df[ON_PROCESS], sep="; ", na_rep="" ) merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ FUNCTION_PHOSPHORESIDUE ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ FUNCTION_PHOSPHORESIDUE ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ FUNCTION_PHOSPHORESIDUE ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") # remove the columns that were combined merge_df = merge_df[ [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE] ] end_time = time.process_time() # timer print( "%0.6f merge regsite metadata [1a]" % (end_time - start_time,), file=sys.stderr, ) # timer # cosmetic changes to Function Phosphoresidue column fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) end_time = time.process_time() # timer print( "%0.6f more cosmetic changes [1b]" % (end_time - start_time,), file=sys.stderr, ) # timer i = 0 while i < len(fp_series): # remove the extra ";" so that it looks more professional if fp_series[i] == "; ; ; ; ": # remove ; from empty hits fp_series[i] = "" while fp_series[i].endswith("; "): # remove ; from the ends fp_series[i] = fp_series[i][:-2] 
while fp_series[i].startswith("; "): # remove ; from the beginning fp_series[i] = fp_series[i][2:] fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") fp_series[i] = fp_series[i].replace("; ; ; ", "; ") fp_series[i] = fp_series[i].replace("; ; ", "; ") # turn blanks into N_A to signify the info was searched for but cannot be found if fp_series[i] == "": fp_series[i] = N_A i += 1 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series end_time = time.process_time() # timer print( "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), file=sys.stderr, ) # timer # merge uniprot df with merge df uniprot_regsites_merged_df = uniprot_df.merge( merge_df, left_on=PHOSPHOPEPTIDE, right_on=PHOSPHOPEPTIDE, how="left", ) # collapse the merged df uniprot_regsites_collapsed_df = pandas.DataFrame( uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[ FUNCTION_PHOSPHORESIDUE ].apply(lambda x: ppep_join(x)) ) # .apply(lambda x: "%s" % ' | '.join(x))) end_time = time.process_time() # timer print( "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), file=sys.stderr, ) # timer uniprot_regsites_collapsed_df[ PHOSPHOPEPTIDE ] = ( uniprot_regsites_collapsed_df.index ) # add df index as its own column # rename columns uniprot_regsites_collapsed_df.columns = [ FUNCTION_PHOSPHORESIDUE, "ppp", ] end_time = time.process_time() # timer print( "%0.6f selected columns to be merged to uniprot_df [1e]" % (end_time - start_time,), file=sys.stderr, ) # timer # add columns based on Sequence7 matching site_+/-7_AA uniprot_regsite_df = pandas.merge( left=uniprot_df, right=uniprot_regsites_collapsed_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="ppp", ) end_time = time.process_time() # timer print( "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" % (end_time - start_time,), file=sys.stderr, ) # timer data_in.rename( {"Protein description": PHOSPHOPEPTIDE}, axis="columns", inplace=True, ) # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') res2 = 
sorted( data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() ) data_in = data_in.loc[res2] end_time = time.process_time() # timer print( "%0.6f sorting time [1f]" % (end_time - start_time,), file=sys.stderr, ) # timer print("old_cols[:col_PKCalpha]") print(old_cols[:col_PKCalpha]) cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:] upstream_data = upstream_data[cols] print("upstream_data.columns") print(upstream_data.columns) end_time = time.process_time() # timer print( "%0.6f refactored columns for Upstream Map [1g]" % (end_time - start_time,), file=sys.stderr, ) # timer # #rename upstream columns in new list # new_cols = [] # for name in cols: # if "_NetworKIN" in name: # name = name.split("_")[0] # if " motif" in name: # name = name.split(" motif")[0] # if " sequence " in name: # name = name.split(" sequence")[0] # if "_Phosida" in name: # name = name.split("_")[0] # if "_PhosphoSite" in name: # name = name.split("_")[0] # new_cols.append(name) # rename upstream columns in new list def col_rename(name): if "_NetworKIN" in name: name = name.split("_")[0] if " motif" in name: name = name.split(" motif")[0] if " sequence " in name: name = name.split(" sequence")[0] if "_Phosida" in name: name = name.split("_")[0] if "_PhosphoSite" in name: name = name.split("_")[0] return name new_cols = [col_rename(col) for col in cols] upstream_data.columns = new_cols end_time = time.process_time() # timer print( "%0.6f renamed columns for Upstream Map [1h_1]" % (end_time - start_time,), file=sys.stderr, ) # timer # Create upstream_data_cast as a copy of upstream_data # but with first column substituted by the phosphopeptide sequence upstream_data_cast = upstream_data.copy() new_cols_cast = new_cols new_cols_cast[0] = "p_peptide" upstream_data_cast.columns = new_cols_cast upstream_data_cast["p_peptide"] = upstream_data.index # --- -------------- begin read upstream_data_melt ------------------------------------ # ----------- Get melted kinase mapping data from 
SQLite database (start) ----------- conn = sql.connect(uniprot_sqlite) upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) # Close SwissProt SQLite database conn.close() upstream_data_melt = upstream_data_melt_df.copy() upstream_data_melt.columns = ["p_peptide", "characterization", "X"] upstream_data_melt["characterization"] = [ col_rename(s) for s in upstream_data_melt["characterization"] ] print( "%0.6f upstream_data_melt_df initially has %d rows" % (end_time - start_time, len(upstream_data_melt.axes[0])), file=sys.stderr, ) # ref: https://stackoverflow.com/a/27360130/15509512 # e.g. df.drop(df[df.score < 50].index, inplace=True) upstream_data_melt.drop( upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True ) print( "%0.6f upstream_data_melt_df pre-dedup has %d rows" % (end_time - start_time, len(upstream_data_melt.axes[0])), file=sys.stderr, ) # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- # ... -------------- end read upstream_data_melt -------------------------------------- end_time = time.process_time() # timer print( "%0.6f melted and minimized Upstream Map dataframe [1h_2]" % (end_time - start_time,), file=sys.stderr, ) # timer # ... 
end read upstream_data_melt end_time = time.process_time() # timer print( "%0.6f indexed melted Upstream Map [1h_2a]" % (end_time - start_time,), file=sys.stderr, ) # timer upstream_delta_melt_LoL = upstream_data_melt.values.tolist() melt_dict = {} for key in upstream_map_p_peptide_list: melt_dict[key] = [] for el in upstream_delta_melt_LoL: (p_peptide, characterization, X) = tuple(el) if p_peptide in melt_dict: melt_dict[p_peptide].append(characterization) else: exit( 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' % (p_peptide) ) end_time = time.process_time() # timer print( "%0.6f appended peptide characterizations [1h_2b]" % (end_time - start_time,), file=sys.stderr, ) # timer # for key in upstream_map_p_peptide_list: # melt_dict[key] = ' | '.join(melt_dict[key]) for key in upstream_map_p_peptide_list: melt_dict[key] = melt_join(melt_dict[key]) end_time = time.process_time() # timer print( "%0.6f concatenated multiple characterizations [1h_2c]" % (end_time - start_time,), file=sys.stderr, ) # timer # map_dict is a dictionary of dictionaries map_dict = {} for key in upstream_map_p_peptide_list: map_dict[key] = {} map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] end_time = time.process_time() # timer print( "%0.6f instantiated map dictionary [2]" % (end_time - start_time,), file=sys.stderr, ) # timer # convert map_dict to dataframe map_df = pandas.DataFrame.transpose( pandas.DataFrame.from_dict(map_dict) ) map_df["p-peptide"] = map_df.index # make index a column too cols_map_df = map_df.columns.tolist() cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] map_df = map_df[cols_map_df] # join map_df to uniprot_regsite_df output_df = uniprot_regsite_df.merge( map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide" ) output_df = output_df[ [ PHOSPHOPEPTIDE, SEQUENCE10, SEQUENCE7, GENE_NAME, PHOSPHORESIDUE, UNIPROT_ID, DESCRIPTION, 
FUNCTION_PHOSPHORESIDUE, PUTATIVE_UPSTREAM_DOMAINS, ] ] # cols_output_prelim = output_df.columns.tolist() # # print("cols_output_prelim") # print(cols_output_prelim) # # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] # # print("cols_output with p-peptide") # print(cols_output) # # cols_output = [col for col in cols_output if not col == "p-peptide"] # # print("cols_output") # print(cols_output) # # output_df = output_df[cols_output] # join output_df back to quantitative columns in data_in df quant_cols = data_in.columns.tolist() quant_cols = quant_cols[1:] quant_data = data_in[quant_cols] # ----------- Write merge/filter metadata to SQLite database (start) ----------- # Open SwissProt SQLite database conn = sql.connect(output_sqlite) cur = conn.cursor() cur.executescript(MRGFLTR_DDL) cur.execute( CITATION_INSERT_STMT, ("mrgfltr_metadata_view", CITATION_INSERT_PSP), ) cur.execute( CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP) ) cur.execute( CITATION_INSERT_STMT, ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF), ) cur.execute( CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF) ) # Read ppep-to-sequence LUT ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) # write only metadata for merged/filtered records to SQLite mrgfltr_metadata_df = output_df.copy() # replace phosphopeptide seq with ppep.id mrgfltr_metadata_df = ppep_lut_df.merge( mrgfltr_metadata_df, left_on="ppep_seq", right_on=PHOSPHOPEPTIDE, how="inner", ) mrgfltr_metadata_df.drop( columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True ) # rename columns mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS mrgfltr_metadata_df.to_sql( "mrgfltr_metadata", con=conn, if_exists="append", index=False, method="multi", ) # Close SwissProt SQLite database conn.close() # ----------- Write merge/filter metadata to SQLite database (finish) ----------- output_df = output_df.merge( quant_data, how="right", left_on=PHOSPHOPEPTIDE, 
right_on=PHOSPHOPEPTIDE_MATCH, ) output_cols = output_df.columns.tolist() output_cols = output_cols[:-1] output_df = output_df[output_cols] # cosmetic changes to Upstream column output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ PUTATIVE_UPSTREAM_DOMAINS ].fillna( "" ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) i = 0 while i < len(us_series): # turn blanks into N_A to signify the info was searched for but cannot be found if us_series[i] == "": us_series[i] = N_A i += 1 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series end_time = time.process_time() # timer print( "%0.6f establisheed output [3]" % (end_time - start_time,), file=sys.stderr, ) # timer (output_rows, output_cols) = output_df.shape output_df = output_df.convert_dtypes(convert_integer=True) # Output onto Final CSV file output_df.to_csv(output_filename_csv, index=False) output_df.to_csv( output_filename_tab, quoting=None, sep="\t", index=False ) end_time = time.process_time() # timer print( "%0.6f wrote output [4]" % (end_time - start_time,), file=sys.stderr, ) # timer print( "{:>10} phosphopeptides written to output".format(str(output_rows)) ) end_time = time.process_time() # timer print( "%0.6f seconds of non-system CPU time were consumed" % (end_time - start_time,), file=sys.stderr, ) # timer # Rev. 7/1/2016 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ # read from SwissProt SQLite database # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper # # copied from Excel Output Script.ipynb END # # try: catch( mqpep_getswissprot, ) exit(0) except Exception as e: exit("Internal error running mqpep_getswissprot(): %s" % (e)) if __name__ == "__main__": __main__()