comparison mqppep_mrgfltr.py @ 1:b76c75521d91 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author galaxyp
date Fri, 28 Oct 2022 18:26:42 +0000
parents 8dfd5d2b5903
children
comparison
equal deleted inserted replaced
0:8dfd5d2b5903 1:b76c75521d91
85 "--phosphopeptides", 85 "--phosphopeptides",
86 "-p", 86 "-p",
87 nargs=1, 87 nargs=1,
88 required=True, 88 required=True,
89 dest="phosphopeptides", 89 dest="phosphopeptides",
90 help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", 90 help=" ".join([
91 "Phosphopeptide data for experimental results, including the",
92 "intensities and the mapping to kinase domains, in tabular format"
93 ]),
91 ) 94 )
92 # UniProtKB/SwissProt DB input, SQLite 95 # UniProtKB/SwissProt DB input, SQLite
93 parser.add_argument( 96 parser.add_argument(
94 "--ppep_mapping_db", 97 "--ppep_mapping_db",
95 "-d", 98 "-d",
104 "-x", 107 "-x",
105 nargs=1, 108 nargs=1,
106 required=False, 109 required=False,
107 default=[], 110 default=[],
108 dest="species", 111 dest="species",
109 help="limit PhosphoSitePlus records to indicated species (field may be empty)", 112 help=" ".join([
113 "limit PhosphoSitePlus records to indicated species",
114 "(field may be empty)"
115 ]),
110 ) 116 )
111 117
112 # outputs: 118 # outputs:
113 # tabular output 119 # tabular output
114 parser.add_argument( 120 parser.add_argument(
172 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) 178 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e))
173 179
174 # determine species to limit records from PSP_Regulatory_Sites 180 # determine species to limit records from PSP_Regulatory_Sites
175 if options.species is None: 181 if options.species is None:
176 exit( 182 exit(
177 'Argument "species" is required (and may be empty) but not supplied' 183 'Argument "species" is required (& may be empty) but not supplied'
178 ) 184 )
179 try: 185 try:
180 if len(options.species) > 0: 186 if len(options.species) > 0:
181 species = options.species[0] 187 species = options.species[0]
182 else: 188 else:
214 DEPHOSPHOPEP = "DephosphoPep" 220 DEPHOSPHOPEP = "DephosphoPep"
215 DESCRIPTION = "Description" 221 DESCRIPTION = "Description"
216 FUNCTION_PHOSPHORESIDUE = ( 222 FUNCTION_PHOSPHORESIDUE = (
217 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" 223 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
218 ) 224 )
219 GENE_NAME = "Gene_Name" # Gene Name from UniProtKB 225 # Gene Name from UniProtKB
220 ON_FUNCTION = ( 226 GENE_NAME = "Gene_Name"
221 "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites 227 # ON_FUNCTION column from PSP_Regulatory_Sites
222 ) 228 ON_FUNCTION = ("ON_FUNCTION")
223 ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites 229 # NOTES column from PSP_Regulatory_Sites
224 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites 230 ON_NOTES = "NOTES"
225 ON_PROCESS = ( 231 # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
226 "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites 232 ON_OTHER_INTERACT = "ON_OTHER_INTERACT"
227 ) 233 # ON_PROCESS column from PSP_Regulatory_Sites
228 ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites 234 ON_PROCESS = ("ON_PROCESS")
235 # ON_PROT_INTERACT column from PSP_Regulatory_Sites
236 ON_PROT_INTERACT = "ON_PROT_INTERACT"
229 PHOSPHOPEPTIDE = "Phosphopeptide" 237 PHOSPHOPEPTIDE = "Phosphopeptide"
230 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" 238 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
231 PHOSPHORESIDUE = "Phosphoresidue" 239 PHOSPHORESIDUE = "Phosphoresidue"
232 PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" 240 PUTATIVE_UPSTREAM_DOMAINS = " ".join([
241 "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/",
242 "Phosphatases/Binding Domains"
243 ])
233 SEQUENCE = "Sequence" 244 SEQUENCE = "Sequence"
234 SEQUENCE10 = "Sequence10" 245 SEQUENCE10 = "Sequence10"
235 SEQUENCE7 = "Sequence7" 246 SEQUENCE7 = "Sequence7"
236 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" 247 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA"
237 UNIPROT_ID = "UniProt_ID" 248 UNIPROT_ID = "UniProt_ID"
326 INSERT INTO Citation ( 337 INSERT INTO Citation (
327 ObjectName, 338 ObjectName,
328 CitationData 339 CitationData
329 ) VALUES (?,?) 340 ) VALUES (?,?)
330 """ 341 """
331 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' 342 CITATION_INSERT_PSP = " ".join([
332 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' 343 "PhosphoSitePlus(R) (PSP) was created by Cell Signaling",
344 "Technology Inc. It is licensed under a Creative Commons",
345 "Attribution-NonCommercial-ShareAlike 3.0 Unported License.",
346 "When using PSP data or analyses in printed publications or",
347 "in online resources, the following acknowledgements must be",
348 "included: (a) the words",
349 '"PhosphoSitePlus(R), www.phosphosite.org" must',
350 "be included at appropriate places in the text or webpage,",
351 "and (b) the following citation must be included in the",
352 'bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser',
353 "JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,",
354 "PTMs and recalibrations. Nucleic Acids Res. 2015",
355 '43:D512-20. PMID: 25514926."'
356 ])
357 CITATION_INSERT_PSP_REF = " ".join([
358 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and',
359 'recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298,',
360 "https://doi.org/10.1093/nar/gkr1122"
361 ])
333 362
334 MRGFLTR_METADATA_COLUMNS = [ 363 MRGFLTR_METADATA_COLUMNS = [
335 "ppep_id", 364 "ppep_id",
336 "Sequence10", 365 "Sequence10",
337 "Sequence7", 366 "Sequence7",
386 m = re_tab.match(line) 415 m = re_tab.match(line)
387 upstream_map_p_peptide_list.append(m[0]) 416 upstream_map_p_peptide_list.append(m[0])
388 file1.close() 417 file1.close()
389 file1_encoded.close() 418 file1_encoded.close()
390 419
391 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed 420 # Get the list of phosphopeptides with the p's that represent
421 # the phosphorylation sites removed
392 re_phos = re.compile("p") 422 re_phos = re.compile("p")
393 423
394 end_time = time.process_time() # timer 424 end_time = time.process_time() # timer
395 print( 425 print(
396 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), 426 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,),
397 file=sys.stderr, 427 file=sys.stderr,
398 ) 428 )
399 429
400 # ----------- Get SwissProt data from SQLite database (start) ----------- 430 # -------- Get SwissProt data from SQLite database (start) -----------
401 # build UniProt sequence LUT and list of unique SwissProt sequences 431 # build UniProt sequence LUT and list of unique SwissProt sequences
402 432
403 # Open SwissProt SQLite database 433 # Open SwissProt SQLite database
404 conn = sql.connect(uniprot_sqlite) 434 conn = sql.connect(uniprot_sqlite)
405 cur = conn.cursor() 435 cur = conn.cursor()
463 Sequence = "" 493 Sequence = ""
464 UniProt_ID = "" 494 UniProt_ID = ""
465 Description = "" 495 Description = ""
466 Gene_Name = "" 496 Gene_Name = ""
467 497
468 # ----------- Get SwissProt data from SQLite database (finish) ----------- 498 # -------- Get SwissProt data from SQLite database (finish) -----------
469 499
470 end_time = time.process_time() # timer 500 end_time = time.process_time() # timer
471 print( 501 print(
472 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), 502 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,),
473 file=sys.stderr, 503 file=sys.stderr,
474 ) 504 )
475 505
476 # ----------- Get SwissProt data from SQLite database (start) ----------- 506 # -------- Get SwissProt data from SQLite database (start) -----------
477 # Open SwissProt SQLite database 507 # Open SwissProt SQLite database
478 conn = sql.connect(uniprot_sqlite) 508 conn = sql.connect(uniprot_sqlite)
479 cur = conn.cursor() 509 cur = conn.cursor()
480 510
481 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide 511 # Set up dictionary to aggregate results for phosphopeptides
512 # corresponding to dephosphoeptide
482 DephosphoPep_UniProtSeq_LUT = {} 513 DephosphoPep_UniProtSeq_LUT = {}
483 514
484 # Set up dictionary to accumulate results 515 # Set up dictionary to accumulate results
485 PhosphoPep_UniProtSeq_LUT = {} 516 PhosphoPep_UniProtSeq_LUT = {}
486 517
546 # ref: https://stackoverflow.com/a/4174955/15509512 577 # ref: https://stackoverflow.com/a/4174955/15509512
547 r = sorted(r, key=operator.itemgetter(0)) 578 r = sorted(r, key=operator.itemgetter(0))
548 # Get one tuple for each `phospho_pep` 579 # Get one tuple for each `phospho_pep`
549 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] 580 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
550 for (upid, gn, desc) in r: 581 for (upid, gn, desc) in r:
551 # Append pseudo-tuple per UniProt_ID but only when it is not present 582 # Append pseudo-tuple per UniProt_ID iff not present
552 if ( 583 if (
553 upid 584 upid
554 not in DephosphoPep_UniProtSeq_LUT[ 585 not in DephosphoPep_UniProtSeq_LUT[
555 (dephospho_pep, UNIPROT_ID) 586 (dephospho_pep, UNIPROT_ID)
556 ] 587 ]
569 conn.close() 600 conn.close()
570 # wipe local variables 601 # wipe local variables
571 phospho_pep = dephospho_pep = sequence = 0 602 phospho_pep = dephospho_pep = sequence = 0
572 upid = gn = desc = r = "" 603 upid = gn = desc = r = ""
573 604
574 # ----------- Get SwissProt data from SQLite database (finish) ----------- 605 # -------- Get SwissProt data from SQLite database (finish) -----------
575 606
576 end_time = time.process_time() # timer 607 end_time = time.process_time() # timer
577 print( 608 print(
578 "%0.6f finished reading and decoding '%s' [0.4]" 609 "%0.6f finished reading and decoding '%s' [0.4]"
579 % (end_time - start_time, upstream_map_filename_tab), 610 % (end_time - start_time, upstream_map_filename_tab),
606 "%0.6f added index to Upstream Map [1g_2]" 637 "%0.6f added index to Upstream Map [1g_2]"
607 % (end_time - start_time,), 638 % (end_time - start_time,),
608 file=sys.stderr, 639 file=sys.stderr,
609 ) # timer 640 ) # timer
610 641
611 # ########################################################################
612 # # trim upstream_data to include only the upstream map columns
613 # old_cols = upstream_data.columns.tolist()
614 # i = 0
615 # first_intensity = -1
616 # last_intensity = -1
617 # intensity_re = re.compile("Intensity.*")
618 # for col_name in old_cols:
619 # m = intensity_re.match(col_name)
620 # if m:
621 # last_intensity = i
622 # if first_intensity == -1:
623 # first_intensity = i
624 # i += 1
625 # # print('last intensity = %d' % last_intensity)
626 # col_PKCalpha = last_intensity + 2
627 #
628 # data_in_cols = [old_cols[0]] + old_cols[
629 # first_intensity: last_intensity + 1
630 # ]
631 #
632 # if upstream_data.empty:
633 # print("upstream_data is empty")
634 # exit(0)
635 #
636 # data_in = upstream_data.copy(deep=True)[data_in_cols]
637 ########################################################################
638 # trim upstream_data to include only the upstream map columns 642 # trim upstream_data to include only the upstream map columns
639 old_cols = upstream_data.columns.tolist() 643 old_cols = upstream_data.columns.tolist()
640 i = 0 644 i = 0
641 first_intensity = -1 645 first_intensity = -1
642 last_intensity = -1 646 last_intensity = -1
664 668
665 data_in = upstream_data.copy(deep=True)[data_in_cols] 669 data_in = upstream_data.copy(deep=True)[data_in_cols]
666 data_in.columns = data_col_names 670 data_in.columns = data_col_names
667 print("data_in") 671 print("data_in")
668 print(data_in) 672 print(data_in)
669 ######################################################################## 673 ######################################################################
670 674
671 # Convert floating-point integers to int64 integers 675 # Convert floating-point integers to int64 integers
672 # ref: https://stackoverflow.com/a/68497603/15509512 676 # ref: https://stackoverflow.com/a/68497603/15509512
673 data_in[list(data_in.columns[1:])] = ( 677 data_in[list(data_in.columns[1:])] = (
674 data_in[list(data_in.columns[1:])] 678 data_in[list(data_in.columns[1:])]
687 % (end_time - start_time,), 691 % (end_time - start_time,),
688 file=sys.stderr, 692 file=sys.stderr,
689 ) # timer 693 ) # timer
690 694
691 # Produce a dictionary of metadata for a single phosphopeptide. 695 # Produce a dictionary of metadata for a single phosphopeptide.
692 # This is a replacement of `UniProtInfo_subdict` in the original code. 696 # This is replaces `UniProtInfo_subdict` from the original code.
693 def pseq_to_subdict(phospho_pep): 697 def pseq_to_subdict(phospho_pep):
694 # Strip "p" from phosphopeptide sequence 698 # Strip "p" from phosphopeptide sequence
695 dephospho_pep = re_phos.sub("", phospho_pep) 699 dephospho_pep = re_phos.sub("", phospho_pep)
696 700
697 # Determine number of phosphoresidues in phosphopeptide 701 # Determine number of phosphoresidues in phosphopeptide
731 # ) 735 # )
732 # ) 736 # )
733 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: 737 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
734 raise PreconditionError( 738 raise PreconditionError(
735 phospho_pep, 739 phospho_pep,
736 "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", 740 " ".join([
741 "no matching phosphopeptide found in",
742 "PhosphoPep_UniProtSeq_LUT"
743 ])
737 ) 744 )
738 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: 745 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
739 raise PreconditionError( 746 raise PreconditionError(
740 dephospho_pep, 747 dephospho_pep,
741 "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", 748 " ".join([
749 "dephosphorylated phosphopeptide not found",
750 "in DephosphoPep_UniProtSeq_LUT"
751 ])
742 ) 752 )
743 if ( 753 if (
744 dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] 754 dephospho_pep != PhosphoPep_UniProtSeq_LUT[
755 (phospho_pep, DEPHOSPHOPEP)]
745 ): 756 ):
746 my_err_msg = "dephosphorylated phosphopeptide does not match " 757 my_err_msg = "dephosphorylated phosphopeptide does not match "
747 my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " 758 my_err_msg += "PhosphoPep_UniProtSeq_LUT"
748 my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] 759 my_err_msg += "[(phospho_pep,DEPHOSPHOPEP)] = "
760 my_err_msg += PhosphoPep_UniProtSeq_LUT[
761 (phospho_pep, DEPHOSPHOPEP)]
749 raise PreconditionError(dephospho_pep, my_err_msg) 762 raise PreconditionError(dephospho_pep, my_err_msg)
750 763
751 result[SEQUENCE] = [dephospho_pep] 764 result[SEQUENCE] = [dephospho_pep]
752 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ 765 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[
753 (dephospho_pep, UNIPROT_ID) 766 (dephospho_pep, UNIPROT_ID)
759 (dephospho_pep, GENE_NAME) 772 (dephospho_pep, GENE_NAME)
760 ] 773 ]
761 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: 774 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
762 raise PreconditionError( 775 raise PreconditionError(
763 dephospho_pep, 776 dephospho_pep,
764 "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", 777 "".join(
778 "no matching phosphopeptide found",
779 "in DephosphoPep_UniProtSeq_LUT")
765 ) 780 )
766 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ 781 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
767 (dephospho_pep, SEQUENCE) 782 (dephospho_pep, SEQUENCE)
768 ] 783 ]
769 if len(UniProtSeqList) < 1: 784 if len(UniProtSeqList) < 1:
770 print( 785 print(
771 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" 786 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] %s"
772 % dephospho_pep 787 % (dephospho_pep, "because value has zero length")
773 ) 788 )
774 # raise PreconditionError(
775 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
776 # 'value has zero length'
777 # )
778 for UniProtSeq in UniProtSeqList: 789 for UniProtSeq in UniProtSeqList:
779 i = 0 790 i = 0
780 phosphoresidues = [] 791 phosphoresidues = []
781 seq7s_set = set() 792 seq7s_set = set()
782 seq7s = [] 793 seq7s = []
852 seq7s_set.add(seq7) 863 seq7s_set.add(seq7)
853 864
854 # add Sequence10 865 # add Sequence10
855 if psite < 10: # phospho_pep at N terminus 866 if psite < 10: # phospho_pep at N terminus
856 seq10 = ( 867 seq10 = (
857 str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11] 868 str(UniProtSeq)[:psite] + "p" + str(
869 UniProtSeq)[psite: psite + 11]
858 ) 870 )
859 elif ( 871 elif (
860 len(UniProtSeq) - psite < 11 872 len(UniProtSeq) - psite < 11
861 ): # phospho_pep at C terminus 873 ): # phospho_pep at C terminus
862 seq10 = ( 874 seq10 = (
863 str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:] 875 str(UniProtSeq)[psite - 10: psite] + "p" + str(
876 UniProtSeq)[psite:]
864 ) 877 )
865 else: 878 else:
866 seq10 = str(UniProtSeq)[psite - 10: psite + 11] 879 seq10 = str(UniProtSeq)[psite - 10: psite + 11]
867 seq10 = seq10[:10] + "p" + seq10[10:] 880 seq10 = seq10[:10] + "p" + seq10[10:]
868 if seq10 not in seq10s_set: 881 if seq10 not in seq10s_set:
920 result[key] = joined_value 933 result[key] = joined_value
921 934
922 newstring = "; ".join( 935 newstring = "; ".join(
923 [", ".join(prez) for prez in result[PHOSPHORESIDUE]] 936 [", ".join(prez) for prez in result[PHOSPHORESIDUE]]
924 ) 937 )
925 # #separate the isoforms in PHOSPHORESIDUE column with ";"
926 # oldstring = result[PHOSPHORESIDUE]
927 # oldlist = list(oldstring)
928 # newstring = ""
929 # i = 0
930 # for e in oldlist:
931 # if e == ";":
932 # if numps > 1:
933 # if i%numps:
934 # newstring = newstring + ";"
935 # else:
936 # newstring = newstring + ","
937 # else:
938 # newstring = newstring + ";"
939 # i +=1
940 # else:
941 # newstring = newstring + e
942 result[PHOSPHORESIDUE] = newstring 938 result[PHOSPHORESIDUE] = newstring
943 939
944 # separate sequence7's by | 940 # separate sequence7's by |
945 oldstring = result[SEQUENCE7] 941 oldstring = result[SEQUENCE7]
946 oldlist = oldstring 942 oldlist = oldstring
971 % (end_time - start_time,), 967 % (end_time - start_time,),
972 file=sys.stderr, 968 file=sys.stderr,
973 ) # timer 969 ) # timer
974 970
975 # Construct dictionary from list of lists 971 # Construct dictionary from list of lists
976 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ 972 # ref: https://www.8bitavenue.com/\
973 # how-to-convert-list-of-lists-to-dictionary-in-python/
977 UniProt_Info = { 974 UniProt_Info = {
978 result[0]: result[1] 975 result[0]: result[1]
979 for result in result_list 976 for result in result_list
980 if result is not None 977 if result is not None
981 } 978 }
982 979
983 end_time = time.process_time() # timer 980 end_time = time.process_time() # timer
984 print( 981 print(
985 "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" 982 "%0.6f create dictionary mapping phosphopeptide %s"
986 % (end_time - start_time,), 983 % (end_time - start_time, "to metadata dictionary [C]"),
987 file=sys.stderr, 984 file=sys.stderr,
988 ) # timer 985 ) # timer
989 986
990 # cosmetic: add N_A to phosphopeptide rows with no hits 987 # cosmetic: add N_A to phosphopeptide rows with no hits
991 p_peptide_list = [] 988 p_peptide_list = []
1040 for _, row in uniprot_df.iterrows() 1037 for _, row in uniprot_df.iterrows()
1041 ] 1038 ]
1042 ).reset_index() 1039 ).reset_index()
1043 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] 1040 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
1044 1041
1045 # --- -------------- begin read PSP_Regulatory_sites --------------------------------- 1042 # read in PhosphoSitePlus Regulatory Sites dataset from SQLite
1046 # read in PhosphoSitePlus Regulatory Sites dataset 1043 # --- -------------- begin read PSP_Regulatory_sites -----
1047 # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
1048 conn = sql.connect(uniprot_sqlite) 1044 conn = sql.connect(uniprot_sqlite)
1049 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) 1045 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
1050 # Close SwissProt SQLite database 1046 # Close SwissProt SQLite database
1051 conn.close() 1047 conn.close()
1052 # ... -------------- end read PSP_Regulatory_sites ------------------------------------ 1048 # ... -------------- end read PSP_Regulatory_sites -------
1053 1049
1054 # keep only the human entries in dataframe 1050 # keep only the human entries in dataframe
1055 if len(species) > 0: 1051 if len(species) > 0:
1056 print( 1052 print(
1057 'Limit PhosphoSitesPlus records to species "' + species + '"' 1053 'Limit PhosphoSitesPlus records to species "' + species + '"'
1125 fp_series[i] = fp_series[i][2:] 1121 fp_series[i] = fp_series[i][2:]
1126 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") 1122 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
1127 fp_series[i] = fp_series[i].replace("; ; ; ", "; ") 1123 fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
1128 fp_series[i] = fp_series[i].replace("; ; ", "; ") 1124 fp_series[i] = fp_series[i].replace("; ; ", "; ")
1129 1125
1130 # turn blanks into N_A to signify the info was searched for but cannot be found 1126 # turn blanks into N_A to signify the info
1127 # that was searched for but cannot be found
1131 if fp_series[i] == "": 1128 if fp_series[i] == "":
1132 fp_series[i] = N_A 1129 fp_series[i] = N_A
1133 1130
1134 i += 1 1131 i += 1
1135 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series 1132 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
1201 {"Protein description": PHOSPHOPEPTIDE}, 1198 {"Protein description": PHOSPHOPEPTIDE},
1202 axis="columns", 1199 axis="columns",
1203 inplace=True, 1200 inplace=True,
1204 ) 1201 )
1205 1202
1206 # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
1207 res2 = sorted( 1203 res2 = sorted(
1208 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() 1204 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
1209 ) 1205 )
1210 data_in = data_in.loc[res2] 1206 data_in = data_in.loc[res2]
1211 1207
1226 print( 1222 print(
1227 "%0.6f refactored columns for Upstream Map [1g]" 1223 "%0.6f refactored columns for Upstream Map [1g]"
1228 % (end_time - start_time,), 1224 % (end_time - start_time,),
1229 file=sys.stderr, 1225 file=sys.stderr,
1230 ) # timer 1226 ) # timer
1231
1232 # #rename upstream columns in new list
1233 # new_cols = []
1234 # for name in cols:
1235 # if "_NetworKIN" in name:
1236 # name = name.split("_")[0]
1237 # if " motif" in name:
1238 # name = name.split(" motif")[0]
1239 # if " sequence " in name:
1240 # name = name.split(" sequence")[0]
1241 # if "_Phosida" in name:
1242 # name = name.split("_")[0]
1243 # if "_PhosphoSite" in name:
1244 # name = name.split("_")[0]
1245 # new_cols.append(name)
1246 1227
1247 # rename upstream columns in new list 1228 # rename upstream columns in new list
1248 def col_rename(name): 1229 def col_rename(name):
1249 if "_NetworKIN" in name: 1230 if "_NetworKIN" in name:
1250 name = name.split("_")[0] 1231 name = name.split("_")[0]
1274 new_cols_cast = new_cols 1255 new_cols_cast = new_cols
1275 new_cols_cast[0] = "p_peptide" 1256 new_cols_cast[0] = "p_peptide"
1276 upstream_data_cast.columns = new_cols_cast 1257 upstream_data_cast.columns = new_cols_cast
1277 upstream_data_cast["p_peptide"] = upstream_data.index 1258 upstream_data_cast["p_peptide"] = upstream_data.index
1278 1259
1279 # --- -------------- begin read upstream_data_melt ------------------------------------ 1260 # Get melted kinase mapping data from SQLite database
1280 # ----------- Get melted kinase mapping data from SQLite database (start) ----------- 1261 # --- begin read upstream_data_melt ---------------------------------
1281 conn = sql.connect(uniprot_sqlite) 1262 conn = sql.connect(uniprot_sqlite)
1282 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) 1263 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
1283 # Close SwissProt SQLite database 1264 # Close SwissProt SQLite database
1284 conn.close() 1265 conn.close()
1285 upstream_data_melt = upstream_data_melt_df.copy() 1266 upstream_data_melt = upstream_data_melt_df.copy()
1301 print( 1282 print(
1302 "%0.6f upstream_data_melt_df pre-dedup has %d rows" 1283 "%0.6f upstream_data_melt_df pre-dedup has %d rows"
1303 % (end_time - start_time, len(upstream_data_melt.axes[0])), 1284 % (end_time - start_time, len(upstream_data_melt.axes[0])),
1304 file=sys.stderr, 1285 file=sys.stderr,
1305 ) 1286 )
1306 # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- 1287 # ... end read upstream_data_melt ---------------------------------
1307 # ... -------------- end read upstream_data_melt --------------------------------------
1308 1288
1309 end_time = time.process_time() # timer 1289 end_time = time.process_time() # timer
1310 print( 1290 print(
1311 "%0.6f melted and minimized Upstream Map dataframe [1h_2]" 1291 "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
1312 % (end_time - start_time,), 1292 % (end_time - start_time,),
1330 for el in upstream_delta_melt_LoL: 1310 for el in upstream_delta_melt_LoL:
1331 (p_peptide, characterization, X) = tuple(el) 1311 (p_peptide, characterization, X) = tuple(el)
1332 if p_peptide in melt_dict: 1312 if p_peptide in melt_dict:
1333 melt_dict[p_peptide].append(characterization) 1313 melt_dict[p_peptide].append(characterization)
1334 else: 1314 else:
1335 exit( 1315 los = [
1336 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' 1316 "Phosphopeptide %s" % p_peptide,
1337 % (p_peptide) 1317 "not found in ppep_mapping_db:",
1338 ) 1318 '"phopsphopeptides" and "ppep_mapping_db" must both',
1319 "originate from the same run of mqppep_kinase_mapping"
1320 ]
1321 exit(" ".join(los))
1339 1322
1340 end_time = time.process_time() # timer 1323 end_time = time.process_time() # timer
1341 print( 1324 print(
1342 "%0.6f appended peptide characterizations [1h_2b]" 1325 "%0.6f appended peptide characterizations [1h_2b]"
1343 % (end_time - start_time,), 1326 % (end_time - start_time,),
1395 FUNCTION_PHOSPHORESIDUE, 1378 FUNCTION_PHOSPHORESIDUE,
1396 PUTATIVE_UPSTREAM_DOMAINS, 1379 PUTATIVE_UPSTREAM_DOMAINS,
1397 ] 1380 ]
1398 ] 1381 ]
1399 1382
1400 # cols_output_prelim = output_df.columns.tolist()
1401 #
1402 # print("cols_output_prelim")
1403 # print(cols_output_prelim)
1404 #
1405 # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
1406 #
1407 # print("cols_output with p-peptide")
1408 # print(cols_output)
1409 #
1410 # cols_output = [col for col in cols_output if not col == "p-peptide"]
1411 #
1412 # print("cols_output")
1413 # print(cols_output)
1414 #
1415 # output_df = output_df[cols_output]
1416
1417 # join output_df back to quantitative columns in data_in df 1383 # join output_df back to quantitative columns in data_in df
1418 quant_cols = data_in.columns.tolist() 1384 quant_cols = data_in.columns.tolist()
1419 quant_cols = quant_cols[1:] 1385 quant_cols = quant_cols[1:]
1420 quant_data = data_in[quant_cols] 1386 quant_data = data_in[quant_cols]
1421 1387
1422 # ----------- Write merge/filter metadata to SQLite database (start) ----------- 1388 # ---- Write merge/filter metadata to SQLite database (start) ----
1423 # Open SwissProt SQLite database 1389 # Open SwissProt SQLite database
1424 conn = sql.connect(output_sqlite) 1390 conn = sql.connect(output_sqlite)
1425 cur = conn.cursor() 1391 cur = conn.cursor()
1426 1392
1427 cur.executescript(MRGFLTR_DDL) 1393 cur.executescript(MRGFLTR_DDL)
1465 method="multi", 1431 method="multi",
1466 ) 1432 )
1467 1433
1468 # Close SwissProt SQLite database 1434 # Close SwissProt SQLite database
1469 conn.close() 1435 conn.close()
1470 # ----------- Write merge/filter metadata to SQLite database (finish) ----------- 1436 # ---- Write merge/filter metadata to SQLite database (finish) ----
1471 1437
1472 output_df = output_df.merge( 1438 output_df = output_df.merge(
1473 quant_data, 1439 quant_data,
1474 how="right", 1440 how="right",
1475 left_on=PHOSPHOPEPTIDE, 1441 left_on=PHOSPHOPEPTIDE,
1478 output_cols = output_df.columns.tolist() 1444 output_cols = output_df.columns.tolist()
1479 output_cols = output_cols[:-1] 1445 output_cols = output_cols[:-1]
1480 output_df = output_df[output_cols] 1446 output_df = output_df[output_cols]
1481 1447
1482 # cosmetic changes to Upstream column 1448 # cosmetic changes to Upstream column
1449 # fill the NaN with "" for those Phosphopeptides that got a
1450 # "WARNING: Failed match for " in the upstream mapping
1483 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ 1451 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
1484 PUTATIVE_UPSTREAM_DOMAINS 1452 PUTATIVE_UPSTREAM_DOMAINS
1485 ].fillna( 1453 ].fillna(
1486 "" 1454 ""
1487 ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping 1455 )
1488 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) 1456 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
1489 i = 0 1457 i = 0
1490 while i < len(us_series): 1458 while i < len(us_series):
1491 # turn blanks into N_A to signify the info was searched for but cannot be found 1459 # turn blanks into N_A to signify the info
1460 # that was searched for but cannot be found
1492 if us_series[i] == "": 1461 if us_series[i] == "":
1493 us_series[i] = N_A 1462 us_series[i] = N_A
1494 i += 1 1463 i += 1
1495 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series 1464 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
1496 1465
1528 ) # timer 1497 ) # timer
1529 1498
1530 # Rev. 7/1/2016 1499 # Rev. 7/1/2016
1531 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's 1500 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
1532 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS 1501 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
1533 # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ 1502 # Rev. 12/2/2021: Converted to Python from ipynb; use fast \
1534 # read from SwissProt SQLite database 1503 # Aho-Corasick searching; \
1504 # read from SwissProt SQLite database
1535 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper 1505 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
1536 1506
1537 # 1507 #
1538 # copied from Excel Output Script.ipynb END # 1508 # copied from Excel Output Script.ipynb END #
1539 # 1509 #