comparison: mqppep_mrgfltr.py @ 1:b76c75521d91 (draft)
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 43e7a43b545c24b2dc33d039198551c032aa79be
author | galaxyp
date | Fri, 28 Oct 2022 18:26:42 +0000
parents | 8dfd5d2b5903
children |
0:8dfd5d2b5903 | 1:b76c75521d91 |
---|---|
85 "--phosphopeptides", | 85 "--phosphopeptides", |
86 "-p", | 86 "-p", |
87 nargs=1, | 87 nargs=1, |
88 required=True, | 88 required=True, |
89 dest="phosphopeptides", | 89 dest="phosphopeptides", |
90 help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", | 90 help=" ".join([ |
91 "Phosphopeptide data for experimental results, including the", | |
92 "intensities and the mapping to kinase domains, in tabular format" | |
93 ]), | |
91 ) | 94 ) |
92 # UniProtKB/SwissProt DB input, SQLite | 95 # UniProtKB/SwissProt DB input, SQLite |
93 parser.add_argument( | 96 parser.add_argument( |
94 "--ppep_mapping_db", | 97 "--ppep_mapping_db", |
95 "-d", | 98 "-d", |
104 "-x", | 107 "-x", |
105 nargs=1, | 108 nargs=1, |
106 required=False, | 109 required=False, |
107 default=[], | 110 default=[], |
108 dest="species", | 111 dest="species", |
109 help="limit PhosphoSitePlus records to indicated species (field may be empty)", | 112 help=" ".join([ |
113 "limit PhosphoSitePlus records to indicated species", | |
114 "(field may be empty)" | |
115 ]), | |
110 ) | 116 ) |
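The two hunks above replace long single-line `help=` strings with `" ".join([...])`, so each source line stays within the linter's length limit while argparse still receives one continuous help string. A minimal sketch of that pattern, mirroring the `--species` option shown above (the parser here is standalone, not the tool's full argument set):

```python
import argparse

parser = argparse.ArgumentParser(prog="example")
parser.add_argument(
    "--species",
    "-x",
    nargs=1,
    required=False,
    default=[],
    dest="species",
    # join short fragments so no source line is overly long,
    # yet argparse sees a single space-separated help string
    help=" ".join([
        "limit PhosphoSitePlus records to indicated species",
        "(field may be empty)",
    ]),
)
print(parser.parse_args(["--species", "human"]).species)  # -> ['human']
```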
111 | 117 |
112 # outputs: | 118 # outputs: |
113 # tabular output | 119 # tabular output |
114 parser.add_argument( | 120 parser.add_argument( |
172 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) | 178 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) |
173 | 179 |
174 # determine species to limit records from PSP_Regulatory_Sites | 180 # determine species to limit records from PSP_Regulatory_Sites |
175 if options.species is None: | 181 if options.species is None: |
176 exit( | 182 exit( |
177 'Argument "species" is required (and may be empty) but not supplied' | 183 'Argument "species" is required (& may be empty) but not supplied' |
178 ) | 184 ) |
179 try: | 185 try: |
180 if len(options.species) > 0: | 186 if len(options.species) > 0: |
181 species = options.species[0] | 187 species = options.species[0] |
182 else: | 188 else: |
214 DEPHOSPHOPEP = "DephosphoPep" | 220 DEPHOSPHOPEP = "DephosphoPep" |
215 DESCRIPTION = "Description" | 221 DESCRIPTION = "Description" |
216 FUNCTION_PHOSPHORESIDUE = ( | 222 FUNCTION_PHOSPHORESIDUE = ( |
217 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" | 223 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" |
218 ) | 224 ) |
219 GENE_NAME = "Gene_Name" # Gene Name from UniProtKB | 225 # Gene Name from UniProtKB |
220 ON_FUNCTION = ( | 226 GENE_NAME = "Gene_Name" |
221 "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites | 227 # ON_FUNCTION column from PSP_Regulatory_Sites |
222 ) | 228 ON_FUNCTION = ("ON_FUNCTION") |
223 ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites | 229 # NOTES column from PSP_Regulatory_Sites |
224 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites | 230 ON_NOTES = "NOTES" |
225 ON_PROCESS = ( | 231 # ON_OTHER_INTERACT column from PSP_Regulatory_Sites |
226 "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites | 232 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" |
227 ) | 233 # ON_PROCESS column from PSP_Regulatory_Sites |
228 ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites | 234 ON_PROCESS = ("ON_PROCESS") |
235 # ON_PROT_INTERACT column from PSP_Regulatory_Sites | |
236 ON_PROT_INTERACT = "ON_PROT_INTERACT" | |
229 PHOSPHOPEPTIDE = "Phosphopeptide" | 237 PHOSPHOPEPTIDE = "Phosphopeptide" |
230 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" | 238 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" |
231 PHOSPHORESIDUE = "Phosphoresidue" | 239 PHOSPHORESIDUE = "Phosphoresidue" |
232 PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" | 240 PUTATIVE_UPSTREAM_DOMAINS = " ".join([ |
241 "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/", | |
242 "Phosphatases/Binding Domains" | |
243 ]) | |
233 SEQUENCE = "Sequence" | 244 SEQUENCE = "Sequence" |
234 SEQUENCE10 = "Sequence10" | 245 SEQUENCE10 = "Sequence10" |
235 SEQUENCE7 = "Sequence7" | 246 SEQUENCE7 = "Sequence7" |
236 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" | 247 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" |
237 UNIPROT_ID = "UniProt_ID" | 248 UNIPROT_ID = "UniProt_ID" |
326 INSERT INTO Citation ( | 337 INSERT INTO Citation ( |
327 ObjectName, | 338 ObjectName, |
328 CitationData | 339 CitationData |
329 ) VALUES (?,?) | 340 ) VALUES (?,?) |
330 """ | 341 """ |
331 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' | 342 CITATION_INSERT_PSP = " ".join([ |
332 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' | 343 "PhosphoSitePlus(R) (PSP) was created by Cell Signaling", |
344 "Technology Inc. It is licensed under a Creative Commons", | |
345 "Attribution-NonCommercial-ShareAlike 3.0 Unported License.", | |
346 "When using PSP data or analyses in printed publications or", | |
347 "in online resources, the following acknowledgements must be", | |
348 "included: (a) the words", | |
349 '"PhosphoSitePlus(R), www.phosphosite.org" must', | |
350 "be included at appropriate places in the text or webpage,", | |
351 "and (b) the following citation must be included in the", | |
352 'bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser', | |
353 "JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations,", | |
354 "PTMs and recalibrations. Nucleic Acids Res. 2015", | |
355 '43:D512-20. PMID: 25514926."' | |
356 ]) | |
357 CITATION_INSERT_PSP_REF = " ".join([ | |
358 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and', | |
359 'recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298,', | |
360 "https://doi.org/10.1093/nar/gkr1122" | |
361 ]) | |
333 | 362 |
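`CITATION_INSERT_STMT` (shown earlier in this hunk) takes two parameters, `ObjectName` and `CitationData`, which keeps the long PSP citation text out of the SQL string itself. A hedged sketch of how such a parameterized insert is typically executed with sqlite3; the `CREATE TABLE` is a stand-in for the tool's real DDL (`MRGFLTR_DDL`), which is not shown in this excerpt:

```python
import sqlite3

conn = sqlite3.connect(":memory:")  # stand-in for output_sqlite
cur = conn.cursor()
# assumed minimal schema; the real table comes from MRGFLTR_DDL
cur.execute("CREATE TABLE Citation (ObjectName TEXT, CitationData TEXT)")

CITATION_INSERT_STMT = """
    INSERT INTO Citation (
      ObjectName,
      CitationData
    ) VALUES (?,?)
    """

# parameterized insert: the driver handles quoting of the long citation text
cur.execute(CITATION_INSERT_STMT, ("PhosphoSitePlus", "example citation text"))
conn.commit()
conn.close()
```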
334 MRGFLTR_METADATA_COLUMNS = [ | 363 MRGFLTR_METADATA_COLUMNS = [ |
335 "ppep_id", | 364 "ppep_id", |
336 "Sequence10", | 365 "Sequence10", |
337 "Sequence7", | 366 "Sequence7", |
386 m = re_tab.match(line) | 415 m = re_tab.match(line) |
387 upstream_map_p_peptide_list.append(m[0]) | 416 upstream_map_p_peptide_list.append(m[0]) |
388 file1.close() | 417 file1.close() |
389 file1_encoded.close() | 418 file1_encoded.close() |
390 | 419 |
391 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed | 420 # Get the list of phosphopeptides with the p's that represent |
421 # the phosphorylation sites removed | |
392 re_phos = re.compile("p") | 422 re_phos = re.compile("p") |
393 | 423 |
394 end_time = time.process_time() # timer | 424 end_time = time.process_time() # timer |
395 print( | 425 print( |
396 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), | 426 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), |
397 file=sys.stderr, | 427 file=sys.stderr, |
398 ) | 428 ) |
399 | 429 |
400 # ----------- Get SwissProt data from SQLite database (start) ----------- | 430 # -------- Get SwissProt data from SQLite database (start) ----------- |
401 # build UniProt sequence LUT and list of unique SwissProt sequences | 431 # build UniProt sequence LUT and list of unique SwissProt sequences |
402 | 432 |
403 # Open SwissProt SQLite database | 433 # Open SwissProt SQLite database |
404 conn = sql.connect(uniprot_sqlite) | 434 conn = sql.connect(uniprot_sqlite) |
405 cur = conn.cursor() | 435 cur = conn.cursor() |
463 Sequence = "" | 493 Sequence = "" |
464 UniProt_ID = "" | 494 UniProt_ID = "" |
465 Description = "" | 495 Description = "" |
466 Gene_Name = "" | 496 Gene_Name = "" |
467 | 497 |
468 # ----------- Get SwissProt data from SQLite database (finish) ----------- | 498 # -------- Get SwissProt data from SQLite database (finish) ----------- |
469 | 499 |
470 end_time = time.process_time() # timer | 500 end_time = time.process_time() # timer |
471 print( | 501 print( |
472 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), | 502 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), |
473 file=sys.stderr, | 503 file=sys.stderr, |
474 ) | 504 ) |
475 | 505 |
476 # ----------- Get SwissProt data from SQLite database (start) ----------- | 506 # -------- Get SwissProt data from SQLite database (start) ----------- |
477 # Open SwissProt SQLite database | 507 # Open SwissProt SQLite database |
478 conn = sql.connect(uniprot_sqlite) | 508 conn = sql.connect(uniprot_sqlite) |
479 cur = conn.cursor() | 509 cur = conn.cursor() |
480 | 510 |
481 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide | 511 # Set up dictionary to aggregate results for phosphopeptides |
512 # corresponding to dephosphoeptide | |
482 DephosphoPep_UniProtSeq_LUT = {} | 513 DephosphoPep_UniProtSeq_LUT = {} |
483 | 514 |
484 # Set up dictionary to accumulate results | 515 # Set up dictionary to accumulate results |
485 PhosphoPep_UniProtSeq_LUT = {} | 516 PhosphoPep_UniProtSeq_LUT = {} |
486 | 517 |
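The section above opens the SwissProt SQLite database and fills two dictionaries keyed by `(peptide, field)` tuples. A rough sketch of that accumulate-into-a-LUT pattern; the query, table, and column names below are assumptions for illustration only, since the real schema comes from the ppep_mapping_db built earlier in the pipeline:

```python
import sqlite3 as sql

UNIPROT_ID = "UniProt_ID"

conn = sql.connect("mqppep.sqlite")  # stand-in for uniprot_sqlite
cur = conn.cursor()

DephosphoPep_UniProtSeq_LUT = {}
# hypothetical query; the real SQL is defined elsewhere in the script
cur.execute("SELECT dephospho_pep, uniprot_id FROM deppep_uniprot")
for dephospho_pep, upid in cur.fetchall():
    # accumulate one list of accessions per dephosphorylated peptide
    DephosphoPep_UniProtSeq_LUT.setdefault(
        (dephospho_pep, UNIPROT_ID), []
    ).append(upid)

cur.close()
conn.close()
```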
546 # ref: https://stackoverflow.com/a/4174955/15509512 | 577 # ref: https://stackoverflow.com/a/4174955/15509512 |
547 r = sorted(r, key=operator.itemgetter(0)) | 578 r = sorted(r, key=operator.itemgetter(0)) |
548 # Get one tuple for each `phospho_pep` | 579 # Get one tuple for each `phospho_pep` |
549 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] | 580 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] |
550 for (upid, gn, desc) in r: | 581 for (upid, gn, desc) in r: |
551 # Append pseudo-tuple per UniProt_ID but only when it is not present | 582 # Append pseudo-tuple per UniProt_ID iff not present |
552 if ( | 583 if ( |
553 upid | 584 upid |
554 not in DephosphoPep_UniProtSeq_LUT[ | 585 not in DephosphoPep_UniProtSeq_LUT[ |
555 (dephospho_pep, UNIPROT_ID) | 586 (dephospho_pep, UNIPROT_ID) |
556 ] | 587 ] |
569 conn.close() | 600 conn.close() |
570 # wipe local variables | 601 # wipe local variables |
571 phospho_pep = dephospho_pep = sequence = 0 | 602 phospho_pep = dephospho_pep = sequence = 0 |
572 upid = gn = desc = r = "" | 603 upid = gn = desc = r = "" |
573 | 604 |
574 # ----------- Get SwissProt data from SQLite database (finish) ----------- | 605 # -------- Get SwissProt data from SQLite database (finish) ----------- |
575 | 606 |
576 end_time = time.process_time() # timer | 607 end_time = time.process_time() # timer |
577 print( | 608 print( |
578 "%0.6f finished reading and decoding '%s' [0.4]" | 609 "%0.6f finished reading and decoding '%s' [0.4]" |
579 % (end_time - start_time, upstream_map_filename_tab), | 610 % (end_time - start_time, upstream_map_filename_tab), |
606 "%0.6f added index to Upstream Map [1g_2]" | 637 "%0.6f added index to Upstream Map [1g_2]" |
607 % (end_time - start_time,), | 638 % (end_time - start_time,), |
608 file=sys.stderr, | 639 file=sys.stderr, |
609 ) # timer | 640 ) # timer |
610 | 641 |
611 # ######################################################################## | |
612 # # trim upstream_data to include only the upstream map columns | |
613 # old_cols = upstream_data.columns.tolist() | |
614 # i = 0 | |
615 # first_intensity = -1 | |
616 # last_intensity = -1 | |
617 # intensity_re = re.compile("Intensity.*") | |
618 # for col_name in old_cols: | |
619 # m = intensity_re.match(col_name) | |
620 # if m: | |
621 # last_intensity = i | |
622 # if first_intensity == -1: | |
623 # first_intensity = i | |
624 # i += 1 | |
625 # # print('last intensity = %d' % last_intensity) | |
626 # col_PKCalpha = last_intensity + 2 | |
627 # | |
628 # data_in_cols = [old_cols[0]] + old_cols[ | |
629 # first_intensity: last_intensity + 1 | |
630 # ] | |
631 # | |
632 # if upstream_data.empty: | |
633 # print("upstream_data is empty") | |
634 # exit(0) | |
635 # | |
636 # data_in = upstream_data.copy(deep=True)[data_in_cols] | |
637 ######################################################################## | |
638 # trim upstream_data to include only the upstream map columns | 642 # trim upstream_data to include only the upstream map columns |
639 old_cols = upstream_data.columns.tolist() | 643 old_cols = upstream_data.columns.tolist() |
640 i = 0 | 644 i = 0 |
641 first_intensity = -1 | 645 first_intensity = -1 |
642 last_intensity = -1 | 646 last_intensity = -1 |
664 | 668 |
665 data_in = upstream_data.copy(deep=True)[data_in_cols] | 669 data_in = upstream_data.copy(deep=True)[data_in_cols] |
666 data_in.columns = data_col_names | 670 data_in.columns = data_col_names |
667 print("data_in") | 671 print("data_in") |
668 print(data_in) | 672 print(data_in) |
669 ######################################################################## | 673 ###################################################################### |
670 | 674 |
671 # Convert floating-point integers to int64 integers | 675 # Convert floating-point integers to int64 integers |
672 # ref: https://stackoverflow.com/a/68497603/15509512 | 676 # ref: https://stackoverflow.com/a/68497603/15509512 |
673 data_in[list(data_in.columns[1:])] = ( | 677 data_in[list(data_in.columns[1:])] = ( |
674 data_in[list(data_in.columns[1:])] | 678 data_in[list(data_in.columns[1:])] |
687 % (end_time - start_time,), | 691 % (end_time - start_time,), |
688 file=sys.stderr, | 692 file=sys.stderr, |
689 ) # timer | 693 ) # timer |
690 | 694 |
691 # Produce a dictionary of metadata for a single phosphopeptide. | 695 # Produce a dictionary of metadata for a single phosphopeptide. |
692 # This is a replacement of `UniProtInfo_subdict` in the original code. | 696 # This is replaces `UniProtInfo_subdict` from the original code. |
693 def pseq_to_subdict(phospho_pep): | 697 def pseq_to_subdict(phospho_pep): |
694 # Strip "p" from phosphopeptide sequence | 698 # Strip "p" from phosphopeptide sequence |
695 dephospho_pep = re_phos.sub("", phospho_pep) | 699 dephospho_pep = re_phos.sub("", phospho_pep) |
696 | 700 |
697 # Determine number of phosphoresidues in phosphopeptide | 701 # Determine number of phosphoresidues in phosphopeptide |
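The comments above describe stripping the lower-case "p" markers to recover the unmodified peptide and then counting phosphoresidues. A minimal illustration of both steps using the same `re_phos` pattern and a made-up peptide; the counting method shown is one obvious way to do it, not necessarily the one used later in the function:

```python
import re

re_phos = re.compile("p")

phospho_pep = "AAApSGGApTK"
# remove the "p" markers to obtain the dephosphorylated sequence
dephospho_pep = re_phos.sub("", phospho_pep)              # -> "AAASGGATK"
# count phosphoresidues as the number of markers removed
number_of_phosphosites = len(re_phos.findall(phospho_pep))  # -> 2
print(dephospho_pep, number_of_phosphosites)
```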
731 # ) | 735 # ) |
732 # ) | 736 # ) |
733 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: | 737 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: |
734 raise PreconditionError( | 738 raise PreconditionError( |
735 phospho_pep, | 739 phospho_pep, |
736 "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", | 740 " ".join([ |
741 "no matching phosphopeptide found in", | |
742 "PhosphoPep_UniProtSeq_LUT" | |
743 ]) | |
737 ) | 744 ) |
738 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: | 745 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: |
739 raise PreconditionError( | 746 raise PreconditionError( |
740 dephospho_pep, | 747 dephospho_pep, |
741 "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", | 748 " ".join([ |
749 "dephosphorylated phosphopeptide not found", | |
750 "in DephosphoPep_UniProtSeq_LUT" | |
751 ]) | |
742 ) | 752 ) |
743 if ( | 753 if ( |
744 dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] | 754 dephospho_pep != PhosphoPep_UniProtSeq_LUT[ |
755 (phospho_pep, DEPHOSPHOPEP)] | |
745 ): | 756 ): |
746 my_err_msg = "dephosphorylated phosphopeptide does not match " | 757 my_err_msg = "dephosphorylated phosphopeptide does not match " |
747 my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " | 758 my_err_msg += "PhosphoPep_UniProtSeq_LUT" |
748 my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] | 759 my_err_msg += "[(phospho_pep,DEPHOSPHOPEP)] = " |
760 my_err_msg += PhosphoPep_UniProtSeq_LUT[ | |
761 (phospho_pep, DEPHOSPHOPEP)] | |
749 raise PreconditionError(dephospho_pep, my_err_msg) | 762 raise PreconditionError(dephospho_pep, my_err_msg) |
750 | 763 |
751 result[SEQUENCE] = [dephospho_pep] | 764 result[SEQUENCE] = [dephospho_pep] |
752 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ | 765 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ |
753 (dephospho_pep, UNIPROT_ID) | 766 (dephospho_pep, UNIPROT_ID) |
759 (dephospho_pep, GENE_NAME) | 772 (dephospho_pep, GENE_NAME) |
760 ] | 773 ] |
761 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: | 774 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: |
762 raise PreconditionError( | 775 raise PreconditionError( |
763 dephospho_pep, | 776 dephospho_pep, |
764 "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", | 777 "".join( |
778 "no matching phosphopeptide found", | |
779 "in DephosphoPep_UniProtSeq_LUT") | |
765 ) | 780 ) |
766 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ | 781 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ |
767 (dephospho_pep, SEQUENCE) | 782 (dephospho_pep, SEQUENCE) |
768 ] | 783 ] |
769 if len(UniProtSeqList) < 1: | 784 if len(UniProtSeqList) < 1: |
770 print( | 785 print( |
771 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" | 786 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] %s" |
772 % dephospho_pep | 787 % (dephospho_pep, "because value has zero length") |
773 ) | 788 ) |
774 # raise PreconditionError( | |
775 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", | |
776 # 'value has zero length' | |
777 # ) | |
778 for UniProtSeq in UniProtSeqList: | 789 for UniProtSeq in UniProtSeqList: |
779 i = 0 | 790 i = 0 |
780 phosphoresidues = [] | 791 phosphoresidues = [] |
781 seq7s_set = set() | 792 seq7s_set = set() |
782 seq7s = [] | 793 seq7s = [] |
852 seq7s_set.add(seq7) | 863 seq7s_set.add(seq7) |
853 | 864 |
854 # add Sequence10 | 865 # add Sequence10 |
855 if psite < 10: # phospho_pep at N terminus | 866 if psite < 10: # phospho_pep at N terminus |
856 seq10 = ( | 867 seq10 = ( |
857 str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11] | 868 str(UniProtSeq)[:psite] + "p" + str( |
869 UniProtSeq)[psite: psite + 11] | |
858 ) | 870 ) |
859 elif ( | 871 elif ( |
860 len(UniProtSeq) - psite < 11 | 872 len(UniProtSeq) - psite < 11 |
861 ): # phospho_pep at C terminus | 873 ): # phospho_pep at C terminus |
862 seq10 = ( | 874 seq10 = ( |
863 str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:] | 875 str(UniProtSeq)[psite - 10: psite] + "p" + str( |
876 UniProtSeq)[psite:] | |
864 ) | 877 ) |
865 else: | 878 else: |
866 seq10 = str(UniProtSeq)[psite - 10: psite + 11] | 879 seq10 = str(UniProtSeq)[psite - 10: psite + 11] |
867 seq10 = seq10[:10] + "p" + seq10[10:] | 880 seq10 = seq10[:10] + "p" + seq10[10:] |
868 if seq10 not in seq10s_set: | 881 if seq10 not in seq10s_set: |
920 result[key] = joined_value | 933 result[key] = joined_value |
921 | 934 |
922 newstring = "; ".join( | 935 newstring = "; ".join( |
923 [", ".join(prez) for prez in result[PHOSPHORESIDUE]] | 936 [", ".join(prez) for prez in result[PHOSPHORESIDUE]] |
924 ) | 937 ) |
925 # #separate the isoforms in PHOSPHORESIDUE column with ";" | |
926 # oldstring = result[PHOSPHORESIDUE] | |
927 # oldlist = list(oldstring) | |
928 # newstring = "" | |
929 # i = 0 | |
930 # for e in oldlist: | |
931 # if e == ";": | |
932 # if numps > 1: | |
933 # if i%numps: | |
934 # newstring = newstring + ";" | |
935 # else: | |
936 # newstring = newstring + "," | |
937 # else: | |
938 # newstring = newstring + ";" | |
939 # i +=1 | |
940 # else: | |
941 # newstring = newstring + e | |
942 result[PHOSPHORESIDUE] = newstring | 938 result[PHOSPHORESIDUE] = newstring |
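The nested join above is what remains after the commented-out character-by-character loop (deleted in this revision) was dropped: residues within one isoform are joined with ", " and the per-isoform groups are then joined with "; ". A small worked example of the same expression, with made-up residue labels:

```python
# result[PHOSPHORESIDUE] holds one sequence of residue labels per isoform
phosphoresidue = [("pS4", "pT7"), ("pS11",)]

newstring = "; ".join(
    [", ".join(prez) for prez in phosphoresidue]
)
print(newstring)  # -> "pS4, pT7; pS11"
```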
943 | 939 |
944 # separate sequence7's by | | 940 # separate sequence7's by | |
945 oldstring = result[SEQUENCE7] | 941 oldstring = result[SEQUENCE7] |
946 oldlist = oldstring | 942 oldlist = oldstring |
971 % (end_time - start_time,), | 967 % (end_time - start_time,), |
972 file=sys.stderr, | 968 file=sys.stderr, |
973 ) # timer | 969 ) # timer |
974 | 970 |
975 # Construct dictionary from list of lists | 971 # Construct dictionary from list of lists |
976 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ | 972 # ref: https://www.8bitavenue.com/\ |
973 # how-to-convert-list-of-lists-to-dictionary-in-python/ | |
977 UniProt_Info = { | 974 UniProt_Info = { |
978 result[0]: result[1] | 975 result[0]: result[1] |
979 for result in result_list | 976 for result in result_list |
980 if result is not None | 977 if result is not None |
981 } | 978 } |
982 | 979 |
983 end_time = time.process_time() # timer | 980 end_time = time.process_time() # timer |
984 print( | 981 print( |
985 "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" | 982 "%0.6f create dictionary mapping phosphopeptide %s" |
986 % (end_time - start_time,), | 983 % (end_time - start_time, "to metadata dictionary [C]"), |
987 file=sys.stderr, | 984 file=sys.stderr, |
988 ) # timer | 985 ) # timer |
989 | 986 |
990 # cosmetic: add N_A to phosphopeptide rows with no hits | 987 # cosmetic: add N_A to phosphopeptide rows with no hits |
991 p_peptide_list = [] | 988 p_peptide_list = [] |
1040 for _, row in uniprot_df.iterrows() | 1037 for _, row in uniprot_df.iterrows() |
1041 ] | 1038 ] |
1042 ).reset_index() | 1039 ).reset_index() |
1043 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] | 1040 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] |
1044 | 1041 |
1045 # --- -------------- begin read PSP_Regulatory_sites --------------------------------- | 1042 # read in PhosphoSitePlus Regulatory Sites dataset from SQLite |
1046 # read in PhosphoSitePlus Regulatory Sites dataset | 1043 # --- -------------- begin read PSP_Regulatory_sites ----- |
1047 # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- | |
1048 conn = sql.connect(uniprot_sqlite) | 1044 conn = sql.connect(uniprot_sqlite) |
1049 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) | 1045 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) |
1050 # Close SwissProt SQLite database | 1046 # Close SwissProt SQLite database |
1051 conn.close() | 1047 conn.close() |
1052 # ... -------------- end read PSP_Regulatory_sites ------------------------------------ | 1048 # ... -------------- end read PSP_Regulatory_sites ------- |
1053 | 1049 |
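The block above (and the melted kinase-mapping read further down) follows the same pattern: open the SQLite connection, pull the whole query result into a DataFrame with `pandas.read_sql_query`, and close the connection immediately. A sketch of that pattern with a placeholder query, since `PSP_REGSITE_SQL` itself is defined outside this excerpt:

```python
import sqlite3 as sql

import pandas

uniprot_sqlite = "mqppep.sqlite"                        # stand-in path
PSP_REGSITE_SQL = "SELECT * FROM PSP_Regulatory_site"   # placeholder query

conn = sql.connect(uniprot_sqlite)
# read the whole result set into a DataFrame in one call
regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
conn.close()

print(regsites_df.shape)
```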
1054 # keep only the human entries in dataframe | 1050 # keep only the human entries in dataframe |
1055 if len(species) > 0: | 1051 if len(species) > 0: |
1056 print( | 1052 print( |
1057 'Limit PhosphoSitesPlus records to species "' + species + '"' | 1053 'Limit PhosphoSitesPlus records to species "' + species + '"' |
1125 fp_series[i] = fp_series[i][2:] | 1121 fp_series[i] = fp_series[i][2:] |
1126 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") | 1122 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") |
1127 fp_series[i] = fp_series[i].replace("; ; ; ", "; ") | 1123 fp_series[i] = fp_series[i].replace("; ; ; ", "; ") |
1128 fp_series[i] = fp_series[i].replace("; ; ", "; ") | 1124 fp_series[i] = fp_series[i].replace("; ; ", "; ") |
1129 | 1125 |
1130 # turn blanks into N_A to signify the info was searched for but cannot be found | 1126 # turn blanks into N_A to signify the info |
1127 # that was searched for but cannot be found | |
1131 if fp_series[i] == "": | 1128 if fp_series[i] == "": |
1132 fp_series[i] = N_A | 1129 fp_series[i] = N_A |
1133 | 1130 |
1134 i += 1 | 1131 i += 1 |
1135 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series | 1132 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series |
1201 {"Protein description": PHOSPHOPEPTIDE}, | 1198 {"Protein description": PHOSPHOPEPTIDE}, |
1202 axis="columns", | 1199 axis="columns", |
1203 inplace=True, | 1200 inplace=True, |
1204 ) | 1201 ) |
1205 | 1202 |
1206 # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') | |
1207 res2 = sorted( | 1203 res2 = sorted( |
1208 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() | 1204 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() |
1209 ) | 1205 ) |
1210 data_in = data_in.loc[res2] | 1206 data_in = data_in.loc[res2] |
1211 | 1207 |
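The change above drops the commented-out `sort_values` call in favour of sorting the phosphopeptide labels case-insensitively with `str.casefold` and then reindexing with `.loc`. The essential ordering step, shown on standalone data with invented labels:

```python
labels = ["pTYR", "pSer", "PThr", "pala"]
# casefold gives a case-insensitive (and locale-robust) sort key
res2 = sorted(labels, key=lambda s: s.casefold())
print(res2)  # -> ['pala', 'pSer', 'PThr', 'pTYR']
```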
1226 print( | 1222 print( |
1227 "%0.6f refactored columns for Upstream Map [1g]" | 1223 "%0.6f refactored columns for Upstream Map [1g]" |
1228 % (end_time - start_time,), | 1224 % (end_time - start_time,), |
1229 file=sys.stderr, | 1225 file=sys.stderr, |
1230 ) # timer | 1226 ) # timer |
1231 | |
1232 # #rename upstream columns in new list | |
1233 # new_cols = [] | |
1234 # for name in cols: | |
1235 # if "_NetworKIN" in name: | |
1236 # name = name.split("_")[0] | |
1237 # if " motif" in name: | |
1238 # name = name.split(" motif")[0] | |
1239 # if " sequence " in name: | |
1240 # name = name.split(" sequence")[0] | |
1241 # if "_Phosida" in name: | |
1242 # name = name.split("_")[0] | |
1243 # if "_PhosphoSite" in name: | |
1244 # name = name.split("_")[0] | |
1245 # new_cols.append(name) | |
1246 | 1227 |
1247 # rename upstream columns in new list | 1228 # rename upstream columns in new list |
1248 def col_rename(name): | 1229 def col_rename(name): |
1249 if "_NetworKIN" in name: | 1230 if "_NetworKIN" in name: |
1250 name = name.split("_")[0] | 1231 name = name.split("_")[0] |
1274 new_cols_cast = new_cols | 1255 new_cols_cast = new_cols |
1275 new_cols_cast[0] = "p_peptide" | 1256 new_cols_cast[0] = "p_peptide" |
1276 upstream_data_cast.columns = new_cols_cast | 1257 upstream_data_cast.columns = new_cols_cast |
1277 upstream_data_cast["p_peptide"] = upstream_data.index | 1258 upstream_data_cast["p_peptide"] = upstream_data.index |
1278 | 1259 |
1279 # --- -------------- begin read upstream_data_melt ------------------------------------ | 1260 # Get melted kinase mapping data from SQLite database |
1280 # ----------- Get melted kinase mapping data from SQLite database (start) ----------- | 1261 # --- begin read upstream_data_melt --------------------------------- |
1281 conn = sql.connect(uniprot_sqlite) | 1262 conn = sql.connect(uniprot_sqlite) |
1282 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) | 1263 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) |
1283 # Close SwissProt SQLite database | 1264 # Close SwissProt SQLite database |
1284 conn.close() | 1265 conn.close() |
1285 upstream_data_melt = upstream_data_melt_df.copy() | 1266 upstream_data_melt = upstream_data_melt_df.copy() |
1301 print( | 1282 print( |
1302 "%0.6f upstream_data_melt_df pre-dedup has %d rows" | 1283 "%0.6f upstream_data_melt_df pre-dedup has %d rows" |
1303 % (end_time - start_time, len(upstream_data_melt.axes[0])), | 1284 % (end_time - start_time, len(upstream_data_melt.axes[0])), |
1304 file=sys.stderr, | 1285 file=sys.stderr, |
1305 ) | 1286 ) |
1306 # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- | 1287 # ... end read upstream_data_melt --------------------------------- |
1307 # ... -------------- end read upstream_data_melt -------------------------------------- | |
1308 | 1288 |
1309 end_time = time.process_time() # timer | 1289 end_time = time.process_time() # timer |
1310 print( | 1290 print( |
1311 "%0.6f melted and minimized Upstream Map dataframe [1h_2]" | 1291 "%0.6f melted and minimized Upstream Map dataframe [1h_2]" |
1312 % (end_time - start_time,), | 1292 % (end_time - start_time,), |
1330 for el in upstream_delta_melt_LoL: | 1310 for el in upstream_delta_melt_LoL: |
1331 (p_peptide, characterization, X) = tuple(el) | 1311 (p_peptide, characterization, X) = tuple(el) |
1332 if p_peptide in melt_dict: | 1312 if p_peptide in melt_dict: |
1333 melt_dict[p_peptide].append(characterization) | 1313 melt_dict[p_peptide].append(characterization) |
1334 else: | 1314 else: |
1335 exit( | 1315 los = [ |
1336 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' | 1316 "Phosphopeptide %s" % p_peptide, |
1337 % (p_peptide) | 1317 "not found in ppep_mapping_db:", |
1338 ) | 1318 '"phopsphopeptides" and "ppep_mapping_db" must both', |
1319 "originate from the same run of mqppep_kinase_mapping" | |
1320 ] | |
1321 exit(" ".join(los)) | |
1339 | 1322 |
1340 end_time = time.process_time() # timer | 1323 end_time = time.process_time() # timer |
1341 print( | 1324 print( |
1342 "%0.6f appended peptide characterizations [1h_2b]" | 1325 "%0.6f appended peptide characterizations [1h_2b]" |
1343 % (end_time - start_time,), | 1326 % (end_time - start_time,), |
1395 FUNCTION_PHOSPHORESIDUE, | 1378 FUNCTION_PHOSPHORESIDUE, |
1396 PUTATIVE_UPSTREAM_DOMAINS, | 1379 PUTATIVE_UPSTREAM_DOMAINS, |
1397 ] | 1380 ] |
1398 ] | 1381 ] |
1399 | 1382 |
1400 # cols_output_prelim = output_df.columns.tolist() | |
1401 # | |
1402 # print("cols_output_prelim") | |
1403 # print(cols_output_prelim) | |
1404 # | |
1405 # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] | |
1406 # | |
1407 # print("cols_output with p-peptide") | |
1408 # print(cols_output) | |
1409 # | |
1410 # cols_output = [col for col in cols_output if not col == "p-peptide"] | |
1411 # | |
1412 # print("cols_output") | |
1413 # print(cols_output) | |
1414 # | |
1415 # output_df = output_df[cols_output] | |
1416 | |
1417 # join output_df back to quantitative columns in data_in df | 1383 # join output_df back to quantitative columns in data_in df |
1418 quant_cols = data_in.columns.tolist() | 1384 quant_cols = data_in.columns.tolist() |
1419 quant_cols = quant_cols[1:] | 1385 quant_cols = quant_cols[1:] |
1420 quant_data = data_in[quant_cols] | 1386 quant_data = data_in[quant_cols] |
1421 | 1387 |
1422 # ----------- Write merge/filter metadata to SQLite database (start) ----------- | 1388 # ---- Write merge/filter metadata to SQLite database (start) ---- |
1423 # Open SwissProt SQLite database | 1389 # Open SwissProt SQLite database |
1424 conn = sql.connect(output_sqlite) | 1390 conn = sql.connect(output_sqlite) |
1425 cur = conn.cursor() | 1391 cur = conn.cursor() |
1426 | 1392 |
1427 cur.executescript(MRGFLTR_DDL) | 1393 cur.executescript(MRGFLTR_DDL) |
1465 method="multi", | 1431 method="multi", |
1466 ) | 1432 ) |
1467 | 1433 |
1468 # Close SwissProt SQLite database | 1434 # Close SwissProt SQLite database |
1469 conn.close() | 1435 conn.close() |
1470 # ----------- Write merge/filter metadata to SQLite database (finish) ----------- | 1436 # ---- Write merge/filter metadata to SQLite database (finish) ---- |
1471 | 1437 |
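In the write path above, the merge/filter metadata goes into a fresh SQLite file: the DDL is applied with `cur.executescript(MRGFLTR_DDL)` and DataFrames are then bulk-inserted with `DataFrame.to_sql(..., method="multi")`. A hedged sketch of that pattern; the table name and frame contents here are illustrative, not the tool's:

```python
import sqlite3 as sql

import pandas

output_sqlite = "mrgfltr.sqlite"  # stand-in path

conn = sql.connect(output_sqlite)
df = pandas.DataFrame(
    {"ppep_id": [1, 2], "Sequence7": ["ABCpDEFG", "HIJpKLMN"]}
)
# method="multi" batches many rows into each INSERT statement
df.to_sql(
    "mrgfltr_metadata",           # illustrative table name
    con=conn,
    if_exists="append",
    index=False,
    method="multi",
)
conn.close()
```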
1472 output_df = output_df.merge( | 1438 output_df = output_df.merge( |
1473 quant_data, | 1439 quant_data, |
1474 how="right", | 1440 how="right", |
1475 left_on=PHOSPHOPEPTIDE, | 1441 left_on=PHOSPHOPEPTIDE, |
1478 output_cols = output_df.columns.tolist() | 1444 output_cols = output_df.columns.tolist() |
1479 output_cols = output_cols[:-1] | 1445 output_cols = output_cols[:-1] |
1480 output_df = output_df[output_cols] | 1446 output_df = output_df[output_cols] |
1481 | 1447 |
1482 # cosmetic changes to Upstream column | 1448 # cosmetic changes to Upstream column |
1449 # fill the NaN with "" for those Phosphopeptides that got a | |
1450 # "WARNING: Failed match for " in the upstream mapping | |
1483 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ | 1451 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ |
1484 PUTATIVE_UPSTREAM_DOMAINS | 1452 PUTATIVE_UPSTREAM_DOMAINS |
1485 ].fillna( | 1453 ].fillna( |
1486 "" | 1454 "" |
1487 ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping | 1455 ) |
1488 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) | 1456 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) |
1489 i = 0 | 1457 i = 0 |
1490 while i < len(us_series): | 1458 while i < len(us_series): |
1491 # turn blanks into N_A to signify the info was searched for but cannot be found | 1459 # turn blanks into N_A to signify the info |
1460 # that was searched for but cannot be found | |
1492 if us_series[i] == "": | 1461 if us_series[i] == "": |
1493 us_series[i] = N_A | 1462 us_series[i] = N_A |
1494 i += 1 | 1463 i += 1 |
1495 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series | 1464 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series |
1496 | 1465 |
1528 ) # timer | 1497 ) # timer |
1529 | 1498 |
1530 # Rev. 7/1/2016 | 1499 # Rev. 7/1/2016 |
1531 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's | 1500 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's |
1532 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS | 1501 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS |
1533 # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ | 1502 # Rev. 12/2/2021: Converted to Python from ipynb; use fast \ |
1534 # read from SwissProt SQLite database | 1503 # Aho-Corasick searching; \ |
1504 # read from SwissProt SQLite database | |
1535 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper | 1505 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper |
1536 | 1506 |
1537 # | 1507 # |
1538 # copied from Excel Output Script.ipynb END # | 1508 # copied from Excel Output Script.ipynb END # |
1539 # | 1509 # |