comparison mqppep_mrgfltr.py @ 0:8dfd5d2b5903 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author galaxyp
date Mon, 11 Jul 2022 19:22:54 +0000
parents
children b76c75521d91
comparison
equal deleted inserted replaced
-1:000000000000 0:8dfd5d2b5903
1 #!/usr/bin/env python
2
3 # Import the packages needed
4 import argparse
5 import operator # for operator.itemgetter
6 import os.path
7 import re
8 import shutil # for shutil.copyfile(src, dest)
9 import sqlite3 as sql
10 import sys # import the sys module for exc_info
11 import time
12 import traceback # for formatting stack-trace
13 from codecs import getreader as cx_getreader
14
15 import numpy as np
16 import pandas
17
18 # global constants
N_A = "N/A"  # sentinel used throughout this tool for "no value" / missing metadata
20
21
22 # ref: https://stackoverflow.com/a/8915613/15509512
23 # answers: "How to handle exceptions in a list comprehensions"
24 # usage:
25 # from math import log
26 # eggs = [1,3,0,3,2]
27 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
28 # producing:
29 # for <built-in function log>
30 # with args (0,)
31 # exception: math domain error
32 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``; on any exception, print diagnostics and exit.

    Unlike :func:`whine`, a failure here is fatal: after printing the
    exception and its stack trace, the process terminates with status -1.

    Parameters:
        func -- callable to invoke
        *args, **kwargs -- forwarded to ``func``
        handle -- unused; retained only for backward compatibility with
                  callers that pass it by keyword

    Returns:
        The value of ``func(*args, **kwargs)`` on success.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(e))
        # exc_info supplies the traceback triple for the diagnostic dump
        (ty, va, tb) = sys.exc_info()
        print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
        # Fatal exit; the `return None` that followed exit(-1) in the
        # original was unreachable and has been removed.
        exit(-1)
45
46
def whine(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``; warn and return None on failure.

    Non-fatal companion to :func:`catch`: exceptions are reported on
    stdout (message plus stack trace) but execution continues.

    Parameters:
        func -- callable to invoke
        *args, **kwargs -- forwarded to ``func``
        handle -- unused; retained for backward compatibility

    Returns:
        The value of ``func(*args, **kwargs)``, or None if it raised.
    """
    try:
        return func(*args, **kwargs)
    except Exception as err:
        exc_type, exc_value, exc_tb = sys.exc_info()
        print("Warning: For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(err))
        print(
            " stack trace: "
            + str(traceback.format_exception(exc_type, exc_value, exc_tb))
        )
        return None
58
59
def ppep_join(x, na=N_A):
    """Join the entries of ``x`` with " | ", dropping placeholder values.

    Parameters:
        x -- iterable of strings
        na -- placeholder value to exclude from the join; defaults to the
              module-level N_A sentinel (new optional parameter, backward
              compatible with existing single-argument callers)

    Returns:
        The " | "-joined string of non-placeholder entries, or ``na`` when
        nothing remains after filtering (matching the original behavior,
        which also returned N_A when the joined result was empty).
    """
    result = " | ".join(item for item in x if item != na)
    return result if result != "" else na
67
68
def melt_join(x):
    """Join ``x`` with " | " after case-insensitive de-duplication.

    Entries that differ only by case collapse to one slot keyed on the
    lowercased form; the last-seen spelling wins, and first-seen order of
    the lowercased keys is preserved (dict insertion order).
    """
    last_spelling = {}
    for item in x:
        last_spelling[item.lower()] = item
    return " | ".join(last_spelling.values())
73
74
75 def __main__():
76 # Parse Command Line
77 parser = argparse.ArgumentParser(
78 description="Phopsphoproteomic Enrichment Pipeline Merge and Filter."
79 )
80
81 # inputs:
82 # Phosphopeptide data for experimental results, including the intensities
83 # and the mapping to kinase domains, in tabular format.
84 parser.add_argument(
85 "--phosphopeptides",
86 "-p",
87 nargs=1,
88 required=True,
89 dest="phosphopeptides",
90 help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format",
91 )
92 # UniProtKB/SwissProt DB input, SQLite
93 parser.add_argument(
94 "--ppep_mapping_db",
95 "-d",
96 nargs=1,
97 required=True,
98 dest="ppep_mapping_db",
99 help="UniProtKB/SwissProt SQLite Database",
100 )
101 # species to limit records chosed from PhosPhositesPlus
102 parser.add_argument(
103 "--species",
104 "-x",
105 nargs=1,
106 required=False,
107 default=[],
108 dest="species",
109 help="limit PhosphoSitePlus records to indicated species (field may be empty)",
110 )
111
112 # outputs:
113 # tabular output
114 parser.add_argument(
115 "--mrgfltr_tab",
116 "-o",
117 nargs=1,
118 required=True,
119 dest="mrgfltr_tab",
120 help="Tabular output file for results",
121 )
122 # CSV output
123 parser.add_argument(
124 "--mrgfltr_csv",
125 "-c",
126 nargs=1,
127 required=True,
128 dest="mrgfltr_csv",
129 help="CSV output file for results",
130 )
131 # SQLite output
132 parser.add_argument(
133 "--mrgfltr_sqlite",
134 "-S",
135 nargs=1,
136 required=True,
137 dest="mrgfltr_sqlite",
138 help="SQLite output file for results",
139 )
140
141 # "Make it so!" (parse the arguments)
142 options = parser.parse_args()
143 print("options: " + str(options))
144
145 # determine phosphopeptide ("upstream map") input tabular file access
146 if options.phosphopeptides is None:
147 exit('Argument "phosphopeptides" is required but not supplied')
148 try:
149 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0])
150 input_file = open(upstream_map_filename_tab, "r")
151 input_file.close()
152 except Exception as e:
153 exit("Error parsing phosphopeptides argument: %s" % str(e))
154
155 # determine input SQLite access
156 if options.ppep_mapping_db is None:
157 exit('Argument "ppep_mapping_db" is required but not supplied')
158 try:
159 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0])
160 input_file = open(uniprot_sqlite, "rb")
161 input_file.close()
162 except Exception as e:
163 exit("Error parsing ppep_mapping_db argument: %s" % str(e))
164
165 # copy input SQLite dataset to output SQLite dataset
166 if options.mrgfltr_sqlite is None:
167 exit('Argument "mrgfltr_sqlite" is required but not supplied')
168 try:
169 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0])
170 shutil.copyfile(uniprot_sqlite, output_sqlite)
171 except Exception as e:
172 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e))
173
174 # determine species to limit records from PSP_Regulatory_Sites
175 if options.species is None:
176 exit(
177 'Argument "species" is required (and may be empty) but not supplied'
178 )
179 try:
180 if len(options.species) > 0:
181 species = options.species[0]
182 else:
183 species = ""
184 except Exception as e:
185 exit("Error parsing species argument: %s" % str(e))
186
187 # determine tabular output destination
188 if options.mrgfltr_tab is None:
189 exit('Argument "mrgfltr_tab" is required but not supplied')
190 try:
191 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0])
192 output_file = open(output_filename_tab, "w")
193 output_file.close()
194 except Exception as e:
195 exit("Error parsing mrgfltr_tab argument: %s" % str(e))
196
197 # determine CSV output destination
198 if options.mrgfltr_csv is None:
199 exit('Argument "mrgfltr_csv" is required but not supplied')
200 try:
201 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0])
202 output_file = open(output_filename_csv, "w")
203 output_file.close()
204 except Exception as e:
205 exit("Error parsing mrgfltr_csv argument: %s" % str(e))
206
207 def mqpep_getswissprot():
208
209 #
210 # copied from Excel Output Script.ipynb BEGIN #
211 #
212
213 # String Constants #################
214 DEPHOSPHOPEP = "DephosphoPep"
215 DESCRIPTION = "Description"
216 FUNCTION_PHOSPHORESIDUE = (
217 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)"
218 )
219 GENE_NAME = "Gene_Name" # Gene Name from UniProtKB
220 ON_FUNCTION = (
221 "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites
222 )
223 ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites
224 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites
225 ON_PROCESS = (
226 "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites
227 )
228 ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites
229 PHOSPHOPEPTIDE = "Phosphopeptide"
230 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match"
231 PHOSPHORESIDUE = "Phosphoresidue"
232 PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains"
233 SEQUENCE = "Sequence"
234 SEQUENCE10 = "Sequence10"
235 SEQUENCE7 = "Sequence7"
236 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA"
237 UNIPROT_ID = "UniProt_ID"
238 UNIPROT_SEQ_AND_META_SQL = """
239 select Uniprot_ID, Description, Gene_Name, Sequence,
240 Organism_Name, Organism_ID, PE, SV
241 from UniProtKB
242 order by Sequence, UniProt_ID
243 """
244 UNIPROT_UNIQUE_SEQ_SQL = """
245 select distinct Sequence
246 from UniProtKB
247 group by Sequence
248 """
249 PPEP_PEP_UNIPROTSEQ_SQL = """
250 select distinct phosphopeptide, peptide, sequence
251 from uniprotkb_pep_ppep_view
252 order by sequence
253 """
254 PPEP_MELT_SQL = """
255 SELECT DISTINCT
256 phospho_peptide AS 'p_peptide',
257 kinase_map AS 'characterization',
258 'X' AS 'X'
259 FROM ppep_gene_site_view
260 """
261 # CREATE TABLE PSP_Regulatory_site (
262 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
263 # domain TEXT,
264 # ON_FUNCTION TEXT,
265 # ON_PROCESS TEXT,
266 # ON_PROT_INTERACT TEXT,
267 # ON_OTHER_INTERACT TEXT,
268 # notes TEXT,
269 # organism TEXT
270 # );
271 PSP_REGSITE_SQL = """
272 SELECT DISTINCT
273 SITE_PLUSMINUS_7AA ,
274 DOMAIN ,
275 ON_FUNCTION ,
276 ON_PROCESS ,
277 ON_PROT_INTERACT ,
278 ON_OTHER_INTERACT ,
279 NOTES ,
280 ORGANISM
281 FROM PSP_Regulatory_site
282 """
283 PPEP_ID_SQL = """
284 SELECT
285 id AS 'ppep_id',
286 seq AS 'ppep_seq'
287 FROM ppep
288 """
289 MRGFLTR_DDL = """
290 DROP VIEW IF EXISTS mrgfltr_metadata_view;
291 DROP TABLE IF EXISTS mrgfltr_metadata;
292 CREATE TABLE mrgfltr_metadata
293 ( ppep_id INTEGER REFERENCES ppep(id)
294 , Sequence10 TEXT
295 , Sequence7 TEXT
296 , GeneName TEXT
297 , Phosphoresidue TEXT
298 , UniProtID TEXT
299 , Description TEXT
300 , FunctionPhosphoresidue TEXT
301 , PutativeUpstreamDomains TEXT
302 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
303 )
304 ;
305 CREATE VIEW mrgfltr_metadata_view AS
306 SELECT DISTINCT
307 ppep.seq AS phospho_peptide
308 , Sequence10
309 , Sequence7
310 , GeneName
311 , Phosphoresidue
312 , UniProtID
313 , Description
314 , FunctionPhosphoresidue
315 , PutativeUpstreamDomains
316 FROM
317 ppep, mrgfltr_metadata
318 WHERE
319 mrgfltr_metadata.ppep_id = ppep.id
320 ORDER BY
321 ppep.seq
322 ;
323 """
324
325 CITATION_INSERT_STMT = """
326 INSERT INTO Citation (
327 ObjectName,
328 CitationData
329 ) VALUES (?,?)
330 """
331 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."'
332 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122'
333
334 MRGFLTR_METADATA_COLUMNS = [
335 "ppep_id",
336 "Sequence10",
337 "Sequence7",
338 "GeneName",
339 "Phosphoresidue",
340 "UniProtID",
341 "Description",
342 "FunctionPhosphoresidue",
343 "PutativeUpstreamDomains",
344 ]
345
346 # String Constants (end) ############
347
348 class Error(Exception):
349 """Base class for exceptions in this module."""
350
351 pass
352
353 class PreconditionError(Error):
354 """Exception raised for errors in the input.
355
356 Attributes:
357 expression -- input expression in which the error occurred
358 message -- explanation of the error
359 """
360
361 def __init__(self, expression, message):
362 self.expression = expression
363 self.message = message
364
365 # start_time = time.clock() #timer
366 start_time = time.process_time() # timer
367
368 # get keys from upstream tabular file using readline()
369 # ref: https://stackoverflow.com/a/16713581/15509512
370 # answer to "Use codecs to read file with correct encoding"
371 file1_encoded = open(upstream_map_filename_tab, "rb")
372 file1 = cx_getreader("latin-1")(file1_encoded)
373
374 count = 0
375 upstream_map_p_peptide_list = []
376 re_tab = re.compile("^[^\t]*")
377 while True:
378 count += 1
379 # Get next line from file
380 line = file1.readline()
381 # if line is empty
382 # end of file is reached
383 if not line:
384 break
385 if count > 1:
386 m = re_tab.match(line)
387 upstream_map_p_peptide_list.append(m[0])
388 file1.close()
389 file1_encoded.close()
390
391 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed
392 re_phos = re.compile("p")
393
394 end_time = time.process_time() # timer
395 print(
396 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,),
397 file=sys.stderr,
398 )
399
400 # ----------- Get SwissProt data from SQLite database (start) -----------
401 # build UniProt sequence LUT and list of unique SwissProt sequences
402
403 # Open SwissProt SQLite database
404 conn = sql.connect(uniprot_sqlite)
405 cur = conn.cursor()
406
407 # Set up structures to hold SwissProt data
408
409 uniprot_Sequence_List = []
410 UniProtSeqLUT = {}
411
412 # Execute query for unique seqs without fetching the results yet
413 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL)
414
415 while 1:
416 batch = uniprot_unique_seq_cur.fetchmany(size=50)
417 if not batch:
418 # handle case where no records are returned
419 break
420 for row in batch:
421 Sequence = row[0]
422 UniProtSeqLUT[(Sequence, DESCRIPTION)] = []
423 UniProtSeqLUT[(Sequence, GENE_NAME)] = []
424 UniProtSeqLUT[(Sequence, UNIPROT_ID)] = []
425 UniProtSeqLUT[Sequence] = []
426
427 # Execute query for seqs and metadata without fetching the results yet
428 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL)
429
430 while 1:
431 batch = uniprot_seq_and_meta.fetchmany(size=50)
432 if not batch:
433 # handle case where no records are returned
434 break
435 for (
436 UniProt_ID,
437 Description,
438 Gene_Name,
439 Sequence,
440 OS,
441 OX,
442 PE,
443 SV,
444 ) in batch:
445 uniprot_Sequence_List.append(Sequence)
446 UniProtSeqLUT[Sequence] = Sequence
447 UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID)
448 UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name)
449 if OS != N_A:
450 Description += " OS=" + OS
451 if OX != -1:
452 Description += " OX=" + str(OX)
453 if Gene_Name != N_A:
454 Description += " GN=" + Gene_Name
455 if PE != N_A:
456 Description += " PE=" + PE
457 if SV != N_A:
458 Description += " SV=" + SV
459 UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description)
460
461 # Close SwissProt SQLite database; clean up local variables
462 conn.close()
463 Sequence = ""
464 UniProt_ID = ""
465 Description = ""
466 Gene_Name = ""
467
468 # ----------- Get SwissProt data from SQLite database (finish) -----------
469
470 end_time = time.process_time() # timer
471 print(
472 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,),
473 file=sys.stderr,
474 )
475
476 # ----------- Get SwissProt data from SQLite database (start) -----------
477 # Open SwissProt SQLite database
478 conn = sql.connect(uniprot_sqlite)
479 cur = conn.cursor()
480
481 # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide
482 DephosphoPep_UniProtSeq_LUT = {}
483
484 # Set up dictionary to accumulate results
485 PhosphoPep_UniProtSeq_LUT = {}
486
487 # Execute query for tuples without fetching the results yet
488 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL)
489
490 while 1:
491 batch = ppep_pep_uniprotseq_cur.fetchmany(size=50)
492 if not batch:
493 # handle case where no records are returned
494 break
495 for (phospho_pep, dephospho_pep, sequence) in batch:
496 # do interesting stuff here...
497 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
498 PhosphoPep_UniProtSeq_LUT[
499 (phospho_pep, DEPHOSPHOPEP)
500 ] = dephospho_pep
501 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
502 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set()
503 DephosphoPep_UniProtSeq_LUT[
504 (dephospho_pep, DESCRIPTION)
505 ] = []
506 DephosphoPep_UniProtSeq_LUT[
507 (dephospho_pep, GENE_NAME)
508 ] = []
509 DephosphoPep_UniProtSeq_LUT[
510 (dephospho_pep, UNIPROT_ID)
511 ] = []
512 DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = []
513 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep)
514
515 if (
516 sequence
517 not in DephosphoPep_UniProtSeq_LUT[
518 (dephospho_pep, SEQUENCE)
519 ]
520 ):
521 DephosphoPep_UniProtSeq_LUT[
522 (dephospho_pep, SEQUENCE)
523 ].append(sequence)
524 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]:
525 if phospho_pep != phospho_pep:
526 print(
527 "phospho_pep:'%s' phospho_pep:'%s'"
528 % (phospho_pep, phospho_pep)
529 )
530 if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
531 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep
532 PhosphoPep_UniProtSeq_LUT[
533 (phospho_pep, DEPHOSPHOPEP)
534 ] = dephospho_pep
535 r = list(
536 zip(
537 [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]],
538 [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]],
539 [
540 s
541 for s in UniProtSeqLUT[(sequence, DESCRIPTION)]
542 ],
543 )
544 )
545 # Sort by `UniProt_ID`
546 # ref: https://stackoverflow.com/a/4174955/15509512
547 r = sorted(r, key=operator.itemgetter(0))
548 # Get one tuple for each `phospho_pep`
549 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep]
550 for (upid, gn, desc) in r:
551 # Append pseudo-tuple per UniProt_ID but only when it is not present
552 if (
553 upid
554 not in DephosphoPep_UniProtSeq_LUT[
555 (dephospho_pep, UNIPROT_ID)
556 ]
557 ):
558 DephosphoPep_UniProtSeq_LUT[
559 (dephospho_pep, UNIPROT_ID)
560 ].append(upid)
561 DephosphoPep_UniProtSeq_LUT[
562 (dephospho_pep, DESCRIPTION)
563 ].append(desc)
564 DephosphoPep_UniProtSeq_LUT[
565 (dephospho_pep, GENE_NAME)
566 ].append(gn)
567
568 # Close SwissProt SQLite database; clean up local variables
569 conn.close()
570 # wipe local variables
571 phospho_pep = dephospho_pep = sequence = 0
572 upid = gn = desc = r = ""
573
574 # ----------- Get SwissProt data from SQLite database (finish) -----------
575
576 end_time = time.process_time() # timer
577 print(
578 "%0.6f finished reading and decoding '%s' [0.4]"
579 % (end_time - start_time, upstream_map_filename_tab),
580 file=sys.stderr,
581 )
582
583 print(
584 "{:>10} unique upstream phosphopeptides tested".format(
585 str(len(upstream_map_p_peptide_list))
586 )
587 )
588
589 # Read in Upstream tabular file
590 # We are discarding the intensity data; so read it as text
591 upstream_data = pandas.read_table(
592 upstream_map_filename_tab, dtype="str", index_col=0
593 )
594
595 end_time = time.process_time() # timer
596 print(
597 "%0.6f read Upstream Map from file [1g_1]"
598 % (end_time - start_time,),
599 file=sys.stderr,
600 ) # timer
601
602 upstream_data.index = upstream_map_p_peptide_list
603
604 end_time = time.process_time() # timer
605 print(
606 "%0.6f added index to Upstream Map [1g_2]"
607 % (end_time - start_time,),
608 file=sys.stderr,
609 ) # timer
610
611 # ########################################################################
612 # # trim upstream_data to include only the upstream map columns
613 # old_cols = upstream_data.columns.tolist()
614 # i = 0
615 # first_intensity = -1
616 # last_intensity = -1
617 # intensity_re = re.compile("Intensity.*")
618 # for col_name in old_cols:
619 # m = intensity_re.match(col_name)
620 # if m:
621 # last_intensity = i
622 # if first_intensity == -1:
623 # first_intensity = i
624 # i += 1
625 # # print('last intensity = %d' % last_intensity)
626 # col_PKCalpha = last_intensity + 2
627 #
628 # data_in_cols = [old_cols[0]] + old_cols[
629 # first_intensity: last_intensity + 1
630 # ]
631 #
632 # if upstream_data.empty:
633 # print("upstream_data is empty")
634 # exit(0)
635 #
636 # data_in = upstream_data.copy(deep=True)[data_in_cols]
637 ########################################################################
638 # trim upstream_data to include only the upstream map columns
639 old_cols = upstream_data.columns.tolist()
640 i = 0
641 first_intensity = -1
642 last_intensity = -1
643 intensity_re = re.compile("Intensity.*")
644 for col_name in old_cols:
645 m = intensity_re.match(col_name)
646 if m:
647 last_intensity = i
648 if first_intensity == -1:
649 first_intensity = i
650 i += 1
651 # print('last intensity = %d' % last_intensity)
652 col_PKCalpha = last_intensity + 2
653
654 data_in_cols = [old_cols[0]] + old_cols[
655 first_intensity - 1: last_intensity
656 ]
657 data_col_names = [old_cols[0]] + old_cols[
658 first_intensity: last_intensity + 1
659 ]
660
661 if upstream_data.empty:
662 print("upstream_data is empty")
663 exit(0)
664
665 data_in = upstream_data.copy(deep=True)[data_in_cols]
666 data_in.columns = data_col_names
667 print("data_in")
668 print(data_in)
669 ########################################################################
670
671 # Convert floating-point integers to int64 integers
672 # ref: https://stackoverflow.com/a/68497603/15509512
673 data_in[list(data_in.columns[1:])] = (
674 data_in[list(data_in.columns[1:])]
675 .astype("float64")
676 .apply(np.int64)
677 )
678
679 # create another phosphopeptide column that will be used to join later;
680 # MAY need to change depending on Phosphopeptide column position
681 # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]]
682 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index
683
684 end_time = time.process_time() # timer
685 print(
686 "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]"
687 % (end_time - start_time,),
688 file=sys.stderr,
689 ) # timer
690
691 # Produce a dictionary of metadata for a single phosphopeptide.
692 # This is a replacement of `UniProtInfo_subdict` in the original code.
        def pseq_to_subdict(phospho_pep):
            """Build the SwissProt/metadata dictionary for one phosphopeptide.

            Returns ``[phospho_pep, result]`` where ``result`` maps the
            column-name constants (SEQUENCE, UNIPROT_ID, DESCRIPTION,
            GENE_NAME, PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10) to display
            values derived from the enclosing-scope lookup tables
            PhosphoPep_UniProtSeq_LUT and DephosphoPep_UniProtSeq_LUT.

            Raises PreconditionError when phospho_pep (or its
            dephosphorylated form) is absent from those lookup tables.
            This replaces `UniProtInfo_subdict` from the original code.
            """
            # Strip "p" from phosphopeptide sequence
            dephospho_pep = re_phos.sub("", phospho_pep)

            # Determine number of phosphoresidues in phosphopeptide
            # (each "p" removed above accounts for one residue)
            numps = len(phospho_pep) - len(dephospho_pep)

            # Determine location(s) of phosphoresidue(s) in phosphopeptide
            # (used later for Phosphoresidue, Sequence7, and Sequence10).
            # Each found "p" is deleted before the next search so that the
            # recorded offsets are positions within the *dephosphorylated*
            # peptide.
            ploc = []  # list of p locations
            i = 0
            p = phospho_pep
            while i < numps:
                ploc.append(p.find("p"))
                p = p[: p.find("p")] + p[p.find("p") + 1:]
                i += 1

            # Establish nested dictionary
            result = {}
            result[SEQUENCE] = []
            result[UNIPROT_ID] = []
            result[DESCRIPTION] = []
            result[GENE_NAME] = []
            result[PHOSPHORESIDUE] = []
            result[SEQUENCE7] = []
            result[SEQUENCE10] = []

            # Add stripped sequence to dictionary
            result[SEQUENCE].append(dephospho_pep)

            # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT.
            # Caller may elect to:
            #   try:
            #       ...
            #   except PreconditionError as pe:
            #       print("'{expression}': {message}".format(
            #           expression=pe.expression,
            #           message=pe.message))
            if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    phospho_pep,
                    "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
                )
            if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
                )
            # Consistency check: the two LUTs must agree on the mapping
            if (
                dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
            ):
                my_err_msg = "dephosphorylated phosphopeptide does not match "
                my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
                my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
                raise PreconditionError(dephospho_pep, my_err_msg)

            # Copy the per-dephospho-peptide metadata lists into result
            result[SEQUENCE] = [dephospho_pep]
            result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[
                (dephospho_pep, UNIPROT_ID)
            ]
            result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[
                (dephospho_pep, DESCRIPTION)
            ]
            result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[
                (dephospho_pep, GENE_NAME)
            ]
            if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
                raise PreconditionError(
                    dephospho_pep,
                    "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
                )
            UniProtSeqList = DephosphoPep_UniProtSeq_LUT[
                (dephospho_pep, SEQUENCE)
            ]
            if len(UniProtSeqList) < 1:
                # Deliberately a warning, not an error: the loop below simply
                # runs zero times in this case.
                print(
                    "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
                    % dephospho_pep
                )
                # raise PreconditionError(
                #     "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)",
                #     'value has zero length'
                #     )
            # For each matching UniProt protein sequence, compute the
            # phosphoresidue labels and the +/-7 and +/-10 residue contexts.
            for UniProtSeq in UniProtSeqList:
                i = 0
                phosphoresidues = []
                seq7s_set = set()   # de-dup guard for seq7s
                seq7s = []
                seq10s_set = set()  # de-dup guard for seq10s
                seq10s = []
                while i < len(ploc):
                    start = UniProtSeq.find(dephospho_pep)
                    # handle case where no sequence was found for dep-pep
                    if start < 0:
                        i += 1
                        continue
                    psite = (
                        start + ploc[i]
                    )  # location of phosphoresidue on protein sequence

                    # add Phosphoresidue, e.g. "pS123" (1-based position)
                    phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1)
                    phosphoresidues.append(phosphosite)

                    # Add Sequence7: 15-residue window centered on the site,
                    # phosphoresidue lower-cased, "_"-padded at the ends
                    if psite < 7:  # phospho_pep at N terminus
                        seq7 = str(UniProtSeq)[: psite + 8]
                        if seq7[psite] == "S":  # if phosphosresidue is serine
                            pres = "s"
                        elif (
                            seq7[psite] == "T"
                        ):  # if phosphosresidue is threonine
                            pres = "t"
                        elif (
                            seq7[psite] == "Y"
                        ):  # if phosphoresidue is tyrosine
                            pres = "y"
                        else:  # if not pSTY
                            pres = "?"
                        seq7 = (
                            seq7[:psite] + pres + seq7[psite + 1: psite + 8]
                        )
                        while (
                            len(seq7) < 15
                        ):  # add appropriate number of "_" to the front
                            seq7 = "_" + seq7
                    elif (
                        len(UniProtSeq) - psite < 8
                    ):  # phospho_pep at C terminus
                        seq7 = str(UniProtSeq)[psite - 7:]
                        if seq7[7] == "S":
                            pres = "s"
                        elif seq7[7] == "T":
                            pres = "t"
                        elif seq7[7] == "Y":
                            pres = "y"
                        else:
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                        while (
                            len(seq7) < 15
                        ):  # add appropriate number of "_" to the back
                            seq7 = seq7 + "_"
                    else:
                        # interior site: full 15-residue window available
                        seq7 = str(UniProtSeq)[psite - 7: psite + 8]
                        pres = ""  # phosphoresidue
                        if seq7[7] == "S":  # if phosphosresidue is serine
                            pres = "s"
                        elif seq7[7] == "T":  # if phosphosresidue is threonine
                            pres = "t"
                        elif seq7[7] == "Y":  # if phosphoresidue is tyrosine
                            pres = "y"
                        else:  # if not pSTY
                            pres = "?"
                        seq7 = seq7[:7] + pres + seq7[8:]
                    if seq7 not in seq7s_set:
                        seq7s.append(seq7)
                        seq7s_set.add(seq7)

                    # add Sequence10: +/-10 residue window with a literal
                    # "p" inserted immediately before the phosphoresidue
                    if psite < 10:  # phospho_pep at N terminus
                        seq10 = (
                            str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11]
                        )
                    elif (
                        len(UniProtSeq) - psite < 11
                    ):  # phospho_pep at C terminus
                        seq10 = (
                            str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:]
                        )
                    else:
                        seq10 = str(UniProtSeq)[psite - 10: psite + 11]
                        seq10 = seq10[:10] + "p" + seq10[10:]
                    if seq10 not in seq10s_set:
                        seq10s.append(seq10)
                        seq10s_set.add(seq10)

                    i += 1

                result[PHOSPHORESIDUE].append(phosphoresidues)
                result[SEQUENCE7].append(seq7s)
                # result[SEQUENCE10] is a list of lists of strings
                result[SEQUENCE10].append(seq10s)

            # Zip the parallel metadata lists so they can be sorted together
            r = list(
                zip(
                    result[UNIPROT_ID],
                    result[GENE_NAME],
                    result[DESCRIPTION],
                    result[PHOSPHORESIDUE],
                )
            )
            # Sort by `UniProt_ID`
            #   ref: https://stackoverflow.com//4174955/15509512
            s = sorted(r, key=operator.itemgetter(0))

            # Rebuild the four lists in UniProt_ID-sorted order
            result[UNIPROT_ID] = []
            result[GENE_NAME] = []
            result[DESCRIPTION] = []
            result[PHOSPHORESIDUE] = []

            for r in s:
                result[UNIPROT_ID].append(r[0])
                result[GENE_NAME].append(r[1])
                result[DESCRIPTION].append(r[2])
                result[PHOSPHORESIDUE].append(r[3])

            # convert lists to strings in the dictionary
            # (PHOSPHORESIDUE and SEQUENCE7 are flattened separately below)
            for key, value in result.items():
                if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]:
                    result[key] = "; ".join(map(str, value))
                elif key in [SEQUENCE10]:
                    # result[SEQUENCE10] is a list of lists of strings;
                    # flatten, de-duplicating while preserving first-seen order
                    joined_value = ""
                    joined_set = set()
                    sep = ""
                    for valL in value:
                        # valL is a list of strings
                        for val in valL:
                            # val is a string
                            if val not in joined_set:
                                joined_set.add(val)
                                joined_value += sep + val
                                sep = "; "
                    # joined_value is a string
                    result[key] = joined_value

            # Flatten PHOSPHORESIDUE: residues within one isoform joined by
            # ",", isoforms separated by ";"
            newstring = "; ".join(
                [", ".join(prez) for prez in result[PHOSPHORESIDUE]]
            )
            # #separate the isoforms in PHOSPHORESIDUE column with ";"
            # oldstring = result[PHOSPHORESIDUE]
            # oldlist = list(oldstring)
            # newstring = ""
            # i = 0
            # for e in oldlist:
            #     if e == ";":
            #         if numps > 1:
            #             if i%numps:
            #                 newstring = newstring + ";"
            #             else:
            #                 newstring = newstring + ","
            #         else:
            #             newstring = newstring + ";"
            #         i +=1
            #     else:
            #         newstring = newstring + e
            result[PHOSPHORESIDUE] = newstring

            # separate sequence7's by |
            # NOTE(review): `e` iterates over whole seq7 strings (one per
            # site), so the `e == ";"` branch looks unreachable, and
            # `newstring.count(e)` de-duplicates by substring containment —
            # confirm this is the intended behavior before changing it.
            oldstring = result[SEQUENCE7]
            oldlist = oldstring
            newstring = ""
            for ol in oldlist:
                for e in ol:
                    if e == ";":
                        newstring = newstring + " |"
                    elif len(newstring) > 0 and 1 > newstring.count(e):
                        newstring = newstring + " | " + e
                    elif 1 > newstring.count(e):
                        newstring = newstring + e
            result[SEQUENCE7] = newstring

            return [phospho_pep, result]
959
960 # Construct list of [string, dictionary] lists
961 # where the dictionary provides the SwissProt metadata
962 # for a phosphopeptide
963 result_list = [
964 whine(pseq_to_subdict, psequence)
965 for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
966 ]
967
968 end_time = time.process_time() # timer
969 print(
970 "%0.6f added SwissProt annotations to phosphopeptides [B]"
971 % (end_time - start_time,),
972 file=sys.stderr,
973 ) # timer
974
975 # Construct dictionary from list of lists
976 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
977 UniProt_Info = {
978 result[0]: result[1]
979 for result in result_list
980 if result is not None
981 }
982
983 end_time = time.process_time() # timer
984 print(
985 "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
986 % (end_time - start_time,),
987 file=sys.stderr,
988 ) # timer
989
990 # cosmetic: add N_A to phosphopeptide rows with no hits
991 p_peptide_list = []
992 for key in UniProt_Info:
993 p_peptide_list.append(key)
994 for nestedKey in UniProt_Info[key]:
995 if UniProt_Info[key][nestedKey] == "":
996 UniProt_Info[key][nestedKey] = N_A
997
998 end_time = time.process_time() # timer
999 print(
1000 "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,),
1001 file=sys.stderr,
1002 ) # timer
1003
1004 # convert UniProt_Info dictionary to dataframe
1005 uniprot_df = pandas.DataFrame.transpose(
1006 pandas.DataFrame.from_dict(UniProt_Info)
1007 )
1008
1009 # reorder columns to match expected output file
1010 uniprot_df[
1011 PHOSPHOPEPTIDE
1012 ] = uniprot_df.index # make index a column too
1013
1014 cols = uniprot_df.columns.tolist()
1015 # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]]
1016 # uniprot_df = uniprot_df[cols]
1017 uniprot_df = uniprot_df[
1018 [
1019 PHOSPHOPEPTIDE,
1020 SEQUENCE10,
1021 SEQUENCE7,
1022 GENE_NAME,
1023 PHOSPHORESIDUE,
1024 UNIPROT_ID,
1025 DESCRIPTION,
1026 ]
1027 ]
1028
1029 end_time = time.process_time() # timer
1030 print(
1031 "%0.6f reordered columns to match expected output file [1]"
1032 % (end_time - start_time,),
1033 file=sys.stderr,
1034 ) # timer
1035
1036 # concat to split then groupby to collapse
1037 seq7_df = pandas.concat(
1038 [
1039 pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | "))
1040 for _, row in uniprot_df.iterrows()
1041 ]
1042 ).reset_index()
1043 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]
1044
1045 # --- -------------- begin read PSP_Regulatory_sites ---------------------------------
1046 # read in PhosphoSitePlus Regulatory Sites dataset
1047 # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
1048 conn = sql.connect(uniprot_sqlite)
1049 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
1050 # Close SwissProt SQLite database
1051 conn.close()
1052 # ... -------------- end read PSP_Regulatory_sites ------------------------------------
1053
1054 # keep only the human entries in dataframe
# NOTE(review): despite the comment, the filter is by the user-supplied
# `species` argument, not hard-coded to human; empty string disables it.
1055 if len(species) > 0:
1056 print(
1057 'Limit PhosphoSitesPlus records to species "' + species + '"'
1058 )
1059 regsites_df = regsites_df[regsites_df.ORGANISM == species]
1060
1061 # merge the seq7 df with the regsites df based off of the sequence7
# left join keeps every (seq7, ppep) pair, even with no PSP annotation
1062 merge_df = seq7_df.merge(
1063 regsites_df,
1064 left_on=SEQUENCE7,
1065 right_on=SITE_PLUSMINUS_7AA_SQL,
1066 how="left",
1067 )
1068
1069 # after merging df, select only the columns of interest;
1070 # note that PROTEIN is absent here
1071 merge_df = merge_df[
1072 [
1073 PHOSPHOPEPTIDE,
1074 SEQUENCE7,
1075 ON_FUNCTION,
1076 ON_PROCESS,
1077 ON_PROT_INTERACT,
1078 ON_OTHER_INTERACT,
1079 ON_NOTES,
1080 ]
1081 ]
1082 # combine column values of interest
1083 # into one FUNCTION_PHOSPHORESIDUE column"
# the five annotation columns are joined pairwise with "; "; NaNs become "",
# so fully-unannotated rows end up as "; ; ; ; " (cleaned up in the next section)
1084 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat(
1085 merge_df[ON_PROCESS], sep="; ", na_rep=""
1086 )
1087 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
1088 FUNCTION_PHOSPHORESIDUE
1089 ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="")
1090 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
1091 FUNCTION_PHOSPHORESIDUE
1092 ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="")
1093 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[
1094 FUNCTION_PHOSPHORESIDUE
1095 ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="")
1096
1097 # remove the columns that were combined
1098 merge_df = merge_df[
1099 [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]
1100 ]
1101
1102 end_time = time.process_time() # timer
1103 print(
1104 "%0.6f merge regsite metadata [1a]" % (end_time - start_time,),
1105 file=sys.stderr,
1106 ) # timer
1107
1108 # cosmetic changes to Function Phosphoresidue column
1109 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE])
1110
1111 end_time = time.process_time() # timer
1112 print(
1113 "%0.6f more cosmetic changes [1b]" % (end_time - start_time,),
1114 file=sys.stderr,
1115 ) # timer
1116
# Element-wise cleanup of separator debris left by the "; " joins above.
# NOTE(review): fp_series[i] = ... is label-based Series assignment; it is
# positional here only because the index is a default RangeIndex — confirm.
# A vectorized str.replace/regex pass would avoid the Python loop, but the
# exact in-place semantics are preserved as-is.
1117 i = 0
1118 while i < len(fp_series):
1119 # remove the extra ";" so that it looks more professional
# "; ; ; ; " is exactly five empty fields joined by "; " — i.e. no annotation at all
1120 if fp_series[i] == "; ; ; ; ": # remove ; from empty hits
1121 fp_series[i] = ""
1122 while fp_series[i].endswith("; "): # remove ; from the ends
1123 fp_series[i] = fp_series[i][:-2]
1124 while fp_series[i].startswith("; "): # remove ; from the beginning
1125 fp_series[i] = fp_series[i][2:]
# collapse interior runs of empty fields down to a single separator
1126 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ")
1127 fp_series[i] = fp_series[i].replace("; ; ; ", "; ")
1128 fp_series[i] = fp_series[i].replace("; ; ", "; ")
1129
1130 # turn blanks into N_A to signify the info was searched for but cannot be found
1131 if fp_series[i] == "":
1132 fp_series[i] = N_A
1133
1134 i += 1
1135 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series
1136
1137 end_time = time.process_time() # timer
1138 print(
1139 "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,),
1140 file=sys.stderr,
1141 ) # timer
1142
1143 # merge uniprot df with merge df
1144 uniprot_regsites_merged_df = uniprot_df.merge(
1145 merge_df,
1146 left_on=PHOSPHOPEPTIDE,
1147 right_on=PHOSPHOPEPTIDE,
1148 how="left",
1149 )
1150
1151 # collapse the merged df
# one row per phosphopeptide: multiple regsite annotations are folded by
# ppep_join (defined outside this view)
1152 uniprot_regsites_collapsed_df = pandas.DataFrame(
1153 uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[
1154 FUNCTION_PHOSPHORESIDUE
1155 ].apply(lambda x: ppep_join(x))
1156 )
1157 # .apply(lambda x: "%s" % ' | '.join(x)))
1158
1159 end_time = time.process_time() # timer
1160 print(
1161 "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,),
1162 file=sys.stderr,
1163 ) # timer
1164
1165 uniprot_regsites_collapsed_df[
1166 PHOSPHOPEPTIDE
1167 ] = (
1168 uniprot_regsites_collapsed_df.index
1169 ) # add df index as its own column
1170
1171 # rename columns
# column order after the assignment above is [FUNCTION_PHOSPHORESIDUE, index-copy],
# so "ppp" is the phosphopeptide key used for the merge below
1172 uniprot_regsites_collapsed_df.columns = [
1173 FUNCTION_PHOSPHORESIDUE,
1174 "ppp",
1175 ]
1176
1177 end_time = time.process_time() # timer
1178 print(
1179 "%0.6f selected columns to be merged to uniprot_df [1e]"
1180 % (end_time - start_time,),
1181 file=sys.stderr,
1182 ) # timer
1183
1184 # add columns based on Sequence7 matching site_+/-7_AA
1185 uniprot_regsite_df = pandas.merge(
1186 left=uniprot_df,
1187 right=uniprot_regsites_collapsed_df,
1188 how="left",
1189 left_on=PHOSPHOPEPTIDE,
1190 right_on="ppp",
1191 )
1192
1193 end_time = time.process_time() # timer
1194 print(
1195 "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]"
1196 % (end_time - start_time,),
1197 file=sys.stderr,
1198 ) # timer
1199
1200 data_in.rename(
1201 {"Protein description": PHOSPHOPEPTIDE},
1202 axis="columns",
1203 inplace=True,
1204 )
1205
1206 # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort')
# case-insensitive sort of the phosphopeptide values ...
1207 res2 = sorted(
1208 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
1209 )
# ... then reindex rows BY LABEL with those values.
# NOTE(review): this only works if data_in's index IS the phosphopeptide
# sequence (set outside this view) — otherwise .loc would raise; confirm.
1210 data_in = data_in.loc[res2]
1211
1212 end_time = time.process_time() # timer
1213 print(
1214 "%0.6f sorting time [1f]" % (end_time - start_time,),
1215 file=sys.stderr,
1216 ) # timer
1217
# keep first upstream column plus everything from the PKC-alpha column on
# (old_cols / col_PKCalpha / upstream_data are defined above this view)
1218 print("old_cols[:col_PKCalpha]")
1219 print(old_cols[:col_PKCalpha])
1220 cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
1221 upstream_data = upstream_data[cols]
1222 print("upstream_data.columns")
1223 print(upstream_data.columns)
1224
1225 end_time = time.process_time() # timer
1226 print(
1227 "%0.6f refactored columns for Upstream Map [1g]"
1228 % (end_time - start_time,),
1229 file=sys.stderr,
1230 ) # timer
1231
1232 # #rename upstream columns in new list
1233 # new_cols = []
1234 # for name in cols:
1235 # if "_NetworKIN" in name:
1236 # name = name.split("_")[0]
1237 # if " motif" in name:
1238 # name = name.split(" motif")[0]
1239 # if " sequence " in name:
1240 # name = name.split(" sequence")[0]
1241 # if "_Phosida" in name:
1242 # name = name.split("_")[0]
1243 # if "_PhosphoSite" in name:
1244 # name = name.split("_")[0]
1245 # new_cols.append(name)
1246
1247 # rename upstream columns in new list
def col_rename(name):
    """Strip data-source suffixes from an upstream-map column name.

    Each rule is applied in order to the (possibly already shortened)
    name: when the marker substring is present, the name is truncated
    at the first occurrence of the split token.  Names without any
    marker are returned unchanged.
    """
    # (marker substring that flags a suffix, token to truncate at)
    suffix_rules = (
        ("_NetworKIN", "_"),
        (" motif", " motif"),
        (" sequence ", " sequence"),
        ("_Phosida", "_"),
        ("_PhosphoSite", "_"),
    )
    for marker, split_token in suffix_rules:
        if marker in name:
            name = name.split(split_token)[0]
    return name
1260
1261 new_cols = [col_rename(col) for col in cols]
1262 upstream_data.columns = new_cols
1263
1264 end_time = time.process_time() # timer
1265 print(
1266 "%0.6f renamed columns for Upstream Map [1h_1]"
1267 % (end_time - start_time,),
1268 file=sys.stderr,
1269 ) # timer
1270
1271 # Create upstream_data_cast as a copy of upstream_data
1272 # but with first column substituted by the phosphopeptide sequence
1273 upstream_data_cast = upstream_data.copy()
# NOTE(review): this ALIASES new_cols rather than copying it, so the next
# line also mutates new_cols[0]. Harmless here only because
# upstream_data.columns already copied the values at line 1262 — a
# list(new_cols) copy was probably intended; confirm.
1274 new_cols_cast = new_cols
1275 new_cols_cast[0] = "p_peptide"
1276 upstream_data_cast.columns = new_cols_cast
1277 upstream_data_cast["p_peptide"] = upstream_data.index
1278
1279 # --- -------------- begin read upstream_data_melt ------------------------------------
1280 # ----------- Get melted kinase mapping data from SQLite database (start) -----------
1281 conn = sql.connect(uniprot_sqlite)
1282 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn)
1283 # Close SwissProt SQLite database
1284 conn.close()
1285 upstream_data_melt = upstream_data_melt_df.copy()
1286 upstream_data_melt.columns = ["p_peptide", "characterization", "X"]
1287 upstream_data_melt["characterization"] = [
1288 col_rename(s) for s in upstream_data_melt["characterization"]
1289 ]
1290
# NOTE(review): end_time here still holds the value captured at line 1264,
# so the two row-count messages below report a stale elapsed time.
1291 print(
1292 "%0.6f upstream_data_melt_df initially has %d rows"
1293 % (end_time - start_time, len(upstream_data_melt.axes[0])),
1294 file=sys.stderr,
1295 )
1296 # ref: https://stackoverflow.com/a/27360130/15509512
1297 # e.g. df.drop(df[df.score < 50].index, inplace=True)
# keep only rows flagged with a literal "X" (kinase/peptide hit indicator)
1298 upstream_data_melt.drop(
1299 upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True
1300 )
1301 print(
1302 "%0.6f upstream_data_melt_df pre-dedup has %d rows"
1303 % (end_time - start_time, len(upstream_data_melt.axes[0])),
1304 file=sys.stderr,
1305 )
1306 # ----------- Get melted kinase mapping data from SQLite database (finish) -----------
1307 # ... -------------- end read upstream_data_melt --------------------------------------
1308
1309 end_time = time.process_time() # timer
1310 print(
1311 "%0.6f melted and minimized Upstream Map dataframe [1h_2]"
1312 % (end_time - start_time,),
1313 file=sys.stderr,
1314 ) # timer
1315 # ... end read upstream_data_melt
1316
1317 end_time = time.process_time() # timer
1318 print(
1319 "%0.6f indexed melted Upstream Map [1h_2a]"
1320 % (end_time - start_time,),
1321 file=sys.stderr,
1322 ) # timer
1323
1324 upstream_delta_melt_LoL = upstream_data_melt.values.tolist()
1325
# group characterizations by phosphopeptide; keys are pre-seeded from
# upstream_map_p_peptide_list (defined outside this view) so that an
# unknown peptide coming back from the database is detected below
1326 melt_dict = {}
1327 for key in upstream_map_p_peptide_list:
1328 melt_dict[key] = []
1329
1330 for el in upstream_delta_melt_LoL:
1331 (p_peptide, characterization, X) = tuple(el)
1332 if p_peptide in melt_dict:
1333 melt_dict[p_peptide].append(characterization)
1334 else:
# hard abort: inputs came from different mapping runs
1335 exit(
1336 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping'
1337 % (p_peptide)
1338 )
1339
1340 end_time = time.process_time() # timer
1341 print(
1342 "%0.6f appended peptide characterizations [1h_2b]"
1343 % (end_time - start_time,),
1344 file=sys.stderr,
1345 ) # timer
1346
1347 # for key in upstream_map_p_peptide_list:
1348 # melt_dict[key] = ' | '.join(melt_dict[key])
1349
# melt_join (defined outside this view) collapses each list to one string
1350 for key in upstream_map_p_peptide_list:
1351 melt_dict[key] = melt_join(melt_dict[key])
1352
1353 end_time = time.process_time() # timer
1354 print(
1355 "%0.6f concatenated multiple characterizations [1h_2c]"
1356 % (end_time - start_time,),
1357 file=sys.stderr,
1358 ) # timer
1359
1360 # map_dict is a dictionary of dictionaries
# {phosphopeptide: {PUTATIVE_UPSTREAM_DOMAINS: joined-characterizations}}
1361 map_dict = {}
1362 for key in upstream_map_p_peptide_list:
1363 map_dict[key] = {}
1364 map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key]
1365
1366 end_time = time.process_time() # timer
1367 print(
1368 "%0.6f instantiated map dictionary [2]" % (end_time - start_time,),
1369 file=sys.stderr,
1370 ) # timer
1371
1372 # convert map_dict to dataframe
1373 map_df = pandas.DataFrame.transpose(
1374 pandas.DataFrame.from_dict(map_dict)
1375 )
1376 map_df["p-peptide"] = map_df.index # make index a column too
# reorder to ["p-peptide", PUTATIVE_UPSTREAM_DOMAINS]
1377 cols_map_df = map_df.columns.tolist()
1378 cols_map_df = [cols_map_df[1]] + [cols_map_df[0]]
1379 map_df = map_df[cols_map_df]
1380
1381 # join map_df to uniprot_regsite_df
1382 output_df = uniprot_regsite_df.merge(
1383 map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide"
1384 )
1385
# final metadata column set (also drops the "p-peptide" and "ppp" join keys)
1386 output_df = output_df[
1387 [
1388 PHOSPHOPEPTIDE,
1389 SEQUENCE10,
1390 SEQUENCE7,
1391 GENE_NAME,
1392 PHOSPHORESIDUE,
1393 UNIPROT_ID,
1394 DESCRIPTION,
1395 FUNCTION_PHOSPHORESIDUE,
1396 PUTATIVE_UPSTREAM_DOMAINS,
1397 ]
1398 ]
1399
1400 # cols_output_prelim = output_df.columns.tolist()
1401 #
1402 # print("cols_output_prelim")
1403 # print(cols_output_prelim)
1404 #
1405 # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]]
1406 #
1407 # print("cols_output with p-peptide")
1408 # print(cols_output)
1409 #
1410 # cols_output = [col for col in cols_output if not col == "p-peptide"]
1411 #
1412 # print("cols_output")
1413 # print(cols_output)
1414 #
1415 # output_df = output_df[cols_output]
1416
1417 # join output_df back to quantitative columns in data_in df
# drop data_in's first column (the renamed phosphopeptide/description column),
# keeping only the quantitative columns plus the match key
1418 quant_cols = data_in.columns.tolist()
1419 quant_cols = quant_cols[1:]
1420 quant_data = data_in[quant_cols]
1421
1422 # ----------- Write merge/filter metadata to SQLite database (start) -----------
1423 # Open SwissProt SQLite database
1424 conn = sql.connect(output_sqlite)
1425 cur = conn.cursor()
1426
# create the mrgfltr tables/views (DDL constant defined above this view)
1427 cur.executescript(MRGFLTR_DDL)
1428
# record PhosphoSitePlus citations for both the table and the view
1429 cur.execute(
1430 CITATION_INSERT_STMT,
1431 ("mrgfltr_metadata_view", CITATION_INSERT_PSP),
1432 )
1433 cur.execute(
1434 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP)
1435 )
1436 cur.execute(
1437 CITATION_INSERT_STMT,
1438 ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF),
1439 )
1440 cur.execute(
1441 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF)
1442 )
1443
1444 # Read ppep-to-sequence LUT
1445 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn)
1446 # write only metadata for merged/filtered records to SQLite
1447 mrgfltr_metadata_df = output_df.copy()
1448 # replace phosphopeptide seq with ppep.id
1449 mrgfltr_metadata_df = ppep_lut_df.merge(
1450 mrgfltr_metadata_df,
1451 left_on="ppep_seq",
1452 right_on=PHOSPHOPEPTIDE,
1453 how="inner",
1454 )
1455 mrgfltr_metadata_df.drop(
1456 columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True
1457 )
1458 # rename columns
1459 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS
# NOTE(review): there is no explicit conn.commit(); the citation INSERTs
# appear to rely on pandas.DataFrame.to_sql committing this shared
# connection — confirm that the citations actually persist.
1460 mrgfltr_metadata_df.to_sql(
1461 "mrgfltr_metadata",
1462 con=conn,
1463 if_exists="append",
1464 index=False,
1465 method="multi",
1466 )
1467
1468 # Close SwissProt SQLite database
1469 conn.close()
1470 # ----------- Write merge/filter metadata to SQLite database (finish) -----------
1471
# right join: every row of the quantitative data survives, with metadata
# attached where the phosphopeptide matched
1472 output_df = output_df.merge(
1473 quant_data,
1474 how="right",
1475 left_on=PHOSPHOPEPTIDE,
1476 right_on=PHOSPHOPEPTIDE_MATCH,
1477 )
1478 output_cols = output_df.columns.tolist()
# drop the last column — presumably the redundant PHOSPHOPEPTIDE_MATCH join
# key appended by the merge; confirm column order assumption.
1479 output_cols = output_cols[:-1]
1480 output_df = output_df[output_cols]
1481
1482 # cosmetic changes to Upstream column
1483 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[
1484 PUTATIVE_UPSTREAM_DOMAINS
1485 ].fillna(
1486 ""
1487 ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping
1488 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS])
# element-wise "" -> N_A; relies on a default RangeIndex for positional access
1489 i = 0
1490 while i < len(us_series):
1491 # turn blanks into N_A to signify the info was searched for but cannot be found
1492 if us_series[i] == "":
1493 us_series[i] = N_A
1494 i += 1
1495 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series
1496
1497 end_time = time.process_time() # timer
# (typo "establisheed" is part of the emitted log string; left unchanged here)
1498 print(
1499 "%0.6f establisheed output [3]" % (end_time - start_time,),
1500 file=sys.stderr,
1501 ) # timer
1502
1503 (output_rows, output_cols) = output_df.shape
1504
1505 output_df = output_df.convert_dtypes(convert_integer=True)
1506
1507 # Output onto Final CSV file
# same frame is written twice: comma-separated and tab-separated
1508 output_df.to_csv(output_filename_csv, index=False)
1509 output_df.to_csv(
1510 output_filename_tab, quoting=None, sep="\t", index=False
1511 )
1512
1513 end_time = time.process_time() # timer
1514 print(
1515 "%0.6f wrote output [4]" % (end_time - start_time,),
1516 file=sys.stderr,
1517 ) # timer
1518
1519 print(
1520 "{:>10} phosphopeptides written to output".format(str(output_rows))
1521 )
1522
1523 end_time = time.process_time() # timer
1524 print(
1525 "%0.6f seconds of non-system CPU time were consumed"
1526 % (end_time - start_time,),
1527 file=sys.stderr,
1528 ) # timer
1529
1530 # Rev. 7/1/2016
1531 # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's
1532 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS
1533 # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \
1534 # read from SwissProt SQLite database
1535 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper
1536
1537 #
1538 # copied from Excel Output Script.ipynb END #
1539 #
1540
# Run the whole pipeline. catch() (defined at file top) already prints the
# stack trace and calls exit(-1) on failure, so this except clause is a
# second line of defense; exit(0) signals success.
1541 try:
1542 catch(
1543 mqpep_getswissprot,
1544 )
1545 exit(0)
1546 except Exception as e:
1547 exit("Internal error running mqpep_getswissprot(): %s" % (e))
1548
1549
# Standard script entry point: run __main__() only when executed directly.
1550 if __name__ == "__main__":
1551 __main__()