comparison mqppep_mrgfltr.py @ 0:dbff53e6f75f draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
| author | galaxyp |
|---|---|
| date | Mon, 11 Jul 2022 19:22:25 +0000 |
| parents | |
| children | 08678c931f5d |
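For orientation, here is a minimal invocation sketch assembled from the argparse options defined in the script below; the file names and the species value are placeholders rather than anything shipped with this commit:

```
python mqppep_mrgfltr.py \
    --phosphopeptides ppep_intensities.tabular \
    --ppep_mapping_db ppep_mapping_db.sqlite \
    --species human \
    --mrgfltr_tab mrgfltr.tabular \
    --mrgfltr_csv mrgfltr.csv \
    --mrgfltr_sqlite mrgfltr.sqlite
```

All options except `--species` are required; `--species` limits PhosphoSitePlus regulatory-site records to the named organism and may be omitted or left empty.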
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # Import the packages needed | |
| 4 import argparse | |
| 5 import operator # for operator.itemgetter | |
| 6 import os.path | |
| 7 import re | |
| 8 import shutil # for shutil.copyfile(src, dest) | |
| 9 import sqlite3 as sql | |
| 10 import sys # import the sys module for exc_info | |
| 11 import time | |
| 12 import traceback # for formatting stack-trace | |
| 13 from codecs import getreader as cx_getreader | |
| 14 | |
| 15 import numpy as np | |
| 16 import pandas | |
| 17 | |
| 18 # global constants | |
| 19 N_A = "N/A" | |
| 20 | |
| 21 | |
| 22 # ref: https://stackoverflow.com/a/8915613/15509512 | |
| 23 # answers: "How to handle exceptions in a list comprehensions" | |
| 24 # usage: | |
| 25 # from math import log | |
| 26 # eggs = [1,3,0,3,2] | |
| 27 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) | |
| 28 # producing (with a non-exiting handler such as `whine` below; this `catch` exits at the first exception): | |
| 29 # For <built-in function log> | |
| 30 # with args (0,) | |
| 31 # caught exception: math domain error | |
| 32 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] | |
| 33 def catch(func, *args, handle=lambda e: e, **kwargs): | |
| 34 | |
| 35 try: | |
| 36 return func(*args, **kwargs) | |
| 37 except Exception as e: | |
| 38 print("For %s" % str(func)) | |
| 39 print(" with args %s" % str(args)) | |
| 40 print(" caught exception: %s" % str(e)) | |
| 41 (ty, va, tb) = sys.exc_info() | |
| 42 print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) | |
| 43 exit(-1) | |
| 44 return None | |
| 45 | |
| 46 | |
| 47 def whine(func, *args, handle=lambda e: e, **kwargs): | |
| 48 | |
| 49 try: | |
| 50 return func(*args, **kwargs) | |
| 51 except Exception as e: | |
| 52 print("Warning: For %s" % str(func)) | |
| 53 print(" with args %s" % str(args)) | |
| 54 print(" caught exception: %s" % str(e)) | |
| 55 (ty, va, tb) = sys.exc_info() | |
| 56 print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) | |
| 57 return None | |
| 58 | |
| 59 | |
| 60 def ppep_join(x): | |
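| # drop N/A entries before joining; if every entry was N/A (or x is empty), return N/A | |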
| 61 x = [i for i in x if N_A != i] | |
| 62 result = "%s" % " | ".join(x) | |
| 63 if result != "": | |
| 64 return result | |
| 65 else: | |
| 66 return N_A | |
| 67 | |
| 68 | |
| 69 def melt_join(x): | |
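| # deduplicate case-insensitively (the last-seen capitalization wins), then join the surviving values | |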
| 70 tmp = {key.lower(): key for key in x} | |
| 71 result = "%s" % " | ".join([tmp[key] for key in tmp]) | |
| 72 return result | |
| 73 | |
| 74 | |
| 75 def __main__(): | |
| 76 # Parse Command Line | |
| 77 parser = argparse.ArgumentParser( | |
| 78 description="Phosphoproteomic Enrichment Pipeline Merge and Filter." | |
| 79 ) | |
| 80 | |
| 81 # inputs: | |
| 82 # Phosphopeptide data for experimental results, including the intensities | |
| 83 # and the mapping to kinase domains, in tabular format. | |
| 84 parser.add_argument( | |
| 85 "--phosphopeptides", | |
| 86 "-p", | |
| 87 nargs=1, | |
| 88 required=True, | |
| 89 dest="phosphopeptides", | |
| 90 help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", | |
| 91 ) | |
| 92 # UniProtKB/SwissProt DB input, SQLite | |
| 93 parser.add_argument( | |
| 94 "--ppep_mapping_db", | |
| 95 "-d", | |
| 96 nargs=1, | |
| 97 required=True, | |
| 98 dest="ppep_mapping_db", | |
| 99 help="UniProtKB/SwissProt SQLite Database", | |
| 100 ) | |
| 101 # species to limit records chosen from PhosphoSitePlus | |
| 102 parser.add_argument( | |
| 103 "--species", | |
| 104 "-x", | |
| 105 nargs=1, | |
| 106 required=False, | |
| 107 default=[], | |
| 108 dest="species", | |
| 109 help="limit PhosphoSitePlus records to indicated species (field may be empty)", | |
| 110 ) | |
| 111 | |
| 112 # outputs: | |
| 113 # tabular output | |
| 114 parser.add_argument( | |
| 115 "--mrgfltr_tab", | |
| 116 "-o", | |
| 117 nargs=1, | |
| 118 required=True, | |
| 119 dest="mrgfltr_tab", | |
| 120 help="Tabular output file for results", | |
| 121 ) | |
| 122 # CSV output | |
| 123 parser.add_argument( | |
| 124 "--mrgfltr_csv", | |
| 125 "-c", | |
| 126 nargs=1, | |
| 127 required=True, | |
| 128 dest="mrgfltr_csv", | |
| 129 help="CSV output file for results", | |
| 130 ) | |
| 131 # SQLite output | |
| 132 parser.add_argument( | |
| 133 "--mrgfltr_sqlite", | |
| 134 "-S", | |
| 135 nargs=1, | |
| 136 required=True, | |
| 137 dest="mrgfltr_sqlite", | |
| 138 help="SQLite output file for results", | |
| 139 ) | |
| 140 | |
| 141 # "Make it so!" (parse the arguments) | |
| 142 options = parser.parse_args() | |
| 143 print("options: " + str(options)) | |
| 144 | |
| 145 # determine phosphopeptide ("upstream map") input tabular file access | |
| 146 if options.phosphopeptides is None: | |
| 147 exit('Argument "phosphopeptides" is required but not supplied') | |
| 148 try: | |
| 149 upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) | |
| 150 input_file = open(upstream_map_filename_tab, "r") | |
| 151 input_file.close() | |
| 152 except Exception as e: | |
| 153 exit("Error parsing phosphopeptides argument: %s" % str(e)) | |
| 154 | |
| 155 # determine input SQLite access | |
| 156 if options.ppep_mapping_db is None: | |
| 157 exit('Argument "ppep_mapping_db" is required but not supplied') | |
| 158 try: | |
| 159 uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) | |
| 160 input_file = open(uniprot_sqlite, "rb") | |
| 161 input_file.close() | |
| 162 except Exception as e: | |
| 163 exit("Error parsing ppep_mapping_db argument: %s" % str(e)) | |
| 164 | |
| 165 # copy input SQLite dataset to output SQLite dataset | |
| 166 if options.mrgfltr_sqlite is None: | |
| 167 exit('Argument "mrgfltr_sqlite" is required but not supplied') | |
| 168 try: | |
| 169 output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) | |
| 170 shutil.copyfile(uniprot_sqlite, output_sqlite) | |
| 171 except Exception as e: | |
| 172 exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) | |
| 173 | |
| 174 # determine species to limit records from PSP_Regulatory_Sites | |
| 175 if options.species is None: | |
| 176 exit( | |
| 177 'Argument "species" is required (and may be empty) but not supplied' | |
| 178 ) | |
| 179 try: | |
| 180 if len(options.species) > 0: | |
| 181 species = options.species[0] | |
| 182 else: | |
| 183 species = "" | |
| 184 except Exception as e: | |
| 185 exit("Error parsing species argument: %s" % str(e)) | |
| 186 | |
| 187 # determine tabular output destination | |
| 188 if options.mrgfltr_tab is None: | |
| 189 exit('Argument "mrgfltr_tab" is required but not supplied') | |
| 190 try: | |
| 191 output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) | |
| 192 output_file = open(output_filename_tab, "w") | |
| 193 output_file.close() | |
| 194 except Exception as e: | |
| 195 exit("Error parsing mrgfltr_tab argument: %s" % str(e)) | |
| 196 | |
| 197 # determine CSV output destination | |
| 198 if options.mrgfltr_csv is None: | |
| 199 exit('Argument "mrgfltr_csv" is required but not supplied') | |
| 200 try: | |
| 201 output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) | |
| 202 output_file = open(output_filename_csv, "w") | |
| 203 output_file.close() | |
| 204 except Exception as e: | |
| 205 exit("Error parsing mrgfltr_csv argument: %s" % str(e)) | |
| 206 | |
| 207 def mqpep_getswissprot(): | |
| 208 | |
| 209 # | |
| 210 # copied from Excel Output Script.ipynb BEGIN # | |
| 211 # | |
| 212 | |
| 213 # String Constants ################# | |
| 214 DEPHOSPHOPEP = "DephosphoPep" | |
| 215 DESCRIPTION = "Description" | |
| 216 FUNCTION_PHOSPHORESIDUE = ( | |
| 217 "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" | |
| 218 ) | |
| 219 GENE_NAME = "Gene_Name" # Gene Name from UniProtKB | |
| 220 ON_FUNCTION = ( | |
| 221 "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites | |
| 222 ) | |
| 223 ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites | |
| 224 ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites | |
| 225 ON_PROCESS = ( | |
| 226 "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites | |
| 227 ) | |
| 228 ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites | |
| 229 PHOSPHOPEPTIDE = "Phosphopeptide" | |
| 230 PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" | |
| 231 PHOSPHORESIDUE = "Phosphoresidue" | |
| 232 PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" | |
| 233 SEQUENCE = "Sequence" | |
| 234 SEQUENCE10 = "Sequence10" | |
| 235 SEQUENCE7 = "Sequence7" | |
| 236 SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" | |
| 237 UNIPROT_ID = "UniProt_ID" | |
| 238 UNIPROT_SEQ_AND_META_SQL = """ | |
| 239 select Uniprot_ID, Description, Gene_Name, Sequence, | |
| 240 Organism_Name, Organism_ID, PE, SV | |
| 241 from UniProtKB | |
| 242 order by Sequence, UniProt_ID | |
| 243 """ | |
| 244 UNIPROT_UNIQUE_SEQ_SQL = """ | |
| 245 select distinct Sequence | |
| 246 from UniProtKB | |
| 247 group by Sequence | |
| 248 """ | |
| 249 PPEP_PEP_UNIPROTSEQ_SQL = """ | |
| 250 select distinct phosphopeptide, peptide, sequence | |
| 251 from uniprotkb_pep_ppep_view | |
| 252 order by sequence | |
| 253 """ | |
| 254 PPEP_MELT_SQL = """ | |
| 255 SELECT DISTINCT | |
| 256 phospho_peptide AS 'p_peptide', | |
| 257 kinase_map AS 'characterization', | |
| 258 'X' AS 'X' | |
| 259 FROM ppep_gene_site_view | |
| 260 """ | |
| 261 # CREATE TABLE PSP_Regulatory_site ( | |
| 262 # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, | |
| 263 # domain TEXT, | |
| 264 # ON_FUNCTION TEXT, | |
| 265 # ON_PROCESS TEXT, | |
| 266 # ON_PROT_INTERACT TEXT, | |
| 267 # ON_OTHER_INTERACT TEXT, | |
| 268 # notes TEXT, | |
| 269 # organism TEXT | |
| 270 # ); | |
| 271 PSP_REGSITE_SQL = """ | |
| 272 SELECT DISTINCT | |
| 273 SITE_PLUSMINUS_7AA , | |
| 274 DOMAIN , | |
| 275 ON_FUNCTION , | |
| 276 ON_PROCESS , | |
| 277 ON_PROT_INTERACT , | |
| 278 ON_OTHER_INTERACT , | |
| 279 NOTES , | |
| 280 ORGANISM | |
| 281 FROM PSP_Regulatory_site | |
| 282 """ | |
| 283 PPEP_ID_SQL = """ | |
| 284 SELECT | |
| 285 id AS 'ppep_id', | |
| 286 seq AS 'ppep_seq' | |
| 287 FROM ppep | |
| 288 """ | |
| 289 MRGFLTR_DDL = """ | |
| 290 DROP VIEW IF EXISTS mrgfltr_metadata_view; | |
| 291 DROP TABLE IF EXISTS mrgfltr_metadata; | |
| 292 CREATE TABLE mrgfltr_metadata | |
| 293 ( ppep_id INTEGER REFERENCES ppep(id) | |
| 294 , Sequence10 TEXT | |
| 295 , Sequence7 TEXT | |
| 296 , GeneName TEXT | |
| 297 , Phosphoresidue TEXT | |
| 298 , UniProtID TEXT | |
| 299 , Description TEXT | |
| 300 , FunctionPhosphoresidue TEXT | |
| 301 , PutativeUpstreamDomains TEXT | |
| 302 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE | |
| 303 ) | |
| 304 ; | |
| 305 CREATE VIEW mrgfltr_metadata_view AS | |
| 306 SELECT DISTINCT | |
| 307 ppep.seq AS phospho_peptide | |
| 308 , Sequence10 | |
| 309 , Sequence7 | |
| 310 , GeneName | |
| 311 , Phosphoresidue | |
| 312 , UniProtID | |
| 313 , Description | |
| 314 , FunctionPhosphoresidue | |
| 315 , PutativeUpstreamDomains | |
| 316 FROM | |
| 317 ppep, mrgfltr_metadata | |
| 318 WHERE | |
| 319 mrgfltr_metadata.ppep_id = ppep.id | |
| 320 ORDER BY | |
| 321 ppep.seq | |
| 322 ; | |
| 323 """ | |
| 324 | |
| 325 CITATION_INSERT_STMT = """ | |
| 326 INSERT INTO Citation ( | |
| 327 ObjectName, | |
| 328 CitationData | |
| 329 ) VALUES (?,?) | |
| 330 """ | |
| 331 CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' | |
| 332 CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' | |
| 333 | |
| 334 MRGFLTR_METADATA_COLUMNS = [ | |
| 335 "ppep_id", | |
| 336 "Sequence10", | |
| 337 "Sequence7", | |
| 338 "GeneName", | |
| 339 "Phosphoresidue", | |
| 340 "UniProtID", | |
| 341 "Description", | |
| 342 "FunctionPhosphoresidue", | |
| 343 "PutativeUpstreamDomains", | |
| 344 ] | |
| 345 | |
| 346 # String Constants (end) ############ | |
| 347 | |
| 348 class Error(Exception): | |
| 349 """Base class for exceptions in this module.""" | |
| 350 | |
| 351 pass | |
| 352 | |
| 353 class PreconditionError(Error): | |
| 354 """Exception raised for errors in the input. | |
| 355 | |
| 356 Attributes: | |
| 357 expression -- input expression in which the error occurred | |
| 358 message -- explanation of the error | |
| 359 """ | |
| 360 | |
| 361 def __init__(self, expression, message): | |
| 362 self.expression = expression | |
| 363 self.message = message | |
| 364 | |
| 365 # start_time = time.clock() #timer | |
| 366 start_time = time.process_time() # timer | |
| 367 | |
| 368 # get keys from upstream tabular file using readline() | |
| 369 # ref: https://stackoverflow.com/a/16713581/15509512 | |
| 370 # answer to "Use codecs to read file with correct encoding" | |
| 371 file1_encoded = open(upstream_map_filename_tab, "rb") | |
| 372 file1 = cx_getreader("latin-1")(file1_encoded) | |
| 373 | |
| 374 count = 0 | |
| 375 upstream_map_p_peptide_list = [] | |
| 376 re_tab = re.compile("^[^\t]*") | |
| 377 while True: | |
| 378 count += 1 | |
| 379 # Get next line from file | |
| 380 line = file1.readline() | |
| 381 # if line is empty | |
| 382 # end of file is reached | |
| 383 if not line: | |
| 384 break | |
| 385 if count > 1: | |
| 386 m = re_tab.match(line) | |
| 387 upstream_map_p_peptide_list.append(m[0]) | |
| 388 file1.close() | |
| 389 file1_encoded.close() | |
| 390 | |
| 391 # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed | |
| 392 re_phos = re.compile("p") | |
| 393 | |
| 394 end_time = time.process_time() # timer | |
| 395 print( | |
| 396 "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), | |
| 397 file=sys.stderr, | |
| 398 ) | |
| 399 | |
| 400 # ----------- Get SwissProt data from SQLite database (start) ----------- | |
| 401 # build UniProt sequence LUT and list of unique SwissProt sequences | |
| 402 | |
| 403 # Open SwissProt SQLite database | |
| 404 conn = sql.connect(uniprot_sqlite) | |
| 405 cur = conn.cursor() | |
| 406 | |
| 407 # Set up structures to hold SwissProt data | |
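| # UniProtSeqLUT is keyed both by Sequence and by (Sequence, column-name) tuples, each tuple key holding a list of metadata values | |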
| 408 | |
| 409 uniprot_Sequence_List = [] | |
| 410 UniProtSeqLUT = {} | |
| 411 | |
| 412 # Execute query for unique seqs without fetching the results yet | |
| 413 uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) | |
| 414 | |
| 415 while 1: | |
| 416 batch = uniprot_unique_seq_cur.fetchmany(size=50) | |
| 417 if not batch: | |
| 418 # handle case where no records are returned | |
| 419 break | |
| 420 for row in batch: | |
| 421 Sequence = row[0] | |
| 422 UniProtSeqLUT[(Sequence, DESCRIPTION)] = [] | |
| 423 UniProtSeqLUT[(Sequence, GENE_NAME)] = [] | |
| 424 UniProtSeqLUT[(Sequence, UNIPROT_ID)] = [] | |
| 425 UniProtSeqLUT[Sequence] = [] | |
| 426 | |
| 427 # Execute query for seqs and metadata without fetching the results yet | |
| 428 uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) | |
| 429 | |
| 430 while 1: | |
| 431 batch = uniprot_seq_and_meta.fetchmany(size=50) | |
| 432 if not batch: | |
| 433 # handle case where no records are returned | |
| 434 break | |
| 435 for ( | |
| 436 UniProt_ID, | |
| 437 Description, | |
| 438 Gene_Name, | |
| 439 Sequence, | |
| 440 OS, | |
| 441 OX, | |
| 442 PE, | |
| 443 SV, | |
| 444 ) in batch: | |
| 445 uniprot_Sequence_List.append(Sequence) | |
| 446 UniProtSeqLUT[Sequence] = Sequence | |
| 447 UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID) | |
| 448 UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name) | |
| 449 if OS != N_A: | |
| 450 Description += " OS=" + OS | |
| 451 if OX != -1: | |
| 452 Description += " OX=" + str(OX) | |
| 453 if Gene_Name != N_A: | |
| 454 Description += " GN=" + Gene_Name | |
| 455 if PE != N_A: | |
| 456 Description += " PE=" + PE | |
| 457 if SV != N_A: | |
| 458 Description += " SV=" + SV | |
| 459 UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description) | |
| 460 | |
| 461 # Close SwissProt SQLite database; clean up local variables | |
| 462 conn.close() | |
| 463 Sequence = "" | |
| 464 UniProt_ID = "" | |
| 465 Description = "" | |
| 466 Gene_Name = "" | |
| 467 | |
| 468 # ----------- Get SwissProt data from SQLite database (finish) ----------- | |
| 469 | |
| 470 end_time = time.process_time() # timer | |
| 471 print( | |
| 472 "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), | |
| 473 file=sys.stderr, | |
| 474 ) | |
| 475 | |
| 476 # ----------- Get SwissProt data from SQLite database (start) ----------- | |
| 477 # Open SwissProt SQLite database | |
| 478 conn = sql.connect(uniprot_sqlite) | |
| 479 cur = conn.cursor() | |
| 480 | |
| 481 # Set up dictionary to aggregate results for phosphopeptides corresponding to each dephosphopeptide | |
| 482 DephosphoPep_UniProtSeq_LUT = {} | |
| 483 | |
| 484 # Set up dictionary to accumulate results | |
| 485 PhosphoPep_UniProtSeq_LUT = {} | |
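| # Both LUTs are keyed by peptide and by (peptide, column-name) tuples; DephosphoPep_UniProtSeq_LUT[dephospho_pep] itself holds the set of matching phosphopeptides | |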
| 486 | |
| 487 # Execute query for tuples without fetching the results yet | |
| 488 ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) | |
| 489 | |
| 490 while 1: | |
| 491 batch = ppep_pep_uniprotseq_cur.fetchmany(size=50) | |
| 492 if not batch: | |
| 493 # handle case where no records are returned | |
| 494 break | |
| 495 for (phospho_pep, dephospho_pep, sequence) in batch: | |
| 496 # do interesting stuff here... | |
| 497 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep | |
| 498 PhosphoPep_UniProtSeq_LUT[ | |
| 499 (phospho_pep, DEPHOSPHOPEP) | |
| 500 ] = dephospho_pep | |
| 501 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: | |
| 502 DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() | |
| 503 DephosphoPep_UniProtSeq_LUT[ | |
| 504 (dephospho_pep, DESCRIPTION) | |
| 505 ] = [] | |
| 506 DephosphoPep_UniProtSeq_LUT[ | |
| 507 (dephospho_pep, GENE_NAME) | |
| 508 ] = [] | |
| 509 DephosphoPep_UniProtSeq_LUT[ | |
| 510 (dephospho_pep, UNIPROT_ID) | |
| 511 ] = [] | |
| 512 DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = [] | |
| 513 DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) | |
| 514 | |
| 515 if ( | |
| 516 sequence | |
| 517 not in DephosphoPep_UniProtSeq_LUT[ | |
| 518 (dephospho_pep, SEQUENCE) | |
| 519 ] | |
| 520 ): | |
| 521 DephosphoPep_UniProtSeq_LUT[ | |
| 522 (dephospho_pep, SEQUENCE) | |
| 523 ].append(sequence) | |
| 524 for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: | |
| 525 if phospho_pep != phospho_pep: | |
| 526 print( | |
| 527 "phospho_pep:'%s' phospho_pep:'%s'" | |
| 528 % (phospho_pep, phospho_pep) | |
| 529 ) | |
| 530 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: | |
| 531 PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep | |
| 532 PhosphoPep_UniProtSeq_LUT[ | |
| 533 (phospho_pep, DEPHOSPHOPEP) | |
| 534 ] = dephospho_pep | |
| 535 r = list( | |
| 536 zip( | |
| 537 [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]], | |
| 538 [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]], | |
| 539 [ | |
| 540 s | |
| 541 for s in UniProtSeqLUT[(sequence, DESCRIPTION)] | |
| 542 ], | |
| 543 ) | |
| 544 ) | |
| 545 # Sort by `UniProt_ID` | |
| 546 # ref: https://stackoverflow.com/a/4174955/15509512 | |
| 547 r = sorted(r, key=operator.itemgetter(0)) | |
| 548 # Get one tuple for each `phospho_pep` | |
| 549 # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] | |
| 550 for (upid, gn, desc) in r: | |
| 551 # Append pseudo-tuple per UniProt_ID but only when it is not present | |
| 552 if ( | |
| 553 upid | |
| 554 not in DephosphoPep_UniProtSeq_LUT[ | |
| 555 (dephospho_pep, UNIPROT_ID) | |
| 556 ] | |
| 557 ): | |
| 558 DephosphoPep_UniProtSeq_LUT[ | |
| 559 (dephospho_pep, UNIPROT_ID) | |
| 560 ].append(upid) | |
| 561 DephosphoPep_UniProtSeq_LUT[ | |
| 562 (dephospho_pep, DESCRIPTION) | |
| 563 ].append(desc) | |
| 564 DephosphoPep_UniProtSeq_LUT[ | |
| 565 (dephospho_pep, GENE_NAME) | |
| 566 ].append(gn) | |
| 567 | |
| 568 # Close SwissProt SQLite database; clean up local variables | |
| 569 conn.close() | |
| 570 # wipe local variables | |
| 571 phospho_pep = dephospho_pep = sequence = 0 | |
| 572 upid = gn = desc = r = "" | |
| 573 | |
| 574 # ----------- Get SwissProt data from SQLite database (finish) ----------- | |
| 575 | |
| 576 end_time = time.process_time() # timer | |
| 577 print( | |
| 578 "%0.6f finished reading and decoding '%s' [0.4]" | |
| 579 % (end_time - start_time, upstream_map_filename_tab), | |
| 580 file=sys.stderr, | |
| 581 ) | |
| 582 | |
| 583 print( | |
| 584 "{:>10} unique upstream phosphopeptides tested".format( | |
| 585 str(len(upstream_map_p_peptide_list)) | |
| 586 ) | |
| 587 ) | |
| 588 | |
| 589 # Read in Upstream tabular file | |
| 590 # We are discarding the intensity data; so read it as text | |
| 591 upstream_data = pandas.read_table( | |
| 592 upstream_map_filename_tab, dtype="str", index_col=0 | |
| 593 ) | |
| 594 | |
| 595 end_time = time.process_time() # timer | |
| 596 print( | |
| 597 "%0.6f read Upstream Map from file [1g_1]" | |
| 598 % (end_time - start_time,), | |
| 599 file=sys.stderr, | |
| 600 ) # timer | |
| 601 | |
| 602 upstream_data.index = upstream_map_p_peptide_list | |
| 603 | |
| 604 end_time = time.process_time() # timer | |
| 605 print( | |
| 606 "%0.6f added index to Upstream Map [1g_2]" | |
| 607 % (end_time - start_time,), | |
| 608 file=sys.stderr, | |
| 609 ) # timer | |
| 610 | |
| 611 # ######################################################################## | |
| 612 # # trim upstream_data to include only the upstream map columns | |
| 613 # old_cols = upstream_data.columns.tolist() | |
| 614 # i = 0 | |
| 615 # first_intensity = -1 | |
| 616 # last_intensity = -1 | |
| 617 # intensity_re = re.compile("Intensity.*") | |
| 618 # for col_name in old_cols: | |
| 619 # m = intensity_re.match(col_name) | |
| 620 # if m: | |
| 621 # last_intensity = i | |
| 622 # if first_intensity == -1: | |
| 623 # first_intensity = i | |
| 624 # i += 1 | |
| 625 # # print('last intensity = %d' % last_intensity) | |
| 626 # col_PKCalpha = last_intensity + 2 | |
| 627 # | |
| 628 # data_in_cols = [old_cols[0]] + old_cols[ | |
| 629 # first_intensity: last_intensity + 1 | |
| 630 # ] | |
| 631 # | |
| 632 # if upstream_data.empty: | |
| 633 # print("upstream_data is empty") | |
| 634 # exit(0) | |
| 635 # | |
| 636 # data_in = upstream_data.copy(deep=True)[data_in_cols] | |
| 637 ######################################################################## | |
| 638 # trim upstream_data to include only the upstream map columns | |
| 639 old_cols = upstream_data.columns.tolist() | |
| 640 i = 0 | |
| 641 first_intensity = -1 | |
| 642 last_intensity = -1 | |
| 643 intensity_re = re.compile("Intensity.*") | |
| 644 for col_name in old_cols: | |
| 645 m = intensity_re.match(col_name) | |
| 646 if m: | |
| 647 last_intensity = i | |
| 648 if first_intensity == -1: | |
| 649 first_intensity = i | |
| 650 i += 1 | |
| 651 # print('last intensity = %d' % last_intensity) | |
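| # col_PKCalpha is the 1-based position of the first column after the intensity block; old_cols[col_PKCalpha - 1:] is used below to select the upstream kinase-mapping columns | |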
| 652 col_PKCalpha = last_intensity + 2 | |
| 653 | |
| 654 data_in_cols = [old_cols[0]] + old_cols[ | |
| 655 first_intensity - 1: last_intensity | |
| 656 ] | |
| 657 data_col_names = [old_cols[0]] + old_cols[ | |
| 658 first_intensity: last_intensity + 1 | |
| 659 ] | |
| 660 | |
| 661 if upstream_data.empty: | |
| 662 print("upstream_data is empty") | |
| 663 exit(0) | |
| 664 | |
| 665 data_in = upstream_data.copy(deep=True)[data_in_cols] | |
| 666 data_in.columns = data_col_names | |
| 667 print("data_in") | |
| 668 print(data_in) | |
| 669 ######################################################################## | |
| 670 | |
| 671 # Convert floating-point integers to int64 integers | |
| 672 # ref: https://stackoverflow.com/a/68497603/15509512 | |
| 673 data_in[list(data_in.columns[1:])] = ( | |
| 674 data_in[list(data_in.columns[1:])] | |
| 675 .astype("float64") | |
| 676 .apply(np.int64) | |
| 677 ) | |
| 678 | |
| 679 # create another phosphopeptide column that will be used to join later; | |
| 680 # MAY need to change depending on Phosphopeptide column position | |
| 681 # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] | |
| 682 data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index | |
| 683 | |
| 684 end_time = time.process_time() # timer | |
| 685 print( | |
| 686 "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" | |
| 687 % (end_time - start_time,), | |
| 688 file=sys.stderr, | |
| 689 ) # timer | |
| 690 | |
| 691 # Produce a dictionary of metadata for a single phosphopeptide. | |
| 692 # This is a replacement of `UniProtInfo_subdict` in the original code. | |
| 693 def pseq_to_subdict(phospho_pep): | |
| 694 # Strip "p" from phosphopeptide sequence | |
| 695 dephospho_pep = re_phos.sub("", phospho_pep) | |
| 696 | |
| 697 # Determine number of phosphoresidues in phosphopeptide | |
| 698 numps = len(phospho_pep) - len(dephospho_pep) | |
| 699 | |
| 700 # Determine location(s) of phosphoresidue(s) in phosphopeptide | |
| 701 # (used later for Phosphoresidue, Sequence7, and Sequence10) | |
| 702 ploc = [] # list of p locations | |
| 703 i = 0 | |
| 704 p = phospho_pep | |
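| # each pass records the index of the next 'p' (the phosphosite position within the dephosphorylated sequence) and then removes that marker | |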
| 705 while i < numps: | |
| 706 ploc.append(p.find("p")) | |
| 707 p = p[: p.find("p")] + p[p.find("p") + 1:] | |
| 708 i += 1 | |
| 709 | |
| 710 # Establish nested dictionary | |
| 711 result = {} | |
| 712 result[SEQUENCE] = [] | |
| 713 result[UNIPROT_ID] = [] | |
| 714 result[DESCRIPTION] = [] | |
| 715 result[GENE_NAME] = [] | |
| 716 result[PHOSPHORESIDUE] = [] | |
| 717 result[SEQUENCE7] = [] | |
| 718 result[SEQUENCE10] = [] | |
| 719 | |
| 720 # Add stripped sequence to dictionary | |
| 721 result[SEQUENCE].append(dephospho_pep) | |
| 722 | |
| 723 # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT | |
| 724 # Caller may elect to: | |
| 725 # try: | |
| 726 # ... | |
| 727 # except PreconditionError as pe: | |
| 728 # print("'{expression}': {message}".format( | |
| 729 # expression = pe.expression, | |
| 730 # message = pe.message)) | |
| 731 # ) | |
| 732 # ) | |
| 733 if phospho_pep not in PhosphoPep_UniProtSeq_LUT: | |
| 734 raise PreconditionError( | |
| 735 phospho_pep, | |
| 736 "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT", | |
| 737 ) | |
| 738 if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: | |
| 739 raise PreconditionError( | |
| 740 dephospho_pep, | |
| 741 "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT", | |
| 742 ) | |
| 743 if ( | |
| 744 dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] | |
| 745 ): | |
| 746 my_err_msg = "dephosphorylated phosphopeptide does not match " | |
| 747 my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = " | |
| 748 my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)] | |
| 749 raise PreconditionError(dephospho_pep, my_err_msg) | |
| 750 | |
| 751 result[SEQUENCE] = [dephospho_pep] | |
| 752 result[UNIPROT_ID] = DephosphoPep_UniProtSeq_LUT[ | |
| 753 (dephospho_pep, UNIPROT_ID) | |
| 754 ] | |
| 755 result[DESCRIPTION] = DephosphoPep_UniProtSeq_LUT[ | |
| 756 (dephospho_pep, DESCRIPTION) | |
| 757 ] | |
| 758 result[GENE_NAME] = DephosphoPep_UniProtSeq_LUT[ | |
| 759 (dephospho_pep, GENE_NAME) | |
| 760 ] | |
| 761 if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT: | |
| 762 raise PreconditionError( | |
| 763 dephospho_pep, | |
| 764 "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT", | |
| 765 ) | |
| 766 UniProtSeqList = DephosphoPep_UniProtSeq_LUT[ | |
| 767 (dephospho_pep, SEQUENCE) | |
| 768 ] | |
| 769 if len(UniProtSeqList) < 1: | |
| 770 print( | |
| 771 "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length" | |
| 772 % dephospho_pep | |
| 773 ) | |
| 774 # raise PreconditionError( | |
| 775 # "DephosphoPep_UniProtSeq_LUT[('" + dephospho_pep + ",SEQUENCE)", | |
| 776 # 'value has zero length' | |
| 777 # ) | |
| 778 for UniProtSeq in UniProtSeqList: | |
| 779 i = 0 | |
| 780 phosphoresidues = [] | |
| 781 seq7s_set = set() | |
| 782 seq7s = [] | |
| 783 seq10s_set = set() | |
| 784 seq10s = [] | |
| 785 while i < len(ploc): | |
| 786 start = UniProtSeq.find(dephospho_pep) | |
| 787 # handle case where no sequence was found for dep-pep | |
| 788 if start < 0: | |
| 789 i += 1 | |
| 790 continue | |
| 791 psite = ( | |
| 792 start + ploc[i] | |
| 793 ) # location of phosphoresidue on protein sequence | |
| 794 | |
| 795 # add Phosphoresidue | |
| 796 phosphosite = "p" + str(UniProtSeq)[psite] + str(psite + 1) | |
| 797 phosphoresidues.append(phosphosite) | |
| 798 | |
| 799 # Add Sequence7 | |
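| # Sequence7 is a 15-residue window centered on the phosphosite (7 flanking residues per side), with the phosphoresidue lower-cased and '_' padding when the site lies near a terminus | |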
| 800 if psite < 7: # phospho_pep at N terminus | |
| 801 seq7 = str(UniProtSeq)[: psite + 8] | |
| 802 if seq7[psite] == "S": # if phosphoresidue is serine | |
| 803 pres = "s" | |
| 804 elif ( | |
| 805 seq7[psite] == "T" | |
| 806 ): # if phosphoresidue is threonine | |
| 807 pres = "t" | |
| 808 elif ( | |
| 809 seq7[psite] == "Y" | |
| 810 ): # if phosphoresidue is tyrosine | |
| 811 pres = "y" | |
| 812 else: # if not pSTY | |
| 813 pres = "?" | |
| 814 seq7 = ( | |
| 815 seq7[:psite] + pres + seq7[psite + 1: psite + 8] | |
| 816 ) | |
| 817 while ( | |
| 818 len(seq7) < 15 | |
| 819 ): # add appropriate number of "_" to the front | |
| 820 seq7 = "_" + seq7 | |
| 821 elif ( | |
| 822 len(UniProtSeq) - psite < 8 | |
| 823 ): # phospho_pep at C terminus | |
| 824 seq7 = str(UniProtSeq)[psite - 7:] | |
| 825 if seq7[7] == "S": | |
| 826 pres = "s" | |
| 827 elif seq7[7] == "T": | |
| 828 pres = "t" | |
| 829 elif seq7[7] == "Y": | |
| 830 pres = "y" | |
| 831 else: | |
| 832 pres = "?" | |
| 833 seq7 = seq7[:7] + pres + seq7[8:] | |
| 834 while ( | |
| 835 len(seq7) < 15 | |
| 836 ): # add appropriate number of "_" to the back | |
| 837 seq7 = seq7 + "_" | |
| 838 else: | |
| 839 seq7 = str(UniProtSeq)[psite - 7: psite + 8] | |
| 840 pres = "" # phosphoresidue | |
| 841 if seq7[7] == "S": # if phosphoresidue is serine | |
| 842 pres = "s" | |
| 843 elif seq7[7] == "T": # if phosphoresidue is threonine | |
| 844 pres = "t" | |
| 845 elif seq7[7] == "Y": # if phosphoresidue is tyrosine | |
| 846 pres = "y" | |
| 847 else: # if not pSTY | |
| 848 pres = "?" | |
| 849 seq7 = seq7[:7] + pres + seq7[8:] | |
| 850 if seq7 not in seq7s_set: | |
| 851 seq7s.append(seq7) | |
| 852 seq7s_set.add(seq7) | |
| 853 | |
| 854 # add Sequence10 | |
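| # Sequence10 is the window of 10 flanking residues per side with a literal 'p' inserted immediately before the phosphoresidue (no padding at the termini) | |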
| 855 if psite < 10: # phospho_pep at N terminus | |
| 856 seq10 = ( | |
| 857 str(UniProtSeq)[:psite] + "p" + str(UniProtSeq)[psite: psite + 11] | |
| 858 ) | |
| 859 elif ( | |
| 860 len(UniProtSeq) - psite < 11 | |
| 861 ): # phospho_pep at C terminus | |
| 862 seq10 = ( | |
| 863 str(UniProtSeq)[psite - 10: psite] + "p" + str(UniProtSeq)[psite:] | |
| 864 ) | |
| 865 else: | |
| 866 seq10 = str(UniProtSeq)[psite - 10: psite + 11] | |
| 867 seq10 = seq10[:10] + "p" + seq10[10:] | |
| 868 if seq10 not in seq10s_set: | |
| 869 seq10s.append(seq10) | |
| 870 seq10s_set.add(seq10) | |
| 871 | |
| 872 i += 1 | |
| 873 | |
| 874 result[PHOSPHORESIDUE].append(phosphoresidues) | |
| 875 result[SEQUENCE7].append(seq7s) | |
| 876 # result[SEQUENCE10] is a list of lists of strings | |
| 877 result[SEQUENCE10].append(seq10s) | |
| 878 | |
| 879 r = list( | |
| 880 zip( | |
| 881 result[UNIPROT_ID], | |
| 882 result[GENE_NAME], | |
| 883 result[DESCRIPTION], | |
| 884 result[PHOSPHORESIDUE], | |
| 885 ) | |
| 886 ) | |
| 887 # Sort by `UniProt_ID` | |
| 888 # ref: https://stackoverflow.com/a/4174955/15509512 | |
| 889 s = sorted(r, key=operator.itemgetter(0)) | |
| 890 | |
| 891 result[UNIPROT_ID] = [] | |
| 892 result[GENE_NAME] = [] | |
| 893 result[DESCRIPTION] = [] | |
| 894 result[PHOSPHORESIDUE] = [] | |
| 895 | |
| 896 for r in s: | |
| 897 result[UNIPROT_ID].append(r[0]) | |
| 898 result[GENE_NAME].append(r[1]) | |
| 899 result[DESCRIPTION].append(r[2]) | |
| 900 result[PHOSPHORESIDUE].append(r[3]) | |
| 901 | |
| 902 # convert lists to strings in the dictionary | |
| 903 for key, value in result.items(): | |
| 904 if key not in [PHOSPHORESIDUE, SEQUENCE7, SEQUENCE10]: | |
| 905 result[key] = "; ".join(map(str, value)) | |
| 906 elif key in [SEQUENCE10]: | |
| 907 # result[SEQUENCE10] is a list of lists of strings | |
| 908 joined_value = "" | |
| 909 joined_set = set() | |
| 910 sep = "" | |
| 911 for valL in value: | |
| 912 # valL is a list of strings | |
| 913 for val in valL: | |
| 914 # val is a string | |
| 915 if val not in joined_set: | |
| 916 joined_set.add(val) | |
| 917 joined_value += sep + val | |
| 918 sep = "; " | |
| 919 # joined_value is a string | |
| 920 result[key] = joined_value | |
| 921 | |
| 922 newstring = "; ".join( | |
| 923 [", ".join(prez) for prez in result[PHOSPHORESIDUE]] | |
| 924 ) | |
| 925 # #separate the isoforms in PHOSPHORESIDUE column with ";" | |
| 926 # oldstring = result[PHOSPHORESIDUE] | |
| 927 # oldlist = list(oldstring) | |
| 928 # newstring = "" | |
| 929 # i = 0 | |
| 930 # for e in oldlist: | |
| 931 # if e == ";": | |
| 932 # if numps > 1: | |
| 933 # if i%numps: | |
| 934 # newstring = newstring + ";" | |
| 935 # else: | |
| 936 # newstring = newstring + "," | |
| 937 # else: | |
| 938 # newstring = newstring + ";" | |
| 939 # i +=1 | |
| 940 # else: | |
| 941 # newstring = newstring + e | |
| 942 result[PHOSPHORESIDUE] = newstring | |
| 943 | |
| 944 # separate sequence7's by | | |
| 945 oldstring = result[SEQUENCE7] | |
| 946 oldlist = oldstring | |
| 947 newstring = "" | |
| 948 for ol in oldlist: | |
| 949 for e in ol: | |
| 950 if e == ";": | |
| 951 newstring = newstring + " |" | |
| 952 elif len(newstring) > 0 and 1 > newstring.count(e): | |
| 953 newstring = newstring + " | " + e | |
| 954 elif 1 > newstring.count(e): | |
| 955 newstring = newstring + e | |
| 956 result[SEQUENCE7] = newstring | |
| 957 | |
| 958 return [phospho_pep, result] | |
| 959 | |
| 960 # Construct list of [string, dictionary] lists | |
| 961 # where the dictionary provides the SwissProt metadata | |
| 962 # for a phosphopeptide | |
| 963 result_list = [ | |
| 964 whine(pseq_to_subdict, psequence) | |
| 965 for psequence in data_in[PHOSPHOPEPTIDE_MATCH] | |
| 966 ] | |
| 967 | |
| 968 end_time = time.process_time() # timer | |
| 969 print( | |
| 970 "%0.6f added SwissProt annotations to phosphopeptides [B]" | |
| 971 % (end_time - start_time,), | |
| 972 file=sys.stderr, | |
| 973 ) # timer | |
| 974 | |
| 975 # Construct dictionary from list of lists | |
| 976 # ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/ | |
| 977 UniProt_Info = { | |
| 978 result[0]: result[1] | |
| 979 for result in result_list | |
| 980 if result is not None | |
| 981 } | |
| 982 | |
| 983 end_time = time.process_time() # timer | |
| 984 print( | |
| 985 "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]" | |
| 986 % (end_time - start_time,), | |
| 987 file=sys.stderr, | |
| 988 ) # timer | |
| 989 | |
| 990 # cosmetic: add N_A to phosphopeptide rows with no hits | |
| 991 p_peptide_list = [] | |
| 992 for key in UniProt_Info: | |
| 993 p_peptide_list.append(key) | |
| 994 for nestedKey in UniProt_Info[key]: | |
| 995 if UniProt_Info[key][nestedKey] == "": | |
| 996 UniProt_Info[key][nestedKey] = N_A | |
| 997 | |
| 998 end_time = time.process_time() # timer | |
| 999 print( | |
| 1000 "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,), | |
| 1001 file=sys.stderr, | |
| 1002 ) # timer | |
| 1003 | |
| 1004 # convert UniProt_Info dictionary to dataframe | |
| 1005 uniprot_df = pandas.DataFrame.transpose( | |
| 1006 pandas.DataFrame.from_dict(UniProt_Info) | |
| 1007 ) | |
| 1008 | |
| 1009 # reorder columns to match expected output file | |
| 1010 uniprot_df[ | |
| 1011 PHOSPHOPEPTIDE | |
| 1012 ] = uniprot_df.index # make index a column too | |
| 1013 | |
| 1014 cols = uniprot_df.columns.tolist() | |
| 1015 # cols = [cols[-1]]+cols[4:6]+[cols[1]]+[cols[2]]+[cols[6]]+[cols[0]] | |
| 1016 # uniprot_df = uniprot_df[cols] | |
| 1017 uniprot_df = uniprot_df[ | |
| 1018 [ | |
| 1019 PHOSPHOPEPTIDE, | |
| 1020 SEQUENCE10, | |
| 1021 SEQUENCE7, | |
| 1022 GENE_NAME, | |
| 1023 PHOSPHORESIDUE, | |
| 1024 UNIPROT_ID, | |
| 1025 DESCRIPTION, | |
| 1026 ] | |
| 1027 ] | |
| 1028 | |
| 1029 end_time = time.process_time() # timer | |
| 1030 print( | |
| 1031 "%0.6f reordered columns to match expected output file [1]" | |
| 1032 % (end_time - start_time,), | |
| 1033 file=sys.stderr, | |
| 1034 ) # timer | |
| 1035 | |
| 1036 # concat to split then groupby to collapse | |
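| # expand each phosphopeptide into one row per Sequence7 value so that rows can be matched against the PSP SITE_+/-7_AA column | |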
| 1037 seq7_df = pandas.concat( | |
| 1038 [ | |
| 1039 pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | ")) | |
| 1040 for _, row in uniprot_df.iterrows() | |
| 1041 ] | |
| 1042 ).reset_index() | |
| 1043 seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE] | |
| 1044 | |
| 1045 # --- -------------- begin read PSP_Regulatory_sites --------------------------------- | |
| 1046 # read in PhosphoSitePlus Regulatory Sites dataset | |
| 1047 # ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) ----------- | |
| 1048 conn = sql.connect(uniprot_sqlite) | |
| 1049 regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn) | |
| 1050 # Close SwissProt SQLite database | |
| 1051 conn.close() | |
| 1052 # ... -------------- end read PSP_Regulatory_sites ------------------------------------ | |
| 1053 | |
| 1054 # keep only the entries for the selected species in the dataframe | |
| 1055 if len(species) > 0: | |
| 1056 print( | |
| 1057 'Limit PhosphoSitePlus records to species "' + species + '"' | |
| 1058 ) | |
| 1059 regsites_df = regsites_df[regsites_df.ORGANISM == species] | |
| 1060 | |
| 1061 # merge the seq7 df with the regsites df based off of the sequence7 | |
| 1062 merge_df = seq7_df.merge( | |
| 1063 regsites_df, | |
| 1064 left_on=SEQUENCE7, | |
| 1065 right_on=SITE_PLUSMINUS_7AA_SQL, | |
| 1066 how="left", | |
| 1067 ) | |
| 1068 | |
| 1069 # after merging df, select only the columns of interest; | |
| 1070 # note that PROTEIN is absent here | |
| 1071 merge_df = merge_df[ | |
| 1072 [ | |
| 1073 PHOSPHOPEPTIDE, | |
| 1074 SEQUENCE7, | |
| 1075 ON_FUNCTION, | |
| 1076 ON_PROCESS, | |
| 1077 ON_PROT_INTERACT, | |
| 1078 ON_OTHER_INTERACT, | |
| 1079 ON_NOTES, | |
| 1080 ] | |
| 1081 ] | |
| 1082 # combine column values of interest | |
| 1083 # into one FUNCTION_PHOSPHORESIDUE column | |
| 1084 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ON_FUNCTION].str.cat( | |
| 1085 merge_df[ON_PROCESS], sep="; ", na_rep="" | |
| 1086 ) | |
| 1087 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ | |
| 1088 FUNCTION_PHOSPHORESIDUE | |
| 1089 ].str.cat(merge_df[ON_PROT_INTERACT], sep="; ", na_rep="") | |
| 1090 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ | |
| 1091 FUNCTION_PHOSPHORESIDUE | |
| 1092 ].str.cat(merge_df[ON_OTHER_INTERACT], sep="; ", na_rep="") | |
| 1093 merge_df[FUNCTION_PHOSPHORESIDUE] = merge_df[ | |
| 1094 FUNCTION_PHOSPHORESIDUE | |
| 1095 ].str.cat(merge_df[ON_NOTES], sep="; ", na_rep="") | |
| 1096 | |
| 1097 # remove the columns that were combined | |
| 1098 merge_df = merge_df[ | |
| 1099 [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE] | |
| 1100 ] | |
| 1101 | |
| 1102 end_time = time.process_time() # timer | |
| 1103 print( | |
| 1104 "%0.6f merge regsite metadata [1a]" % (end_time - start_time,), | |
| 1105 file=sys.stderr, | |
| 1106 ) # timer | |
| 1107 | |
| 1108 # cosmetic changes to Function Phosphoresidue column | |
| 1109 fp_series = pandas.Series(merge_df[FUNCTION_PHOSPHORESIDUE]) | |
| 1110 | |
| 1111 end_time = time.process_time() # timer | |
| 1112 print( | |
| 1113 "%0.6f more cosmetic changes [1b]" % (end_time - start_time,), | |
| 1114 file=sys.stderr, | |
| 1115 ) # timer | |
| 1116 | |
| 1117 i = 0 | |
| 1118 while i < len(fp_series): | |
| 1119 # remove the extra ";" so that it looks more professional | |
| 1120 if fp_series[i] == "; ; ; ; ": # remove ; from empty hits | |
| 1121 fp_series[i] = "" | |
| 1122 while fp_series[i].endswith("; "): # remove ; from the ends | |
| 1123 fp_series[i] = fp_series[i][:-2] | |
| 1124 while fp_series[i].startswith("; "): # remove ; from the beginning | |
| 1125 fp_series[i] = fp_series[i][2:] | |
| 1126 fp_series[i] = fp_series[i].replace("; ; ; ; ", "; ") | |
| 1127 fp_series[i] = fp_series[i].replace("; ; ; ", "; ") | |
| 1128 fp_series[i] = fp_series[i].replace("; ; ", "; ") | |
| 1129 | |
| 1130 # turn blanks into N_A to signify the info was searched for but cannot be found | |
| 1131 if fp_series[i] == "": | |
| 1132 fp_series[i] = N_A | |
| 1133 | |
| 1134 i += 1 | |
| 1135 merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series | |
| 1136 | |
| 1137 end_time = time.process_time() # timer | |
| 1138 print( | |
| 1139 "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,), | |
| 1140 file=sys.stderr, | |
| 1141 ) # timer | |
| 1142 | |
| 1143 # merge uniprot df with merge df | |
| 1144 uniprot_regsites_merged_df = uniprot_df.merge( | |
| 1145 merge_df, | |
| 1146 left_on=PHOSPHOPEPTIDE, | |
| 1147 right_on=PHOSPHOPEPTIDE, | |
| 1148 how="left", | |
| 1149 ) | |
| 1150 | |
| 1151 # collapse the merged df | |
| 1152 uniprot_regsites_collapsed_df = pandas.DataFrame( | |
| 1153 uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[ | |
| 1154 FUNCTION_PHOSPHORESIDUE | |
| 1155 ].apply(lambda x: ppep_join(x)) | |
| 1156 ) | |
| 1157 # .apply(lambda x: "%s" % ' | '.join(x))) | |
| 1158 | |
| 1159 end_time = time.process_time() # timer | |
| 1160 print( | |
| 1161 "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,), | |
| 1162 file=sys.stderr, | |
| 1163 ) # timer | |
| 1164 | |
| 1165 uniprot_regsites_collapsed_df[ | |
| 1166 PHOSPHOPEPTIDE | |
| 1167 ] = ( | |
| 1168 uniprot_regsites_collapsed_df.index | |
| 1169 ) # add df index as its own column | |
| 1170 | |
| 1171 # rename columns | |
| 1172 uniprot_regsites_collapsed_df.columns = [ | |
| 1173 FUNCTION_PHOSPHORESIDUE, | |
| 1174 "ppp", | |
| 1175 ] | |
| 1176 | |
| 1177 end_time = time.process_time() # timer | |
| 1178 print( | |
| 1179 "%0.6f selected columns to be merged to uniprot_df [1e]" | |
| 1180 % (end_time - start_time,), | |
| 1181 file=sys.stderr, | |
| 1182 ) # timer | |
| 1183 | |
| 1184 # add columns based on Sequence7 matching site_+/-7_AA | |
| 1185 uniprot_regsite_df = pandas.merge( | |
| 1186 left=uniprot_df, | |
| 1187 right=uniprot_regsites_collapsed_df, | |
| 1188 how="left", | |
| 1189 left_on=PHOSPHOPEPTIDE, | |
| 1190 right_on="ppp", | |
| 1191 ) | |
| 1192 | |
| 1193 end_time = time.process_time() # timer | |
| 1194 print( | |
| 1195 "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]" | |
| 1196 % (end_time - start_time,), | |
| 1197 file=sys.stderr, | |
| 1198 ) # timer | |
| 1199 | |
| 1200 data_in.rename( | |
| 1201 {"Protein description": PHOSPHOPEPTIDE}, | |
| 1202 axis="columns", | |
| 1203 inplace=True, | |
| 1204 ) | |
| 1205 | |
| 1206 # data_in.sort_values(PHOSPHOPEPTIDE_MATCH, inplace=True, kind='mergesort') | |
| 1207 res2 = sorted( | |
| 1208 data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold() | |
| 1209 ) | |
| 1210 data_in = data_in.loc[res2] | |
| 1211 | |
| 1212 end_time = time.process_time() # timer | |
| 1213 print( | |
| 1214 "%0.6f sorting time [1f]" % (end_time - start_time,), | |
| 1215 file=sys.stderr, | |
| 1216 ) # timer | |
| 1217 | |
| 1218 print("old_cols[:col_PKCalpha]") | |
| 1219 print(old_cols[:col_PKCalpha]) | |
| 1220 cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:] | |
| 1221 upstream_data = upstream_data[cols] | |
| 1222 print("upstream_data.columns") | |
| 1223 print(upstream_data.columns) | |
| 1224 | |
| 1225 end_time = time.process_time() # timer | |
| 1226 print( | |
| 1227 "%0.6f refactored columns for Upstream Map [1g]" | |
| 1228 % (end_time - start_time,), | |
| 1229 file=sys.stderr, | |
| 1230 ) # timer | |
| 1231 | |
| 1232 # #rename upstream columns in new list | |
| 1233 # new_cols = [] | |
| 1234 # for name in cols: | |
| 1235 # if "_NetworKIN" in name: | |
| 1236 # name = name.split("_")[0] | |
| 1237 # if " motif" in name: | |
| 1238 # name = name.split(" motif")[0] | |
| 1239 # if " sequence " in name: | |
| 1240 # name = name.split(" sequence")[0] | |
| 1241 # if "_Phosida" in name: | |
| 1242 # name = name.split("_")[0] | |
| 1243 # if "_PhosphoSite" in name: | |
| 1244 # name = name.split("_")[0] | |
| 1245 # new_cols.append(name) | |
| 1246 | |
| 1247 # rename upstream columns in new list | |
| 1248 def col_rename(name): | |
| 1249 if "_NetworKIN" in name: | |
| 1250 name = name.split("_")[0] | |
| 1251 if " motif" in name: | |
| 1252 name = name.split(" motif")[0] | |
| 1253 if " sequence " in name: | |
| 1254 name = name.split(" sequence")[0] | |
| 1255 if "_Phosida" in name: | |
| 1256 name = name.split("_")[0] | |
| 1257 if "_PhosphoSite" in name: | |
| 1258 name = name.split("_")[0] | |
| 1259 return name | |
| 1260 | |
| 1261 new_cols = [col_rename(col) for col in cols] | |
| 1262 upstream_data.columns = new_cols | |
| 1263 | |
| 1264 end_time = time.process_time() # timer | |
| 1265 print( | |
| 1266 "%0.6f renamed columns for Upstream Map [1h_1]" | |
| 1267 % (end_time - start_time,), | |
| 1268 file=sys.stderr, | |
| 1269 ) # timer | |
| 1270 | |
| 1271 # Create upstream_data_cast as a copy of upstream_data | |
| 1272 # but with first column substituted by the phosphopeptide sequence | |
| 1273 upstream_data_cast = upstream_data.copy() | |
| 1274 new_cols_cast = new_cols | |
| 1275 new_cols_cast[0] = "p_peptide" | |
| 1276 upstream_data_cast.columns = new_cols_cast | |
| 1277 upstream_data_cast["p_peptide"] = upstream_data.index | |
| 1278 | |
| 1279 # --- -------------- begin read upstream_data_melt ------------------------------------ | |
| 1280 # ----------- Get melted kinase mapping data from SQLite database (start) ----------- | |
| 1281 conn = sql.connect(uniprot_sqlite) | |
| 1282 upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) | |
| 1283 # Close SwissProt SQLite database | |
| 1284 conn.close() | |
| 1285 upstream_data_melt = upstream_data_melt_df.copy() | |
| 1286 upstream_data_melt.columns = ["p_peptide", "characterization", "X"] | |
| 1287 upstream_data_melt["characterization"] = [ | |
| 1288 col_rename(s) for s in upstream_data_melt["characterization"] | |
| 1289 ] | |
| 1290 | |
| 1291 print( | |
| 1292 "%0.6f upstream_data_melt_df initially has %d rows" | |
| 1293 % (end_time - start_time, len(upstream_data_melt.axes[0])), | |
| 1294 file=sys.stderr, | |
| 1295 ) | |
| 1296 # ref: https://stackoverflow.com/a/27360130/15509512 | |
| 1297 # e.g. df.drop(df[df.score < 50].index, inplace=True) | |
| 1298 upstream_data_melt.drop( | |
| 1299 upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True | |
| 1300 ) | |
| 1301 print( | |
| 1302 "%0.6f upstream_data_melt_df pre-dedup has %d rows" | |
| 1303 % (end_time - start_time, len(upstream_data_melt.axes[0])), | |
| 1304 file=sys.stderr, | |
| 1305 ) | |
| 1306 # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- | |
| 1307 # ... -------------- end read upstream_data_melt -------------------------------------- | |
| 1308 | |
| 1309 end_time = time.process_time() # timer | |
| 1310 print( | |
| 1311 "%0.6f melted and minimized Upstream Map dataframe [1h_2]" | |
| 1312 % (end_time - start_time,), | |
| 1313 file=sys.stderr, | |
| 1314 ) # timer | |
| 1315 # ... end read upstream_data_melt | |
| 1316 | |
| 1317 end_time = time.process_time() # timer | |
| 1318 print( | |
| 1319 "%0.6f indexed melted Upstream Map [1h_2a]" | |
| 1320 % (end_time - start_time,), | |
| 1321 file=sys.stderr, | |
| 1322 ) # timer | |
| 1323 | |
| 1324 upstream_delta_melt_LoL = upstream_data_melt.values.tolist() | |
| 1325 | |
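| # melt_dict maps each phosphopeptide to the list of kinase/domain characterizations flagged 'X' for it in the melted mapping table | |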
| 1326 melt_dict = {} | |
| 1327 for key in upstream_map_p_peptide_list: | |
| 1328 melt_dict[key] = [] | |
| 1329 | |
| 1330 for el in upstream_delta_melt_LoL: | |
| 1331 (p_peptide, characterization, X) = tuple(el) | |
| 1332 if p_peptide in melt_dict: | |
| 1333 melt_dict[p_peptide].append(characterization) | |
| 1334 else: | |
| 1335 exit( | |
| 1336 'Phosphopeptide %s not found in ppep_mapping_db: "phosphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' | |
| 1337 % (p_peptide) | |
| 1338 ) | |
| 1339 | |
| 1340 end_time = time.process_time() # timer | |
| 1341 print( | |
| 1342 "%0.6f appended peptide characterizations [1h_2b]" | |
| 1343 % (end_time - start_time,), | |
| 1344 file=sys.stderr, | |
| 1345 ) # timer | |
| 1346 | |
| 1347 # for key in upstream_map_p_peptide_list: | |
| 1348 # melt_dict[key] = ' | '.join(melt_dict[key]) | |
| 1349 | |
| 1350 for key in upstream_map_p_peptide_list: | |
| 1351 melt_dict[key] = melt_join(melt_dict[key]) | |
| 1352 | |
| 1353 end_time = time.process_time() # timer | |
| 1354 print( | |
| 1355 "%0.6f concatenated multiple characterizations [1h_2c]" | |
| 1356 % (end_time - start_time,), | |
| 1357 file=sys.stderr, | |
| 1358 ) # timer | |
| 1359 | |
| 1360 # map_dict is a dictionary of dictionaries | |
| 1361 map_dict = {} | |
| 1362 for key in upstream_map_p_peptide_list: | |
| 1363 map_dict[key] = {} | |
| 1364 map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] | |
| 1365 | |
| 1366 end_time = time.process_time() # timer | |
| 1367 print( | |
| 1368 "%0.6f instantiated map dictionary [2]" % (end_time - start_time,), | |
| 1369 file=sys.stderr, | |
| 1370 ) # timer | |
| 1371 | |
| 1372 # convert map_dict to dataframe | |
| 1373 map_df = pandas.DataFrame.transpose( | |
| 1374 pandas.DataFrame.from_dict(map_dict) | |
| 1375 ) | |
| 1376 map_df["p-peptide"] = map_df.index # make index a column too | |
| 1377 cols_map_df = map_df.columns.tolist() | |
| 1378 cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] | |
| 1379 map_df = map_df[cols_map_df] | |
| 1380 | |
| 1381 # join map_df to uniprot_regsite_df | |
| 1382 output_df = uniprot_regsite_df.merge( | |
| 1383 map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide" | |
| 1384 ) | |
| 1385 | |
| 1386 output_df = output_df[ | |
| 1387 [ | |
| 1388 PHOSPHOPEPTIDE, | |
| 1389 SEQUENCE10, | |
| 1390 SEQUENCE7, | |
| 1391 GENE_NAME, | |
| 1392 PHOSPHORESIDUE, | |
| 1393 UNIPROT_ID, | |
| 1394 DESCRIPTION, | |
| 1395 FUNCTION_PHOSPHORESIDUE, | |
| 1396 PUTATIVE_UPSTREAM_DOMAINS, | |
| 1397 ] | |
| 1398 ] | |
| 1399 | |
| 1400 # cols_output_prelim = output_df.columns.tolist() | |
| 1401 # | |
| 1402 # print("cols_output_prelim") | |
| 1403 # print(cols_output_prelim) | |
| 1404 # | |
| 1405 # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] | |
| 1406 # | |
| 1407 # print("cols_output with p-peptide") | |
| 1408 # print(cols_output) | |
| 1409 # | |
| 1410 # cols_output = [col for col in cols_output if not col == "p-peptide"] | |
| 1411 # | |
| 1412 # print("cols_output") | |
| 1413 # print(cols_output) | |
| 1414 # | |
| 1415 # output_df = output_df[cols_output] | |
| 1416 | |
| 1417 # join output_df back to quantitative columns in data_in df | |
| 1418 quant_cols = data_in.columns.tolist() | |
| 1419 quant_cols = quant_cols[1:] | |
| 1420 quant_data = data_in[quant_cols] | |
| 1421 | |
| 1422 # ----------- Write merge/filter metadata to SQLite database (start) ----------- | |
| 1423 # Open SwissProt SQLite database | |
| 1424 conn = sql.connect(output_sqlite) | |
| 1425 cur = conn.cursor() | |
| 1426 | |
| 1427 cur.executescript(MRGFLTR_DDL) | |
| 1428 | |
| 1429 cur.execute( | |
| 1430 CITATION_INSERT_STMT, | |
| 1431 ("mrgfltr_metadata_view", CITATION_INSERT_PSP), | |
| 1432 ) | |
| 1433 cur.execute( | |
| 1434 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP) | |
| 1435 ) | |
| 1436 cur.execute( | |
| 1437 CITATION_INSERT_STMT, | |
| 1438 ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF), | |
| 1439 ) | |
| 1440 cur.execute( | |
| 1441 CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF) | |
| 1442 ) | |
| 1443 | |
| 1444 # Read ppep-to-sequence LUT | |
| 1445 ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) | |
| 1446 # write only metadata for merged/filtered records to SQLite | |
| 1447 mrgfltr_metadata_df = output_df.copy() | |
| 1448 # replace phosphopeptide seq with ppep.id | |
| 1449 mrgfltr_metadata_df = ppep_lut_df.merge( | |
| 1450 mrgfltr_metadata_df, | |
| 1451 left_on="ppep_seq", | |
| 1452 right_on=PHOSPHOPEPTIDE, | |
| 1453 how="inner", | |
| 1454 ) | |
| 1455 mrgfltr_metadata_df.drop( | |
| 1456 columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True | |
| 1457 ) | |
| 1458 # rename columns | |
| 1459 mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS | |
| 1460 mrgfltr_metadata_df.to_sql( | |
| 1461 "mrgfltr_metadata", | |
| 1462 con=conn, | |
| 1463 if_exists="append", | |
| 1464 index=False, | |
| 1465 method="multi", | |
| 1466 ) | |
| 1467 | |
| 1468 # Close SwissProt SQLite database | |
| 1469 conn.close() | |
| 1470 # ----------- Write merge/filter metadata to SQLite database (finish) ----------- | |
| 1471 | |
| 1472 output_df = output_df.merge( | |
| 1473 quant_data, | |
| 1474 how="right", | |
| 1475 left_on=PHOSPHOPEPTIDE, | |
| 1476 right_on=PHOSPHOPEPTIDE_MATCH, | |
| 1477 ) | |
| 1478 output_cols = output_df.columns.tolist() | |
| 1479 output_cols = output_cols[:-1] | |
| 1480 output_df = output_df[output_cols] | |
| 1481 | |
| 1482 # cosmetic changes to Upstream column | |
| 1483 output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ | |
| 1484 PUTATIVE_UPSTREAM_DOMAINS | |
| 1485 ].fillna( | |
| 1486 "" | |
| 1487 ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping | |
| 1488 us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) | |
| 1489 i = 0 | |
| 1490 while i < len(us_series): | |
| 1491 # turn blanks into N_A to signify the info was searched for but cannot be found | |
| 1492 if us_series[i] == "": | |
| 1493 us_series[i] = N_A | |
| 1494 i += 1 | |
| 1495 output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series | |
| 1496 | |
| 1497 end_time = time.process_time() # timer | |
| 1498 print( | |
| 1499 "%0.6f establisheed output [3]" % (end_time - start_time,), | |
| 1500 file=sys.stderr, | |
| 1501 ) # timer | |
| 1502 | |
| 1503 (output_rows, output_cols) = output_df.shape | |
| 1504 | |
| 1505 output_df = output_df.convert_dtypes(convert_integer=True) | |
| 1506 | |
| 1507 # Output onto Final CSV file | |
| 1508 output_df.to_csv(output_filename_csv, index=False) | |
| 1509 output_df.to_csv( | |
| 1510 output_filename_tab, quoting=None, sep="\t", index=False | |
| 1511 ) | |
| 1512 | |
| 1513 end_time = time.process_time() # timer | |
| 1514 print( | |
| 1515 "%0.6f wrote output [4]" % (end_time - start_time,), | |
| 1516 file=sys.stderr, | |
| 1517 ) # timer | |
| 1518 | |
| 1519 print( | |
| 1520 "{:>10} phosphopeptides written to output".format(str(output_rows)) | |
| 1521 ) | |
| 1522 | |
| 1523 end_time = time.process_time() # timer | |
| 1524 print( | |
| 1525 "%0.6f seconds of non-system CPU time were consumed" | |
| 1526 % (end_time - start_time,), | |
| 1527 file=sys.stderr, | |
| 1528 ) # timer | |
| 1529 | |
| 1530 # Rev. 7/1/2016 | |
| 1531 # Rev. 7/3/2016: fill NaN in Upstream column to replace with N/A's | |
| 1532 # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS | |
| 1533 # Rev. 12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ | |
| 1534 # read from SwissProt SQLite database | |
| 1535 # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper | |
| 1536 | |
| 1537 # | |
| 1538 # copied from Excel Output Script.ipynb END # | |
| 1539 # | |
| 1540 | |
| 1541 try: | |
| 1542 catch( | |
| 1543 mqpep_getswissprot, | |
| 1544 ) | |
| 1545 exit(0) | |
| 1546 except Exception as e: | |
| 1547 exit("Internal error running mqpep_getswissprot(): %s" % (e)) | |
| 1548 | |
| 1549 | |
| 1550 if __name__ == "__main__": | |
| 1551 __main__() |
