Mercurial > repos > galaxyp > mqppep_preproc
comparison search_ppep.py @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children | b76c75521d91 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8dfd5d2b5903 |
---|---|
1 #!/usr/bin/env python | |
2 # Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB | |
3 | |
4 import argparse | |
5 import os.path | |
6 import re | |
7 import sqlite3 | |
8 import sys # import the sys module for exc_info | |
9 import time | |
10 import traceback # import the traceback module for format_exception | |
11 from codecs import getreader as cx_getreader | |
12 | |
13 # For Aho-Corasick search for fixed set of substrings | |
14 # - add_word | |
15 # - make_automaton | |
16 # - iter | |
17 import ahocorasick | |
18 | |
19 | |
20 # ref: https://stackoverflow.com/a/8915613/15509512 | |
21 # answers: "How to handle exceptions in a list comprehensions" | |
22 # usage: | |
23 # from math import log | |
24 # eggs = [1,3,0,3,2] | |
25 # print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) | |
26 # producing: | |
27 # for <built-in function log> | |
28 # with args (0,) | |
29 # exception: math domain error | |
30 # [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] | |
31 def catch(func, *args, handle=lambda e: e, **kwargs): | |
32 | |
33 try: | |
34 return func(*args, **kwargs) | |
35 except Exception as e: | |
36 print("For %s" % str(func)) | |
37 print(" with args %s" % str(args)) | |
38 print(" caught exception: %s" % str(e)) | |
39 (ty, va, tb) = sys.exc_info() | |
40 print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) | |
41 # exit(-1) | |
42 return None # was handle(e) | |
43 | |
44 | |
45 def __main__(): | |
46 | |
47 DROP_TABLES_SQL = """ | |
48 DROP VIEW IF EXISTS ppep_gene_site_view; | |
49 DROP VIEW IF EXISTS uniprot_view; | |
50 DROP VIEW IF EXISTS uniprotkb_pep_ppep_view; | |
51 DROP VIEW IF EXISTS ppep_intensity_view; | |
52 DROP VIEW IF EXISTS ppep_metadata_view; | |
53 | |
54 DROP TABLE IF EXISTS sample; | |
55 DROP TABLE IF EXISTS ppep; | |
56 DROP TABLE IF EXISTS site_type; | |
57 DROP TABLE IF EXISTS deppep_UniProtKB; | |
58 DROP TABLE IF EXISTS deppep; | |
59 DROP TABLE IF EXISTS ppep_gene_site; | |
60 DROP TABLE IF EXISTS ppep_metadata; | |
61 DROP TABLE IF EXISTS ppep_intensity; | |
62 """ | |
63 | |
64 CREATE_TABLES_SQL = """ | |
65 CREATE TABLE deppep | |
66 ( id INTEGER PRIMARY KEY | |
67 , seq TEXT UNIQUE ON CONFLICT IGNORE | |
68 ) | |
69 ; | |
70 CREATE TABLE deppep_UniProtKB | |
71 ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE | |
72 , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE | |
73 , pos_start INTEGER | |
74 , pos_end INTEGER | |
75 , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end) | |
76 ON CONFLICT IGNORE | |
77 ) | |
78 ; | |
79 CREATE TABLE ppep | |
80 ( id INTEGER PRIMARY KEY | |
81 , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE | |
82 , seq TEXT UNIQUE ON CONFLICT IGNORE | |
83 , scrubbed TEXT | |
84 ); | |
85 CREATE TABLE site_type | |
86 ( id INTEGER PRIMARY KEY | |
87 , type_name TEXT UNIQUE ON CONFLICT IGNORE | |
88 ); | |
89 CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed) | |
90 ; | |
91 CREATE TABLE sample | |
92 ( id INTEGER PRIMARY KEY | |
93 , name TEXT UNIQUE ON CONFLICT IGNORE | |
94 ) | |
95 ; | |
96 CREATE VIEW uniprot_view AS | |
97 SELECT DISTINCT | |
98 Uniprot_ID | |
99 , Description | |
100 , Organism_Name | |
101 , Organism_ID | |
102 , Gene_Name | |
103 , PE | |
104 , SV | |
105 , Sequence | |
106 , Description || | |
107 CASE WHEN Organism_Name = 'N/A' | |
108 THEN '' | |
109 ELSE ' OS='|| Organism_Name | |
110 END || | |
111 CASE WHEN Organism_ID = -1 | |
112 THEN '' | |
113 ELSE ' OX='|| Organism_ID | |
114 END || | |
115 CASE WHEN Gene_Name = 'N/A' | |
116 THEN '' | |
117 ELSE ' GN='|| Gene_Name | |
118 END || | |
119 CASE WHEN PE = 'N/A' | |
120 THEN '' | |
121 ELSE ' PE='|| PE | |
122 END || | |
123 CASE WHEN SV = 'N/A' | |
124 THEN '' | |
125 ELSE ' SV='|| SV | |
126 END AS long_description | |
127 , Database | |
128 FROM UniProtKB | |
129 ; | |
130 CREATE VIEW uniprotkb_pep_ppep_view AS | |
131 SELECT deppep_UniProtKB.UniprotKB_ID AS accession | |
132 , deppep_UniProtKB.pos_start AS pos_start | |
133 , deppep_UniProtKB.pos_end AS pos_end | |
134 , deppep.seq AS peptide | |
135 , ppep.seq AS phosphopeptide | |
136 , ppep.scrubbed AS scrubbed | |
137 , uniprot_view.Sequence AS sequence | |
138 , uniprot_view.Description AS description | |
139 , uniprot_view.long_description AS long_description | |
140 , ppep.id AS ppep_id | |
141 FROM ppep, deppep, deppep_UniProtKB, uniprot_view | |
142 WHERE deppep.id = ppep.deppep_id | |
143 AND deppep.id = deppep_UniProtKB.deppep_id | |
144 AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID | |
145 ORDER BY UniprotKB_ID, deppep.seq, ppep.seq | |
146 ; | |
147 CREATE TABLE ppep_gene_site | |
148 ( ppep_id INTEGER REFERENCES ppep(id) | |
149 , gene_names TEXT | |
150 , site_type_id INTEGER REFERENCES site_type(id) | |
151 , kinase_map TEXT | |
152 , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE | |
153 ) | |
154 ; | |
155 CREATE VIEW ppep_gene_site_view AS | |
156 SELECT DISTINCT | |
157 ppep.seq AS phospho_peptide | |
158 , ppep_id | |
159 , gene_names | |
160 , type_name | |
161 , kinase_map | |
162 FROM | |
163 ppep, ppep_gene_site, site_type | |
164 WHERE | |
165 ppep_gene_site.ppep_id = ppep.id | |
166 AND | |
167 ppep_gene_site.site_type_id = site_type.id | |
168 ORDER BY | |
169 ppep.seq | |
170 ; | |
171 CREATE TABLE ppep_metadata | |
172 ( ppep_id INTEGER REFERENCES ppep(id) | |
173 , protein_description TEXT | |
174 , gene_name TEXT | |
175 , FASTA_name TEXT | |
176 , phospho_sites TEXT | |
177 , motifs_unique TEXT | |
178 , accessions TEXT | |
179 , motifs_all_members TEXT | |
180 , domain TEXT | |
181 , ON_FUNCTION TEXT | |
182 , ON_PROCESS TEXT | |
183 , ON_PROT_INTERACT TEXT | |
184 , ON_OTHER_INTERACT TEXT | |
185 , notes TEXT | |
186 , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE | |
187 ) | |
188 ; | |
189 CREATE VIEW ppep_metadata_view AS | |
190 SELECT DISTINCT | |
191 ppep.seq AS phospho_peptide | |
192 , protein_description | |
193 , gene_name | |
194 , FASTA_name | |
195 , phospho_sites | |
196 , motifs_unique | |
197 , accessions | |
198 , motifs_all_members | |
199 , domain | |
200 , ON_FUNCTION | |
201 , ON_PROCESS | |
202 , ON_PROT_INTERACT | |
203 , ON_OTHER_INTERACT | |
204 , notes | |
205 FROM | |
206 ppep, ppep_metadata | |
207 WHERE | |
208 ppep_metadata.ppep_id = ppep.id | |
209 ORDER BY | |
210 ppep.seq | |
211 ; | |
212 CREATE TABLE ppep_intensity | |
213 ( ppep_id INTEGER REFERENCES ppep(id) | |
214 , sample_id INTEGER | |
215 , intensity INTEGER | |
216 , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE | |
217 ) | |
218 ; | |
219 CREATE VIEW ppep_intensity_view AS | |
220 SELECT DISTINCT | |
221 ppep.seq AS phospho_peptide | |
222 , sample.name AS sample | |
223 , intensity | |
224 FROM | |
225 ppep, sample, ppep_intensity | |
226 WHERE | |
227 ppep_intensity.sample_id = sample.id | |
228 AND | |
229 ppep_intensity.ppep_id = ppep.id | |
230 ; | |
231 """ | |
232 | |
233 UNIPROT_SEQ_AND_ID_SQL = """ | |
234 select Sequence, Uniprot_ID | |
235 from UniProtKB | |
236 """ | |
237 | |
238 # Parse Command Line | |
239 parser = argparse.ArgumentParser( | |
240 description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)." | |
241 ) | |
242 | |
243 # inputs: | |
244 # Phosphopeptide data for experimental results, including the intensities | |
245 # and the mapping to kinase domains, in tabular format. | |
246 parser.add_argument( | |
247 "--phosphopeptides", | |
248 "-p", | |
249 nargs=1, | |
250 required=True, | |
251 dest="phosphopeptides", | |
252 help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool", | |
253 ) | |
254 parser.add_argument( | |
255 "--uniprotkb", | |
256 "-u", | |
257 nargs=1, | |
258 required=True, | |
259 dest="uniprotkb", | |
260 help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool", | |
261 ) | |
262 parser.add_argument( | |
263 "--schema", | |
264 action="store_true", | |
265 dest="db_schema", | |
266 help="show updated database schema", | |
267 ) | |
268 parser.add_argument( | |
269 "--warn-duplicates", | |
270 action="store_true", | |
271 dest="warn_duplicates", | |
272 help="show warnings for duplicated sequences", | |
273 ) | |
274 parser.add_argument( | |
275 "--verbose", | |
276 action="store_true", | |
277 dest="verbose", | |
278 help="show somewhat verbose program tracing", | |
279 ) | |
280 # "Make it so!" (parse the arguments) | |
281 options = parser.parse_args() | |
282 if options.verbose: | |
283 print("options: " + str(options) + "\n") | |
284 | |
285 # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file | |
286 if options.phosphopeptides is None: | |
287 exit('Argument "phosphopeptides" is required but not supplied') | |
288 try: | |
289 f_name = os.path.abspath(options.phosphopeptides[0]) | |
290 except Exception as e: | |
291 exit("Error parsing phosphopeptides argument: %s" % (e)) | |
292 | |
293 # path to SQLite input/output tabular file | |
294 if options.uniprotkb is None: | |
295 exit('Argument "uniprotkb" is required but not supplied') | |
296 try: | |
297 db_name = os.path.abspath(options.uniprotkb[0]) | |
298 except Exception as e: | |
299 exit("Error parsing uniprotkb argument: %s" % (e)) | |
300 | |
301 # print("options.schema is %d" % options.db_schema) | |
302 | |
303 # db_name = "demo/test.sqlite" | |
304 # f_name = "demo/test_input.txt" | |
305 | |
306 con = sqlite3.connect(db_name) | |
307 cur = con.cursor() | |
308 ker = con.cursor() | |
309 | |
310 cur.executescript(DROP_TABLES_SQL) | |
311 | |
312 # if options.db_schema: | |
313 # print("\nAfter dropping tables/views that are to be created, schema is:") | |
314 # cur.execute("SELECT * FROM sqlite_schema") | |
315 # for row in cur.fetchall(): | |
316 # if row[4] is not None: | |
317 # print("%s;" % row[4]) | |
318 | |
319 cur.executescript(CREATE_TABLES_SQL) | |
320 | |
321 if options.db_schema: | |
322 print( | |
323 "\nAfter creating tables/views that are to be created, schema is:" | |
324 ) | |
325 cur.execute("SELECT * FROM sqlite_schema") | |
326 for row in cur.fetchall(): | |
327 if row[4] is not None: | |
328 print("%s;" % row[4]) | |
329 | |
330 def generate_ppep(f): | |
331 # get keys from upstream tabular file using readline() | |
332 # ref: https://stackoverflow.com/a/16713581/15509512 | |
333 # answer to "Use codecs to read file with correct encoding" | |
334 file1_encoded = open(f, "rb") | |
335 file1 = cx_getreader("latin-1")(file1_encoded) | |
336 | |
337 count = 0 | |
338 re_tab = re.compile("^[^\t]*") | |
339 re_quote = re.compile('"') | |
340 while True: | |
341 count += 1 | |
342 # Get next line from file | |
343 line = file1.readline() | |
344 # if line is empty | |
345 # end of file is reached | |
346 if not line: | |
347 break | |
348 if count > 1: | |
349 m = re_tab.match(line) | |
350 m = re_quote.sub("", m[0]) | |
351 yield m | |
352 file1.close() | |
353 file1_encoded.close() | |
354 | |
355 # Build an Aho-Corasick automaton from a trie | |
356 # - ref: | |
357 # - https://pypi.org/project/pyahocorasick/ | |
358 # - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm | |
359 # - https://en.wikipedia.org/wiki/Trie | |
360 auto = ahocorasick.Automaton() | |
361 re_phos = re.compile("p") | |
362 # scrub out unsearchable characters per section | |
363 # "Match the p_peptides to the @sequences array:" | |
364 # of the original | |
365 # PhosphoPeptide Upstream Kinase Mapping.pl | |
366 # which originally read | |
367 # $tmp_p_peptide =~ s/#//g; | |
368 # $tmp_p_peptide =~ s/\d//g; | |
369 # $tmp_p_peptide =~ s/\_//g; | |
370 # $tmp_p_peptide =~ s/\.//g; | |
371 # | |
372 re_scrub = re.compile("0-9_.#") | |
373 ppep_count = 0 | |
374 for ppep in generate_ppep(f_name): | |
375 ppep_count += 1 | |
376 add_to_trie = False | |
377 # print(ppep) | |
378 scrubbed = re_scrub.sub("", ppep) | |
379 deppep = re_phos.sub("", scrubbed) | |
380 if options.verbose: | |
381 print("deppep: %s; scrubbed: %s" % (deppep, scrubbed)) | |
382 # print(deppep) | |
383 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) | |
384 if cur.fetchone() is None: | |
385 add_to_trie = True | |
386 cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,)) | |
387 cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) | |
388 deppep_id = cur.fetchone()[0] | |
389 if add_to_trie: | |
390 # print((deppep_id, deppep)) | |
391 # Build the trie | |
392 auto.add_word(deppep, (deppep_id, deppep)) | |
393 cur.execute( | |
394 "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)", | |
395 (ppep, scrubbed, deppep_id), | |
396 ) | |
397 # def generate_deppep(): | |
398 # cur.execute("SELECT seq FROM deppep") | |
399 # for row in cur.fetchall(): | |
400 # yield row[0] | |
401 cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)") | |
402 for row in cur.fetchall(): | |
403 deppep_count = row[0] | |
404 | |
405 cur.execute( | |
406 "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)" | |
407 ) | |
408 for row in cur.fetchall(): | |
409 sequence_count = row[0] | |
410 | |
411 print("%d phosphopeptides were read from input" % ppep_count) | |
412 print( | |
413 "%d corresponding dephosphopeptides are represented in input" | |
414 % deppep_count | |
415 ) | |
416 # Look for cases where both Gene_Name and Sequence are identical | |
417 cur.execute( | |
418 """ | |
419 SELECT Uniprot_ID, Gene_Name, Sequence | |
420 FROM UniProtKB | |
421 WHERE Sequence IN ( | |
422 SELECT Sequence | |
423 FROM UniProtKB | |
424 GROUP BY Sequence, Gene_Name | |
425 HAVING count(*) > 1 | |
426 ) | |
427 ORDER BY Sequence | |
428 """ | |
429 ) | |
430 duplicate_count = 0 | |
431 old_seq = "" | |
432 for row in cur.fetchall(): | |
433 if duplicate_count == 0: | |
434 print( | |
435 "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)." | |
436 ) | |
437 if row[2] != old_seq: | |
438 old_seq = row[2] | |
439 duplicate_count += 1 | |
440 if options.warn_duplicates: | |
441 print("\n%s\t%s\t%s" % row) | |
442 else: | |
443 if options.warn_duplicates: | |
444 print("%s\t%s" % (row[0], row[1])) | |
445 if duplicate_count > 0: | |
446 print( | |
447 "\n%d sequences have duplicated accession IDs\n" % duplicate_count | |
448 ) | |
449 | |
450 print("%s accession sequences will be searched\n" % sequence_count) | |
451 | |
452 # print(auto.dump()) | |
453 | |
454 # Convert the trie to an automaton (a finite-state machine) | |
455 auto.make_automaton() | |
456 | |
457 # Execute query for seqs and metadata without fetching the results yet | |
458 uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL) | |
459 while 1: | |
460 batch = uniprot_seq_and_id.fetchmany(size=50) | |
461 if not batch: | |
462 break | |
463 for Sequence, UniProtKB_id in batch: | |
464 if Sequence is not None: | |
465 for end_index, (insert_order, original_value) in auto.iter( | |
466 Sequence | |
467 ): | |
468 ker.execute( | |
469 """ | |
470 INSERT INTO deppep_UniProtKB | |
471 (deppep_id,UniProtKB_id,pos_start,pos_end) | |
472 VALUES (?,?,?,?) | |
473 """, | |
474 ( | |
475 insert_order, | |
476 UniProtKB_id, | |
477 1 + end_index - len(original_value), | |
478 end_index, | |
479 ), | |
480 ) | |
481 else: | |
482 raise ValueError( | |
483 "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" | |
484 % (UniProtKB_id,) | |
485 ) | |
486 ker.execute( | |
487 """ | |
488 SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found' | |
489 FROM uniprotkb_pep_ppep_view | |
490 """ | |
491 ) | |
492 for row in ker.fetchall(): | |
493 print(row[0]) | |
494 | |
495 ker.execute( | |
496 """ | |
497 SELECT count(*) || ' accession matches were found', count(*) AS accession_count | |
498 FROM ( | |
499 SELECT accession | |
500 FROM uniprotkb_pep_ppep_view | |
501 GROUP BY accession | |
502 ) | |
503 """ | |
504 ) | |
505 for row in ker.fetchall(): | |
506 print(row[0]) | |
507 | |
508 ker.execute( | |
509 """ | |
510 SELECT count(*) || ' peptide matches were found' | |
511 FROM ( | |
512 SELECT peptide | |
513 FROM uniprotkb_pep_ppep_view | |
514 GROUP BY peptide | |
515 ) | |
516 """ | |
517 ) | |
518 for row in ker.fetchall(): | |
519 print(row[0]) | |
520 | |
521 ker.execute( | |
522 """ | |
523 SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count | |
524 FROM ( | |
525 SELECT phosphopeptide | |
526 FROM uniprotkb_pep_ppep_view | |
527 GROUP BY phosphopeptide | |
528 ) | |
529 """ | |
530 ) | |
531 for row in ker.fetchall(): | |
532 print(row[0]) | |
533 | |
534 # link peptides not found in sequence database to a dummy sequence-record | |
535 ker.execute( | |
536 """ | |
537 INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end) | |
538 SELECT id, 'No Uniprot_ID', 0, 0 | |
539 FROM deppep | |
540 WHERE id NOT IN (SELECT deppep_id FROM deppep_UniProtKB) | |
541 """ | |
542 ) | |
543 | |
544 con.commit() | |
545 ker.execute("vacuum") | |
546 con.close() | |
547 | |
548 | |
549 if __name__ == "__main__": | |
550 wrap_start_time = time.perf_counter() | |
551 __main__() | |
552 wrap_stop_time = time.perf_counter() | |
553 # print(wrap_start_time) | |
554 # print(wrap_stop_time) | |
555 print( | |
556 "\nThe matching process took %d milliseconds to run.\n" | |
557 % ((wrap_stop_time - wrap_start_time) * 1000), | |
558 ) | |
559 | |
560 # vim: sw=4 ts=4 et ai : |