Mercurial > repos > galaxyp > mqppep_preproc
diff search_ppep.py @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children | b76c75521d91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/search_ppep.py Mon Jul 11 19:22:54 2022 +0000 @@ -0,0 +1,560 @@ +#!/usr/bin/env python +# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB + +import argparse +import os.path +import re +import sqlite3 +import sys # import the sys module for exc_info +import time +import traceback # import the traceback module for format_exception +from codecs import getreader as cx_getreader + +# For Aho-Corasick search for fixed set of substrings +# - add_word +# - make_automaton +# - iter +import ahocorasick + + +# ref: https://stackoverflow.com/a/8915613/15509512 +# answers: "How to handle exceptions in a list comprehensions" +# usage: +# from math import log +# eggs = [1,3,0,3,2] +# print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) +# producing: +# for <built-in function log> +# with args (0,) +# exception: math domain error +# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] +def catch(func, *args, handle=lambda e: e, **kwargs): + + try: + return func(*args, **kwargs) + except Exception as e: + print("For %s" % str(func)) + print(" with args %s" % str(args)) + print(" caught exception: %s" % str(e)) + (ty, va, tb) = sys.exc_info() + print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) + # exit(-1) + return None # was handle(e) + + +def __main__(): + + DROP_TABLES_SQL = """ + DROP VIEW IF EXISTS ppep_gene_site_view; + DROP VIEW IF EXISTS uniprot_view; + DROP VIEW IF EXISTS uniprotkb_pep_ppep_view; + DROP VIEW IF EXISTS ppep_intensity_view; + DROP VIEW IF EXISTS ppep_metadata_view; + + DROP TABLE IF EXISTS sample; + DROP TABLE IF EXISTS ppep; + DROP TABLE IF EXISTS site_type; + DROP TABLE IF EXISTS deppep_UniProtKB; + DROP TABLE IF EXISTS deppep; + DROP TABLE IF EXISTS ppep_gene_site; + DROP TABLE IF EXISTS ppep_metadata; + DROP TABLE IF EXISTS ppep_intensity; + """ + + CREATE_TABLES_SQL = """ + CREATE TABLE deppep + ( id INTEGER PRIMARY KEY + , seq TEXT UNIQUE ON CONFLICT IGNORE + ) + ; + CREATE TABLE deppep_UniProtKB + ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE + , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE + , pos_start INTEGER + , pos_end INTEGER + , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end) + ON CONFLICT IGNORE + ) + ; + CREATE TABLE ppep + ( id INTEGER PRIMARY KEY + , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE + , seq TEXT UNIQUE ON CONFLICT IGNORE + , scrubbed TEXT + ); + CREATE TABLE site_type + ( id INTEGER PRIMARY KEY + , type_name TEXT UNIQUE ON CONFLICT IGNORE + ); + CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed) + ; + CREATE TABLE sample + ( id INTEGER PRIMARY KEY + , name TEXT UNIQUE ON CONFLICT IGNORE + ) + ; + CREATE VIEW uniprot_view AS + SELECT DISTINCT + Uniprot_ID + , Description + , Organism_Name + , Organism_ID + , Gene_Name + , PE + , SV + , Sequence + , Description || + CASE WHEN Organism_Name = 'N/A' + THEN '' + ELSE ' OS='|| Organism_Name + END || + CASE WHEN Organism_ID = -1 + THEN '' + ELSE ' OX='|| Organism_ID + END || + CASE WHEN Gene_Name = 'N/A' + THEN '' + ELSE ' GN='|| Gene_Name + END || + CASE WHEN PE = 'N/A' + THEN '' + ELSE ' PE='|| PE + END || + CASE WHEN SV = 'N/A' + THEN '' + ELSE ' SV='|| SV + END AS long_description + , Database + FROM UniProtKB + ; + CREATE VIEW uniprotkb_pep_ppep_view AS + SELECT deppep_UniProtKB.UniprotKB_ID AS accession + , deppep_UniProtKB.pos_start AS pos_start + , deppep_UniProtKB.pos_end AS pos_end + , deppep.seq AS peptide + , ppep.seq AS phosphopeptide + , ppep.scrubbed AS scrubbed + , uniprot_view.Sequence AS sequence + , uniprot_view.Description AS description + , uniprot_view.long_description AS long_description + , ppep.id AS ppep_id + FROM ppep, deppep, deppep_UniProtKB, uniprot_view + WHERE deppep.id = ppep.deppep_id + AND deppep.id = deppep_UniProtKB.deppep_id + AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID + ORDER BY UniprotKB_ID, deppep.seq, ppep.seq + ; + CREATE TABLE ppep_gene_site + ( ppep_id INTEGER REFERENCES ppep(id) + , gene_names TEXT + , site_type_id INTEGER REFERENCES site_type(id) + , kinase_map TEXT + , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE + ) + ; + CREATE VIEW ppep_gene_site_view AS + SELECT DISTINCT + ppep.seq AS phospho_peptide + , ppep_id + , gene_names + , type_name + , kinase_map + FROM + ppep, ppep_gene_site, site_type + WHERE + ppep_gene_site.ppep_id = ppep.id + AND + ppep_gene_site.site_type_id = site_type.id + ORDER BY + ppep.seq + ; + CREATE TABLE ppep_metadata + ( ppep_id INTEGER REFERENCES ppep(id) + , protein_description TEXT + , gene_name TEXT + , FASTA_name TEXT + , phospho_sites TEXT + , motifs_unique TEXT + , accessions TEXT + , motifs_all_members TEXT + , domain TEXT + , ON_FUNCTION TEXT + , ON_PROCESS TEXT + , ON_PROT_INTERACT TEXT + , ON_OTHER_INTERACT TEXT + , notes TEXT + , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE + ) + ; + CREATE VIEW ppep_metadata_view AS + SELECT DISTINCT + ppep.seq AS phospho_peptide + , protein_description + , gene_name + , FASTA_name + , phospho_sites + , motifs_unique + , accessions + , motifs_all_members + , domain + , ON_FUNCTION + , ON_PROCESS + , ON_PROT_INTERACT + , ON_OTHER_INTERACT + , notes + FROM + ppep, ppep_metadata + WHERE + ppep_metadata.ppep_id = ppep.id + ORDER BY + ppep.seq + ; + CREATE TABLE ppep_intensity + ( ppep_id INTEGER REFERENCES ppep(id) + , sample_id INTEGER + , intensity INTEGER + , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE + ) + ; + CREATE VIEW ppep_intensity_view AS + SELECT DISTINCT + ppep.seq AS phospho_peptide + , sample.name AS sample + , intensity + FROM + ppep, sample, ppep_intensity + WHERE + ppep_intensity.sample_id = sample.id + AND + ppep_intensity.ppep_id = ppep.id + ; + """ + + UNIPROT_SEQ_AND_ID_SQL = """ + select Sequence, Uniprot_ID + from UniProtKB + """ + + # Parse Command Line + parser = argparse.ArgumentParser( + description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)." + ) + + # inputs: + # Phosphopeptide data for experimental results, including the intensities + # and the mapping to kinase domains, in tabular format. + parser.add_argument( + "--phosphopeptides", + "-p", + nargs=1, + required=True, + dest="phosphopeptides", + help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool", + ) + parser.add_argument( + "--uniprotkb", + "-u", + nargs=1, + required=True, + dest="uniprotkb", + help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool", + ) + parser.add_argument( + "--schema", + action="store_true", + dest="db_schema", + help="show updated database schema", + ) + parser.add_argument( + "--warn-duplicates", + action="store_true", + dest="warn_duplicates", + help="show warnings for duplicated sequences", + ) + parser.add_argument( + "--verbose", + action="store_true", + dest="verbose", + help="show somewhat verbose program tracing", + ) + # "Make it so!" (parse the arguments) + options = parser.parse_args() + if options.verbose: + print("options: " + str(options) + "\n") + + # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file + if options.phosphopeptides is None: + exit('Argument "phosphopeptides" is required but not supplied') + try: + f_name = os.path.abspath(options.phosphopeptides[0]) + except Exception as e: + exit("Error parsing phosphopeptides argument: %s" % (e)) + + # path to SQLite input/output tabular file + if options.uniprotkb is None: + exit('Argument "uniprotkb" is required but not supplied') + try: + db_name = os.path.abspath(options.uniprotkb[0]) + except Exception as e: + exit("Error parsing uniprotkb argument: %s" % (e)) + + # print("options.schema is %d" % options.db_schema) + + # db_name = "demo/test.sqlite" + # f_name = "demo/test_input.txt" + + con = sqlite3.connect(db_name) + cur = con.cursor() + ker = con.cursor() + + cur.executescript(DROP_TABLES_SQL) + + # if options.db_schema: + # print("\nAfter dropping tables/views that are to be created, schema is:") + # cur.execute("SELECT * FROM sqlite_schema") + # for row in cur.fetchall(): + # if row[4] is not None: + # print("%s;" % row[4]) + + cur.executescript(CREATE_TABLES_SQL) + + if options.db_schema: + print( + "\nAfter creating tables/views that are to be created, schema is:" + ) + cur.execute("SELECT * FROM sqlite_schema") + for row in cur.fetchall(): + if row[4] is not None: + print("%s;" % row[4]) + + def generate_ppep(f): + # get keys from upstream tabular file using readline() + # ref: https://stackoverflow.com/a/16713581/15509512 + # answer to "Use codecs to read file with correct encoding" + file1_encoded = open(f, "rb") + file1 = cx_getreader("latin-1")(file1_encoded) + + count = 0 + re_tab = re.compile("^[^\t]*") + re_quote = re.compile('"') + while True: + count += 1 + # Get next line from file + line = file1.readline() + # if line is empty + # end of file is reached + if not line: + break + if count > 1: + m = re_tab.match(line) + m = re_quote.sub("", m[0]) + yield m + file1.close() + file1_encoded.close() + + # Build an Aho-Corasick automaton from a trie + # - ref: + # - https://pypi.org/project/pyahocorasick/ + # - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm + # - https://en.wikipedia.org/wiki/Trie + auto = ahocorasick.Automaton() + re_phos = re.compile("p") + # scrub out unsearchable characters per section + # "Match the p_peptides to the @sequences array:" + # of the original + # PhosphoPeptide Upstream Kinase Mapping.pl + # which originally read + # $tmp_p_peptide =~ s/#//g; + # $tmp_p_peptide =~ s/\d//g; + # $tmp_p_peptide =~ s/\_//g; + # $tmp_p_peptide =~ s/\.//g; + # + re_scrub = re.compile("0-9_.#") + ppep_count = 0 + for ppep in generate_ppep(f_name): + ppep_count += 1 + add_to_trie = False + # print(ppep) + scrubbed = re_scrub.sub("", ppep) + deppep = re_phos.sub("", scrubbed) + if options.verbose: + print("deppep: %s; scrubbed: %s" % (deppep, scrubbed)) + # print(deppep) + cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) + if cur.fetchone() is None: + add_to_trie = True + cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,)) + cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,)) + deppep_id = cur.fetchone()[0] + if add_to_trie: + # print((deppep_id, deppep)) + # Build the trie + auto.add_word(deppep, (deppep_id, deppep)) + cur.execute( + "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)", + (ppep, scrubbed, deppep_id), + ) + # def generate_deppep(): + # cur.execute("SELECT seq FROM deppep") + # for row in cur.fetchall(): + # yield row[0] + cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)") + for row in cur.fetchall(): + deppep_count = row[0] + + cur.execute( + "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)" + ) + for row in cur.fetchall(): + sequence_count = row[0] + + print("%d phosphopeptides were read from input" % ppep_count) + print( + "%d corresponding dephosphopeptides are represented in input" + % deppep_count + ) + # Look for cases where both Gene_Name and Sequence are identical + cur.execute( + """ + SELECT Uniprot_ID, Gene_Name, Sequence + FROM UniProtKB + WHERE Sequence IN ( + SELECT Sequence + FROM UniProtKB + GROUP BY Sequence, Gene_Name + HAVING count(*) > 1 + ) + ORDER BY Sequence + """ + ) + duplicate_count = 0 + old_seq = "" + for row in cur.fetchall(): + if duplicate_count == 0: + print( + "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)." + ) + if row[2] != old_seq: + old_seq = row[2] + duplicate_count += 1 + if options.warn_duplicates: + print("\n%s\t%s\t%s" % row) + else: + if options.warn_duplicates: + print("%s\t%s" % (row[0], row[1])) + if duplicate_count > 0: + print( + "\n%d sequences have duplicated accession IDs\n" % duplicate_count + ) + + print("%s accession sequences will be searched\n" % sequence_count) + + # print(auto.dump()) + + # Convert the trie to an automaton (a finite-state machine) + auto.make_automaton() + + # Execute query for seqs and metadata without fetching the results yet + uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL) + while 1: + batch = uniprot_seq_and_id.fetchmany(size=50) + if not batch: + break + for Sequence, UniProtKB_id in batch: + if Sequence is not None: + for end_index, (insert_order, original_value) in auto.iter( + Sequence + ): + ker.execute( + """ + INSERT INTO deppep_UniProtKB + (deppep_id,UniProtKB_id,pos_start,pos_end) + VALUES (?,?,?,?) + """, + ( + insert_order, + UniProtKB_id, + 1 + end_index - len(original_value), + end_index, + ), + ) + else: + raise ValueError( + "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID" + % (UniProtKB_id,) + ) + ker.execute( + """ + SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found' + FROM uniprotkb_pep_ppep_view + """ + ) + for row in ker.fetchall(): + print(row[0]) + + ker.execute( + """ + SELECT count(*) || ' accession matches were found', count(*) AS accession_count + FROM ( + SELECT accession + FROM uniprotkb_pep_ppep_view + GROUP BY accession + ) + """ + ) + for row in ker.fetchall(): + print(row[0]) + + ker.execute( + """ + SELECT count(*) || ' peptide matches were found' + FROM ( + SELECT peptide + FROM uniprotkb_pep_ppep_view + GROUP BY peptide + ) + """ + ) + for row in ker.fetchall(): + print(row[0]) + + ker.execute( + """ + SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count + FROM ( + SELECT phosphopeptide + FROM uniprotkb_pep_ppep_view + GROUP BY phosphopeptide + ) + """ + ) + for row in ker.fetchall(): + print(row[0]) + + # link peptides not found in sequence database to a dummy sequence-record + ker.execute( + """ + INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end) + SELECT id, 'No Uniprot_ID', 0, 0 + FROM deppep + WHERE id NOT IN (SELECT deppep_id FROM deppep_UniProtKB) + """ + ) + + con.commit() + ker.execute("vacuum") + con.close() + + +if __name__ == "__main__": + wrap_start_time = time.perf_counter() + __main__() + wrap_stop_time = time.perf_counter() + # print(wrap_start_time) + # print(wrap_stop_time) + print( + "\nThe matching process took %d milliseconds to run.\n" + % ((wrap_stop_time - wrap_start_time) * 1000), + ) + +# vim: sw=4 ts=4 et ai :