Mercurial > repos > galaxyp > unipept
diff unipept.py @ 7:75b3b3d0adbf draft
"planemo upload for repository http://unipept.ugent.be/apidocs commit b6707ea113b2a89b0bb8072dfcc9ceeef4a1b708"
author | galaxyp |
---|---|
date | Tue, 06 Apr 2021 16:13:57 +0000 |
parents | 9aaa46d45472 |
children | 7863f1abcdda |
line wrap: on
line diff
--- a/unipept.py Tue Jun 02 10:30:01 2020 -0400 +++ b/unipept.py Tue Apr 06 16:13:57 2021 +0000 @@ -9,8 +9,8 @@ """ import json import optparse +import re import sys -import re import urllib.error import urllib.parse import urllib.request @@ -102,7 +102,7 @@ } pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] -pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] +pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[2:] pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] @@ -487,6 +487,8 @@ parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') + parser.add_option('-P', '--peptide_match', dest='peptide_match', choices=['best', 'full', 'report'], default='best', help='Match whole peptide') + parser.add_option('--unmatched_aa', dest='unmatched_aa', default=None, help='Show unmatched AA in peptide as') # debug parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') @@ -495,6 +497,16 @@ if options.version: print('%s' % version) sys.exit(0) + + def tryptic_match_string(peptide, tryptic_matches): + if options.unmatched_aa: + p = peptide.lower() + for m in tryptic_matches: + p = p.replace(m.lower(), m) + return re.sub('[a-z]', options.unmatched_aa, p) + else: + return ','.join(tryptic_matches) + invalid_ec = 2 if options.strict else None peptides = [] # Get peptide sequences @@ -522,7 +534,12 @@ else: column_order = pept2lca_column_order # map to tryptic peptides - pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} + if options.peptide_match == 'full': + pepToParts = {p: [p] for p in peptides} + else: + pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} + if options.debug: + print("column_order: %s\n" % (column_order), file=sys.stderr) partToPeps = {} for peptide, parts in pepToParts.items(): if options.debug: @@ -591,8 +608,10 @@ for peptide in peptides: # Get the intersection of matches to the tryptic parts keyToMatch = None + tryptic_match = [] for part in pepToParts[peptide]: if part in mapping: + tryptic_match.append(part) temp = {match[dupkey]: match for match in mapping[part]} if keyToMatch: dkeys = set(keyToMatch.keys()) - set(temp.keys()) @@ -605,6 +624,7 @@ unmatched_peptides.append(peptide) else: for key, match in keyToMatch.items(): + match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) @@ -614,14 +634,17 @@ # map resp back to peptides for peptide in peptides: matches = list() + tryptic_match = [] for part in pepToParts[peptide]: if part in respMap: + tryptic_match.append(part) matches.append(respMap[part]) match = best_match(peptide, matches) if not match: unmatched_peptides.append(peptide) longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] match = {'peptide': longest_tryptic_peptide} + match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) @@ -630,14 +653,17 @@ # map resp back to peptides for peptide in peptides: matches = list() + tryptic_match = [] for part in pepToParts[peptide]: if part in respMap and 'total_protein_count' in respMap[part]: + tryptic_match.append(part) matches.append(respMap[part]) match = best_match(peptide, matches) if not match: unmatched_peptides.append(peptide) longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] match = {'peptide': longest_tryptic_peptide} + match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) @@ -686,6 +712,10 @@ total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' column_names = ['peptide', 'total_protein_count'] vals = [peptide, total_protein_count] + if options.peptide_match == 'report': + column_names = ['peptide', 'tryptic_match', 'total_protein_count'] + tryptic_match = pdict.get('tryptic_match', '') + vals = [peptide, tryptic_match, total_protein_count] if ec_dict: vals += ec_dict[peptide] column_names += ec_cols @@ -700,11 +730,11 @@ column_names += taxon_cols[1:] rows.append(vals) elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: + if options.peptide_match == 'report': + column_order.insert(1, 'tryptic_match') (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) column_names = taxon_cols rows = list(taxa.values()) - for peptide, vals in taxa.items(): - rows.append(vals) if options.tsv: with open(options.tsv, 'w') as outputFile: if column_names: