Mercurial > repos > galaxyp > unipept
comparison unipept.py @ 7:75b3b3d0adbf draft
"planemo upload for repository http://unipept.ugent.be/apidocs commit b6707ea113b2a89b0bb8072dfcc9ceeef4a1b708"
author | galaxyp |
---|---|
date | Tue, 06 Apr 2021 16:13:57 +0000 |
parents | 9aaa46d45472 |
children | 7863f1abcdda |
comparison
equal
deleted
inserted
replaced
6:9aaa46d45472 | 7:75b3b3d0adbf |
---|---|
7 # | 7 # |
8 #------------------------------------------------------------------------------ | 8 #------------------------------------------------------------------------------ |
9 """ | 9 """ |
10 import json | 10 import json |
11 import optparse | 11 import optparse |
12 import re | |
12 import sys | 13 import sys |
13 import re | |
14 import urllib.error | 14 import urllib.error |
15 import urllib.parse | 15 import urllib.parse |
16 import urllib.request | 16 import urllib.request |
17 | 17 |
18 | 18 |
100 '6.5': 'form phosphoric ester bonds', | 100 '6.5': 'form phosphoric ester bonds', |
101 '6.6': 'form nitrogen-metal bonds', | 101 '6.6': 'form nitrogen-metal bonds', |
102 } | 102 } |
103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] | 103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] |
104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] | 104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] |
105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] | 105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[2:] |
106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] | 106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] |
107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] | 107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] |
108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] | 108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] |
109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] | 109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] |
110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] | 110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] |
485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') | 485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') |
486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') | 486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') |
487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') | 487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') |
488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') | 488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') |
489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') | 489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') |
490 parser.add_option('-P', '--peptide_match', dest='peptide_match', choices=['best', 'full', 'report'], default='best', help='Match whole peptide') | |
491 parser.add_option('--unmatched_aa', dest='unmatched_aa', default=None, help='Show unmatched AA in peptide as') | |
490 # debug | 492 # debug |
491 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') | 493 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') |
492 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') | 494 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') |
493 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit') | 495 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit') |
494 (options, args) = parser.parse_args() | 496 (options, args) = parser.parse_args() |
495 if options.version: | 497 if options.version: |
496 print('%s' % version) | 498 print('%s' % version) |
497 sys.exit(0) | 499 sys.exit(0) |
500 | |
501 def tryptic_match_string(peptide, tryptic_matches): | |
502 if options.unmatched_aa: | |
503 p = peptide.lower() | |
504 for m in tryptic_matches: | |
505 p = p.replace(m.lower(), m) | |
506 return re.sub('[a-z]', options.unmatched_aa, p) | |
507 else: | |
508 return ','.join(tryptic_matches) | |
509 | |
498 invalid_ec = 2 if options.strict else None | 510 invalid_ec = 2 if options.strict else None |
499 peptides = [] | 511 peptides = [] |
500 # Get peptide sequences | 512 # Get peptide sequences |
501 if options.mzid: | 513 if options.mzid: |
502 peptides += read_mzid(options.mzid) | 514 peptides += read_mzid(options.mzid) |
520 if options.extra or options.names: | 532 if options.extra or options.names: |
521 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order | 533 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order |
522 else: | 534 else: |
523 column_order = pept2lca_column_order | 535 column_order = pept2lca_column_order |
524 # map to tryptic peptides | 536 # map to tryptic peptides |
525 pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} | 537 if options.peptide_match == 'full': |
538 pepToParts = {p: [p] for p in peptides} | |
539 else: | |
540 pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} | |
541 if options.debug: | |
542 print("column_order: %s\n" % (column_order), file=sys.stderr) | |
526 partToPeps = {} | 543 partToPeps = {} |
527 for peptide, parts in pepToParts.items(): | 544 for peptide, parts in pepToParts.items(): |
528 if options.debug: | 545 if options.debug: |
529 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr) | 546 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr) |
530 for part in parts: | 547 for part in parts: |
589 for match in unipept_resp: | 606 for match in unipept_resp: |
590 mapping.setdefault(match['peptide'], []).append(match) | 607 mapping.setdefault(match['peptide'], []).append(match) |
591 for peptide in peptides: | 608 for peptide in peptides: |
592 # Get the intersection of matches to the tryptic parts | 609 # Get the intersection of matches to the tryptic parts |
593 keyToMatch = None | 610 keyToMatch = None |
611 tryptic_match = [] | |
594 for part in pepToParts[peptide]: | 612 for part in pepToParts[peptide]: |
595 if part in mapping: | 613 if part in mapping: |
614 tryptic_match.append(part) | |
596 temp = {match[dupkey]: match for match in mapping[part]} | 615 temp = {match[dupkey]: match for match in mapping[part]} |
597 if keyToMatch: | 616 if keyToMatch: |
598 dkeys = set(keyToMatch.keys()) - set(temp.keys()) | 617 dkeys = set(keyToMatch.keys()) - set(temp.keys()) |
599 for k in dkeys: | 618 for k in dkeys: |
600 del keyToMatch[k] | 619 del keyToMatch[k] |
603 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp | 622 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp |
604 if not keyToMatch: | 623 if not keyToMatch: |
605 unmatched_peptides.append(peptide) | 624 unmatched_peptides.append(peptide) |
606 else: | 625 else: |
607 for key, match in keyToMatch.items(): | 626 for key, match in keyToMatch.items(): |
627 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) | |
608 match['tryptic_peptide'] = match['peptide'] | 628 match['tryptic_peptide'] = match['peptide'] |
609 match['peptide'] = peptide | 629 match['peptide'] = peptide |
610 peptideMatches.append(match) | 630 peptideMatches.append(match) |
611 elif options.unipept in ['pept2lca', 'peptinfo']: | 631 elif options.unipept in ['pept2lca', 'peptinfo']: |
612 # should be one response per trypticPeptide for pep2lca | 632 # should be one response per trypticPeptide for pep2lca |
613 respMap = {v['peptide']: v for v in unipept_resp} | 633 respMap = {v['peptide']: v for v in unipept_resp} |
614 # map resp back to peptides | 634 # map resp back to peptides |
615 for peptide in peptides: | 635 for peptide in peptides: |
616 matches = list() | 636 matches = list() |
637 tryptic_match = [] | |
617 for part in pepToParts[peptide]: | 638 for part in pepToParts[peptide]: |
618 if part in respMap: | 639 if part in respMap: |
640 tryptic_match.append(part) | |
619 matches.append(respMap[part]) | 641 matches.append(respMap[part]) |
620 match = best_match(peptide, matches) | 642 match = best_match(peptide, matches) |
621 if not match: | 643 if not match: |
622 unmatched_peptides.append(peptide) | 644 unmatched_peptides.append(peptide) |
623 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] | 645 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] |
624 match = {'peptide': longest_tryptic_peptide} | 646 match = {'peptide': longest_tryptic_peptide} |
647 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) | |
625 match['tryptic_peptide'] = match['peptide'] | 648 match['tryptic_peptide'] = match['peptide'] |
626 match['peptide'] = peptide | 649 match['peptide'] = peptide |
627 peptideMatches.append(match) | 650 peptideMatches.append(match) |
628 else: | 651 else: |
629 respMap = {v['peptide']: v for v in unipept_resp} | 652 respMap = {v['peptide']: v for v in unipept_resp} |
630 # map resp back to peptides | 653 # map resp back to peptides |
631 for peptide in peptides: | 654 for peptide in peptides: |
632 matches = list() | 655 matches = list() |
656 tryptic_match = [] | |
633 for part in pepToParts[peptide]: | 657 for part in pepToParts[peptide]: |
634 if part in respMap and 'total_protein_count' in respMap[part]: | 658 if part in respMap and 'total_protein_count' in respMap[part]: |
659 tryptic_match.append(part) | |
635 matches.append(respMap[part]) | 660 matches.append(respMap[part]) |
636 match = best_match(peptide, matches) | 661 match = best_match(peptide, matches) |
637 if not match: | 662 if not match: |
638 unmatched_peptides.append(peptide) | 663 unmatched_peptides.append(peptide) |
639 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] | 664 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] |
640 match = {'peptide': longest_tryptic_peptide} | 665 match = {'peptide': longest_tryptic_peptide} |
666 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match) | |
641 match['tryptic_peptide'] = match['peptide'] | 667 match['tryptic_peptide'] = match['peptide'] |
642 match['peptide'] = peptide | 668 match['peptide'] = peptide |
643 peptideMatches.append(match) | 669 peptideMatches.append(match) |
644 resp = peptideMatches | 670 resp = peptideMatches |
645 if options.debug: | 671 if options.debug: |
684 for i, pdict in enumerate(resp): | 710 for i, pdict in enumerate(resp): |
685 peptide = pdict['peptide'] | 711 peptide = pdict['peptide'] |
686 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' | 712 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' |
687 column_names = ['peptide', 'total_protein_count'] | 713 column_names = ['peptide', 'total_protein_count'] |
688 vals = [peptide, total_protein_count] | 714 vals = [peptide, total_protein_count] |
715 if options.peptide_match == 'report': | |
716 column_names = ['peptide', 'tryptic_match', 'total_protein_count'] | |
717 tryptic_match = pdict.get('tryptic_match', '') | |
718 vals = [peptide, tryptic_match, total_protein_count] | |
689 if ec_dict: | 719 if ec_dict: |
690 vals += ec_dict[peptide] | 720 vals += ec_dict[peptide] |
691 column_names += ec_cols | 721 column_names += ec_cols |
692 if go_dict: | 722 if go_dict: |
693 vals += go_dict[peptide] | 723 vals += go_dict[peptide] |
698 if taxa: | 728 if taxa: |
699 vals += taxa[peptide][1:] | 729 vals += taxa[peptide][1:] |
700 column_names += taxon_cols[1:] | 730 column_names += taxon_cols[1:] |
701 rows.append(vals) | 731 rows.append(vals) |
702 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: | 732 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: |
733 if options.peptide_match == 'report': | |
734 column_order.insert(1, 'tryptic_match') | |
703 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) | 735 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) |
704 column_names = taxon_cols | 736 column_names = taxon_cols |
705 rows = list(taxa.values()) | 737 rows = list(taxa.values()) |
706 for peptide, vals in taxa.items(): | |
707 rows.append(vals) | |
708 if options.tsv: | 738 if options.tsv: |
709 with open(options.tsv, 'w') as outputFile: | 739 with open(options.tsv, 'w') as outputFile: |
710 if column_names: | 740 if column_names: |
711 outputFile.write("#%s\n" % '\t'.join(column_names)) | 741 outputFile.write("#%s\n" % '\t'.join(column_names)) |
712 for vals in rows: | 742 for vals in rows: |