comparison unipept.py @ 7:75b3b3d0adbf draft

"planemo upload for repository http://unipept.ugent.be/apidocs commit b6707ea113b2a89b0bb8072dfcc9ceeef4a1b708"
author galaxyp
date Tue, 06 Apr 2021 16:13:57 +0000
parents 9aaa46d45472
children 7863f1abcdda
comparison
equal deleted inserted replaced
6:9aaa46d45472 7:75b3b3d0adbf
7 # 7 #
8 #------------------------------------------------------------------------------ 8 #------------------------------------------------------------------------------
9 """ 9 """
10 import json 10 import json
11 import optparse 11 import optparse
12 import re
12 import sys 13 import sys
13 import re
14 import urllib.error 14 import urllib.error
15 import urllib.parse 15 import urllib.parse
16 import urllib.request 16 import urllib.request
17 17
18 18
100 '6.5': 'form phosphoric ester bonds', 100 '6.5': 'form phosphoric ester bonds',
101 '6.6': 'form nitrogen-metal bonds', 101 '6.6': 'form nitrogen-metal bonds',
102 } 102 }
103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] 103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name']
104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] 104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma']
105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] 105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[2:]
106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] 106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id']
107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] 107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids']
108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] 108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] 109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] 110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') 485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') 486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') 487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') 488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches')
489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') 489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/')
490 parser.add_option('-P', '--peptide_match', dest='peptide_match', choices=['best', 'full', 'report'], default='best', help='Match whole peptide')
491 parser.add_option('--unmatched_aa', dest='unmatched_aa', default=None, help='Show unmatched AA in peptide as')
490 # debug 492 # debug
491 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') 493 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST')
492 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') 494 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging')
493 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit') 495 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
494 (options, args) = parser.parse_args() 496 (options, args) = parser.parse_args()
495 if options.version: 497 if options.version:
496 print('%s' % version) 498 print('%s' % version)
497 sys.exit(0) 499 sys.exit(0)
500
501 def tryptic_match_string(peptide, tryptic_matches):
502 if options.unmatched_aa:
503 p = peptide.lower()
504 for m in tryptic_matches:
505 p = p.replace(m.lower(), m)
506 return re.sub('[a-z]', options.unmatched_aa, p)
507 else:
508 return ','.join(tryptic_matches)
509
498 invalid_ec = 2 if options.strict else None 510 invalid_ec = 2 if options.strict else None
499 peptides = [] 511 peptides = []
500 # Get peptide sequences 512 # Get peptide sequences
501 if options.mzid: 513 if options.mzid:
502 peptides += read_mzid(options.mzid) 514 peptides += read_mzid(options.mzid)
520 if options.extra or options.names: 532 if options.extra or options.names:
521 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order 533 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
522 else: 534 else:
523 column_order = pept2lca_column_order 535 column_order = pept2lca_column_order
524 # map to tryptic peptides 536 # map to tryptic peptides
525 pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides} 537 if options.peptide_match == 'full':
538 pepToParts = {p: [p] for p in peptides}
539 else:
540 pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
541 if options.debug:
542 print("column_order: %s\n" % (column_order), file=sys.stderr)
526 partToPeps = {} 543 partToPeps = {}
527 for peptide, parts in pepToParts.items(): 544 for peptide, parts in pepToParts.items():
528 if options.debug: 545 if options.debug:
529 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr) 546 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr)
530 for part in parts: 547 for part in parts:
589 for match in unipept_resp: 606 for match in unipept_resp:
590 mapping.setdefault(match['peptide'], []).append(match) 607 mapping.setdefault(match['peptide'], []).append(match)
591 for peptide in peptides: 608 for peptide in peptides:
592 # Get the intersection of matches to the tryptic parts 609 # Get the intersection of matches to the tryptic parts
593 keyToMatch = None 610 keyToMatch = None
611 tryptic_match = []
594 for part in pepToParts[peptide]: 612 for part in pepToParts[peptide]:
595 if part in mapping: 613 if part in mapping:
614 tryptic_match.append(part)
596 temp = {match[dupkey]: match for match in mapping[part]} 615 temp = {match[dupkey]: match for match in mapping[part]}
597 if keyToMatch: 616 if keyToMatch:
598 dkeys = set(keyToMatch.keys()) - set(temp.keys()) 617 dkeys = set(keyToMatch.keys()) - set(temp.keys())
599 for k in dkeys: 618 for k in dkeys:
600 del keyToMatch[k] 619 del keyToMatch[k]
603 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp 622 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
604 if not keyToMatch: 623 if not keyToMatch:
605 unmatched_peptides.append(peptide) 624 unmatched_peptides.append(peptide)
606 else: 625 else:
607 for key, match in keyToMatch.items(): 626 for key, match in keyToMatch.items():
627 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match)
608 match['tryptic_peptide'] = match['peptide'] 628 match['tryptic_peptide'] = match['peptide']
609 match['peptide'] = peptide 629 match['peptide'] = peptide
610 peptideMatches.append(match) 630 peptideMatches.append(match)
611 elif options.unipept in ['pept2lca', 'peptinfo']: 631 elif options.unipept in ['pept2lca', 'peptinfo']:
612 # should be one response per trypticPeptide for pep2lca 632 # should be one response per trypticPeptide for pep2lca
613 respMap = {v['peptide']: v for v in unipept_resp} 633 respMap = {v['peptide']: v for v in unipept_resp}
614 # map resp back to peptides 634 # map resp back to peptides
615 for peptide in peptides: 635 for peptide in peptides:
616 matches = list() 636 matches = list()
637 tryptic_match = []
617 for part in pepToParts[peptide]: 638 for part in pepToParts[peptide]:
618 if part in respMap: 639 if part in respMap:
640 tryptic_match.append(part)
619 matches.append(respMap[part]) 641 matches.append(respMap[part])
620 match = best_match(peptide, matches) 642 match = best_match(peptide, matches)
621 if not match: 643 if not match:
622 unmatched_peptides.append(peptide) 644 unmatched_peptides.append(peptide)
623 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] 645 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
624 match = {'peptide': longest_tryptic_peptide} 646 match = {'peptide': longest_tryptic_peptide}
647 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match)
625 match['tryptic_peptide'] = match['peptide'] 648 match['tryptic_peptide'] = match['peptide']
626 match['peptide'] = peptide 649 match['peptide'] = peptide
627 peptideMatches.append(match) 650 peptideMatches.append(match)
628 else: 651 else:
629 respMap = {v['peptide']: v for v in unipept_resp} 652 respMap = {v['peptide']: v for v in unipept_resp}
630 # map resp back to peptides 653 # map resp back to peptides
631 for peptide in peptides: 654 for peptide in peptides:
632 matches = list() 655 matches = list()
656 tryptic_match = []
633 for part in pepToParts[peptide]: 657 for part in pepToParts[peptide]:
634 if part in respMap and 'total_protein_count' in respMap[part]: 658 if part in respMap and 'total_protein_count' in respMap[part]:
659 tryptic_match.append(part)
635 matches.append(respMap[part]) 660 matches.append(respMap[part])
636 match = best_match(peptide, matches) 661 match = best_match(peptide, matches)
637 if not match: 662 if not match:
638 unmatched_peptides.append(peptide) 663 unmatched_peptides.append(peptide)
639 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] 664 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
640 match = {'peptide': longest_tryptic_peptide} 665 match = {'peptide': longest_tryptic_peptide}
666 match['tryptic_match'] = tryptic_match_string(peptide, tryptic_match)
641 match['tryptic_peptide'] = match['peptide'] 667 match['tryptic_peptide'] = match['peptide']
642 match['peptide'] = peptide 668 match['peptide'] = peptide
643 peptideMatches.append(match) 669 peptideMatches.append(match)
644 resp = peptideMatches 670 resp = peptideMatches
645 if options.debug: 671 if options.debug:
684 for i, pdict in enumerate(resp): 710 for i, pdict in enumerate(resp):
685 peptide = pdict['peptide'] 711 peptide = pdict['peptide']
686 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' 712 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
687 column_names = ['peptide', 'total_protein_count'] 713 column_names = ['peptide', 'total_protein_count']
688 vals = [peptide, total_protein_count] 714 vals = [peptide, total_protein_count]
715 if options.peptide_match == 'report':
716 column_names = ['peptide', 'tryptic_match', 'total_protein_count']
717 tryptic_match = pdict.get('tryptic_match', '')
718 vals = [peptide, tryptic_match, total_protein_count]
689 if ec_dict: 719 if ec_dict:
690 vals += ec_dict[peptide] 720 vals += ec_dict[peptide]
691 column_names += ec_cols 721 column_names += ec_cols
692 if go_dict: 722 if go_dict:
693 vals += go_dict[peptide] 723 vals += go_dict[peptide]
698 if taxa: 728 if taxa:
699 vals += taxa[peptide][1:] 729 vals += taxa[peptide][1:]
700 column_names += taxon_cols[1:] 730 column_names += taxon_cols[1:]
701 rows.append(vals) 731 rows.append(vals)
702 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: 732 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
733 if options.peptide_match == 'report':
734 column_order.insert(1, 'tryptic_match')
703 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) 735 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
704 column_names = taxon_cols 736 column_names = taxon_cols
705 rows = list(taxa.values()) 737 rows = list(taxa.values())
706 for peptide, vals in taxa.items():
707 rows.append(vals)
708 if options.tsv: 738 if options.tsv:
709 with open(options.tsv, 'w') as outputFile: 739 with open(options.tsv, 'w') as outputFile:
710 if column_names: 740 if column_names:
711 outputFile.write("#%s\n" % '\t'.join(column_names)) 741 outputFile.write("#%s\n" % '\t'.join(column_names))
712 for vals in rows: 742 for vals in rows: