Mercurial > repos > galaxyp > unipept
view unipept.py @ 3:34758ab8aaa4 draft
Uploaded
author | galaxyp |
---|---|
date | Mon, 20 Feb 2017 10:32:03 -0500 |
parents | 0c1ee95282fa |
children | 4953dcd7dd39 |
line wrap: on
line source
#!/usr/bin/env python """ # #------------------------------------------------------------------------------ # University of Minnesota # Copyright 2015, Regents of the University of Minnesota #------------------------------------------------------------------------------ # Author: # # James E Johnson # #------------------------------------------------------------------------------ """ import json import logging import optparse from optparse import OptionParser import os import sys import re import urllib import urllib2 try: import xml.etree.cElementTree as ET except ImportError: import xml.etree.ElementTree as ET def warn_err(msg,exit_code=1): sys.stderr.write(msg) if exit_code: sys.exit(exit_code) pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] def __main__(): version = '2.0' pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' def read_tabular(filepath,col): peptides = [] with open(filepath) as fp: for i,line in enumerate(fp): if line.strip() == '' or line.startswith('#'): continue fields = line.rstrip('\n').split('\t') peptide = fields[col] if not re.match(pep_pat,peptide): warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec) peptides.append(peptide) return peptides def get_fasta_entries(fp): name, seq = None, [] for line in fp: line = line.rstrip() if line.startswith(">"): if name: yield (name, ''.join(seq)) name, seq = line, [] else: seq.append(line) if name: yield (name, ''.join(seq)) def read_fasta(filepath): peptides = [] with open(filepath) as fp: for id, peptide in get_fasta_entries(fp): if not re.match(pep_pat,peptide): warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec) peptides.append(peptide) return peptides def read_mzid(fp): peptides = [] for event, elem in ET.iterparse(fp): if event == 'end': if re.search('PeptideSequence',elem.tag): peptides.append(elem.text) return peptides def read_pepxml(fp): peptides = [] for event, elem in ET.iterparse(fp): if event == 'end': if re.search('search_hit',elem.tag): peptides.append(elem.get('peptide')) return peptides def best_match(peptide,matches): if not matches: return None elif len(matches) == 1: return matches[0].copy() else: # find the most specific match (peptide is always the first column order field) for col in reversed(pept2lca_extra_column_order[1:]): col_id = col+"_id" if options.extra else col for match in matches: if 'taxon_rank' in match and match['taxon_rank'] == col: return match.copy() if col_id in match and match[col_id]: return match.copy() return None #Parse Command Line parser = optparse.OptionParser() # unipept API choice parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) # input files parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) # Unipept Flags parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) # output fields parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) # Warn vs Error Flag parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) # output files parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) # debug parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) (options, args) = parser.parse_args() if options.version: print >> sys.stdout,"%s" % version sys.exit(0) invalid_ec = 2 if options.strict else None peptides = [] ## Get peptide sequences if options.mzid: peptides += read_mzid(options.mzid) if options.pepxml: peptides += read_pepxml(options.pepxml) if options.tabular: peptides += read_tabular(options.tabular,options.column) if options.fasta: peptides += read_fasta(options.fasta) if args and len(args) > 0: for i,peptide in enumerate(args): if not re.match(pep_pat,peptide): warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec) peptides.append(peptide) if len(peptides) < 1: warn_err("No peptides input!",exit_code=1) column_order = pept2lca_column_order if options.unipept == 'pept2prot': column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order else: if options.extra or options.names: column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order else: column_order = pept2lca_column_order ## map to tryptic peptides pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides} partToPeps = {} for peptide, parts in pepToParts.iteritems(): if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts) for part in parts: if len(part) > 50: warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None) if 5 <= len(part) <= 50: partToPeps.setdefault(part,[]).append(peptide) trypticPeptides = partToPeps.keys() ## unipept unipept_resp = [] idx = range(0,len(trypticPeptides),options.max_request) idx.append(len(trypticPeptides)) for i in range(len(idx)-1): post_data = [] if options.equate_il: post_data.append(("equate_il","true")) if options.names or options.json: post_data.append(("extra","true")) post_data.append(("names","true")) elif options.extra or options.json: post_data.append(("extra","true")) post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) unipept_resp += json.loads( urllib2.urlopen( req ).read() ) unmatched_peptides = [] peptideMatches = [] if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide ## multiple entries per trypticPeptide for pep2prot or pep2taxa mapping = {} for match in unipept_resp: mapping.setdefault(match['peptide'],[]).append(match) for peptide in peptides: # Get the intersection of matches to the tryptic parts keyToMatch = None for part in pepToParts[peptide]: if part in mapping: temp = {match[dupkey] : match for match in mapping[part]} if keyToMatch: dkeys = set(keyToMatch.keys()) - set(temp.keys()) for k in dkeys: del keyToMatch[k] else: keyToMatch = temp ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp if not keyToMatch: unmatched_peptides.append(peptide) else: for key,match in keyToMatch.iteritems(): match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) else: ## should be one response per trypticPeptide for pep2lca respMap = {v['peptide']:v for v in unipept_resp} ## map resp back to peptides for peptide in peptides: matches = list() for part in pepToParts[peptide]: if part in respMap: matches.append(respMap[part]) match = best_match(peptide,matches) if not match: unmatched_peptides.append(peptide) longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] match = {'peptide' : longest_tryptic_peptide} match['tryptic_peptide'] = match['peptide'] match['peptide'] = peptide peptideMatches.append(match) resp = peptideMatches if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp) ## output results if not (options.unmatched or options.json or options.tsv or options.csv): print >> sys.stdout, str(resp) if options.unmatched: with open(options.unmatched,'w') as outputFile: for peptide in peptides: if peptide in unmatched_peptides: outputFile.write("%s\n" % peptide) if options.json: if options.unipept == 'pept2prot': with open(options.json,'w') as outputFile: outputFile.write(str(resp)) else: found_keys = set() for i,pdict in enumerate(resp): found_keys |= set(pdict.keys()) taxa_cols = [] for col in pept2lca_extra_column_order[-1:0:-1]: if col+'_id' in found_keys: taxa_cols.append(col) id_to_node = dict() def get_node(id,name,rank,child,seq): if id not in id_to_node: data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } id_to_node[id] = node else: node = id_to_node[id] node['data']['count'] += 1 if seq is not None and seq not in node['data']['sequences']: node['data']['sequences'].append(seq) if child is None: node['data']['self_count'] += 1 elif child['id'] not in node['kids']: node['kids'].append(child['id']) node['children'].append(child) return node root = get_node(1,'root','no rank',None,None) for i,pdict in enumerate(resp): sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) seq = sequence child = None for col in taxa_cols: col_id = col+'_id' if col_id in pdict and pdict.get(col_id): col_name = col if col in found_keys else col+'_name' child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) seq = None if child: get_node(1,'root','no rank',child,None) with open(options.json,'w') as outputFile: outputFile.write(json.dumps(root)) if options.tsv or options.csv: # 'pept2lca','pept2taxa','pept2prot' found_keys = set() results = [] for i,pdict in enumerate(resp): results.append(pdict) found_keys |= set(pdict.keys()) # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys) column_names = [] column_keys = [] for col in column_order: if col in found_keys: column_names.append(col) column_keys.append(col) elif options.extra or options.names: col_id = col+'_id' col_name = col+'_name' if options.extra: if col_id in found_keys: column_names.append(col_id) column_keys.append(col_id) if options.names: if col_name in found_keys: column_names.append(col) column_keys.append(col_name) else: if col+'_name' in found_keys: column_names.append(col) column_keys.append(col+'_name') elif col+'_id' in found_keys: column_names.append(col) column_keys.append(col+'_id') # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) taxa = [] for i,pdict in enumerate(results): vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] if vals not in taxa: taxa.append(vals) if options.tsv: with open(options.tsv,'w') as outputFile: outputFile.write("#%s\n"% '\t'.join(column_names)) for vals in taxa: outputFile.write("%s\n"% '\t'.join(vals)) if options.csv: with open(options.csv,'w') as outputFile: outputFile.write("%s\n"% ','.join(column_names)) for vals in taxa: outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals])) if __name__ == "__main__" : __main__()