Mercurial > repos > galaxyp > unipept
diff unipept.py @ 1:0c1ee95282fa draft
Uploaded
author | galaxyp |
---|---|
date | Tue, 14 Apr 2015 16:44:22 -0400 |
parents | 6430407e5869 |
children | 34758ab8aaa4 |
line wrap: on
line diff
--- a/unipept.py Fri Apr 03 14:55:49 2015 -0400 +++ b/unipept.py Tue Apr 14 16:44:22 2015 -0400 @@ -31,39 +31,86 @@ if exit_code: sys.exit(exit_code) -def read_fasta(fp): +pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] +pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] +pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] +pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] +pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] + +def __main__(): + version = '1.1' + pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' + + def read_tabular(filepath,col): + peptides = [] + with open(filepath) as fp: + for i,line in enumerate(fp): + if line.strip() == '' or line.startswith('#'): + continue + fields = line.rstrip('\n').split('\t') + peptide = fields[col] + if not re.match(pep_pat,peptide): + warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec) + peptides.append(peptide) + return peptides + + def get_fasta_entries(fp): name, seq = None, [] for line in fp: - line = line.rstrip() - if line.startswith(">"): - if name: yield (name, ''.join(seq)) - name, seq = line, [] - else: - seq.append(line) + line = line.rstrip() + if line.startswith(">"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) if name: yield (name, ''.join(seq)) -def read_mzid(fp): - peptides = [] - for event, elem in ET.iterparse(fp): - if event == 'end': - if re.search('PeptideSequence',elem.tag): - peptides.append(elem.text) - return peptides + def read_fasta(filepath): + peptides = [] + with open(filepath) as fp: + for id, peptide in get_fasta_entries(fp): + if not re.match(pep_pat,peptide): + warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec) + peptides.append(peptide) + return peptides + + def read_mzid(fp): + peptides = [] + for event, elem in ET.iterparse(fp): + if event == 'end': + if re.search('PeptideSequence',elem.tag): + peptides.append(elem.text) + return peptides -def read_pepxml(fp): - peptides = [] - for event, elem in ET.iterparse(fp): - if event == 'end': - if re.search('search_hit',elem.tag): - peptides.append(elem.get('peptide')) - return peptides + def read_pepxml(fp): + peptides = [] + for event, elem in ET.iterparse(fp): + if event == 'end': + if re.search('search_hit',elem.tag): + peptides.append(elem.get('peptide')) + return peptides -def __main__(): + def best_match(peptide,matches): + if not matches: + return None + elif len(matches) == 1: + return matches[0].copy() + else: + # find the most specific match (peptide is always the first column order field) + for col in reversed(pept2lca_extra_column_order[1:]): + col_id = col+"_id" if options.extra else col + for match in matches: + if 'taxon_rank' in match and match['taxon_rank'] == col: + return match.copy() + if col_id in match and match[col_id]: + return match.copy() + return None + #Parse Command Line parser = optparse.OptionParser() - # unipept API - parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) - # files + # unipept API choice + parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) + # input files parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) @@ -73,38 +120,33 @@ parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) + # output fields + parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) # Warn vs Error Flag parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) - # outputs + # output files parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') - parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptide with no matches' ) + parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) + # debug + parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) + parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) (options, args) = parser.parse_args() + if options.version: + print >> sys.stdout,"%s" % version + sys.exit(0) invalid_ec = 2 if options.strict else None peptides = [] - pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' ## Get peptide sequences if options.mzid: peptides += read_mzid(options.mzid) if options.pepxml: peptides += read_pepxml(options.pepxml) if options.tabular: - with open(options.tabular) as fp: - for i,line in enumerate(fp): - if line.strip() == '' or line.startswith('#'): - continue - fields = line.rstrip('\n').split('\t') - peptide = fields[options.column] - if not re.match(pep_pat,peptide): - warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec) - peptides.append(peptide) + peptides += read_tabular(options.tabular,options.column) if options.fasta: - with open(options.fasta) as fp: - for id, peptide in read_fasta(fp): - if not re.match(pep_pat,peptide): - warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec) - peptides.append(peptide) + peptides += read_fasta(options.fasta) if args and len(args) > 0: for i,peptide in enumerate(args): if not re.match(pep_pat,peptide): @@ -112,6 +154,25 @@ peptides.append(peptide) if len(peptides) < 1: warn_err("No peptides input!",exit_code=1) + column_order = pept2lca_column_order + if options.unipept == 'pept2prot': + column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order + else: + if options.extra or options.names: + column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order + else: + column_order = pept2lca_column_order + ## map to tryptic peptides + pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides} + partToPeps = {} + for peptide, parts in pepToParts.iteritems(): + if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts) + for part in parts: + if len(part) > 50: + warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None) + if 5 <= len(part) <= 50: + partToPeps.setdefault(part,[]).append(peptide) + trypticPeptides = partToPeps.keys() ## unipept post_data = [] if options.equate_il: @@ -121,30 +182,72 @@ post_data.append(("names","true")) elif options.extra: post_data.append(("extra","true")) - post_data += [('input[]', x) for x in peptides] + post_data += [('input[]', x) for x in trypticPeptides] headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) - resp = json.loads( urllib2.urlopen( req ).read() ) + unipept_resp = json.loads( urllib2.urlopen( req ).read() ) + unmatched_peptides = [] + peptideMatches = [] + if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) + if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': + dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide + ## multiple entries per trypticPeptide for pep2prot or pep2taxa + mapping = {} + for match in unipept_resp: + mapping.setdefault(match['peptide'],[]).append(match) + for peptide in peptides: + # Get the intersection of matches to the tryptic parts + keyToMatch = None + for part in pepToParts[peptide]: + if part in mapping: + temp = {match[dupkey] : match for match in mapping[part]} + if keyToMatch: + dkeys = set(keyToMatch.keys()) - set(temp.keys()) + for k in dkeys: + del keyToMatch[k] + else: + keyToMatch = temp + ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp + if not keyToMatch: + unmatched_peptides.append(peptide) + else: + for key,match in keyToMatch.iteritems(): + match['tryptic_peptide'] = match['peptide'] + match['peptide'] = peptide + peptideMatches.append(match) + else: + ## should be one response per trypticPeptide for pep2lca + respMap = {v['peptide']:v for v in unipept_resp} + ## map resp back to peptides + for peptide in peptides: + matches = list() + for part in pepToParts[peptide]: + if part in respMap: + matches.append(respMap[part]) + match = best_match(peptide,matches) + if not match: + unmatched_peptides.append(peptide) + longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] + match = {'peptide' : longest_tryptic_peptide} + match['tryptic_peptide'] = match['peptide'] + match['peptide'] = peptide + peptideMatches.append(match) + resp = peptideMatches + if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp) ## output results - if not (options.mismatch or options.json or options.tsv or options.csv): + if not (options.unmatched or options.json or options.tsv or options.csv): print >> sys.stdout, str(resp) - if options.mismatch: - peptides_matched = [] - for i,pdict in enumerate(resp): - peptides_matched.append(pdict['peptide']) - with open(options.mismatch,'w') as outputFile: + if options.unmatched: + with open(options.unmatched,'w') as outputFile: for peptide in peptides: - if not peptide in peptides_matched: + if peptide in unmatched_peptides: outputFile.write("%s\n" % peptide) if options.json: with open(options.json,'w') as outputFile: outputFile.write(str(resp)) if options.tsv or options.csv: # 'pept2lca','pept2taxa','pept2prot' - pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] - pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] - column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order found_keys = set() results = [] for i,pdict in enumerate(resp): @@ -179,7 +282,8 @@ taxa = [] for i,pdict in enumerate(results): vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] - taxa.append(vals) + if vals not in taxa: + taxa.append(vals) if options.tsv: with open(options.tsv,'w') as outputFile: outputFile.write("#%s\n"% '\t'.join(column_names))