unipept: unipept.py comparison

comparison unipept.py @ 1:0c1ee95282fa draft

Uploaded

author	galaxyp
date	Tue, 14 Apr 2015 16:44:22 -0400
parents	6430407e5869
children	34758ab8aaa4

comparison

equal deleted inserted replaced

-:6430407e5869
+:0c1ee95282fa
 def warn_err(msg,exit_code=1):
 sys.stderr.write(msg)
 if exit_code:
 sys.exit(exit_code)
-def read_fasta(fp):
+pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
+pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
+pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
+pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
+pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
+def __main__():
+version = '1.1'
+pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
+def read_tabular(filepath,col):
+peptides = []
+with open(filepath) as fp:
+for i,line in enumerate(fp):
+if line.strip() == '' or line.startswith('#'):
+continue
+fields = line.rstrip('\n').split('\t')
+peptide = fields[col]
+if not re.match(pep_pat,peptide):
+warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
+peptides.append(peptide)
+return peptides
+def get_fasta_entries(fp):
 name, seq = None, []
 for line in fp:
 line = line.rstrip()
 if line.startswith(">"):
 if name: yield (name, ''.join(seq))
 name, seq = line, []
 else:
 seq.append(line)
 if name: yield (name, ''.join(seq))
-def read_mzid(fp):
+def read_fasta(filepath):
 peptides = []
-for event, elem in ET.iterparse(fp):
+with open(filepath) as fp:
-if event == 'end':
+for id, peptide in get_fasta_entries(fp):
-if re.search('PeptideSequence',elem.tag):
+if not re.match(pep_pat,peptide):
-peptides.append(elem.text)
+warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
-return peptides
+peptides.append(peptide)
+return peptides
-def read_pepxml(fp):
-peptides = []
+def read_mzid(fp):
-for event, elem in ET.iterparse(fp):
+peptides = []
-if event == 'end':
+for event, elem in ET.iterparse(fp):
-if re.search('search_hit',elem.tag):
+if event == 'end':
-peptides.append(elem.get('peptide'))
+if re.search('PeptideSequence',elem.tag):
-return peptides
+peptides.append(elem.text)
+return peptides
-def __main__():
+def read_pepxml(fp):
+peptides = []
+for event, elem in ET.iterparse(fp):
+if event == 'end':
+if re.search('search_hit',elem.tag):
+peptides.append(elem.get('peptide'))
+return peptides
+def best_match(peptide,matches):
+if not matches:
+return None
+elif len(matches) == 1:
+return matches[0].copy()
+else:
+# find the most specific match (peptide is always the first column order field)
+for col in reversed(pept2lca_extra_column_order[1:]):
+col_id = col+"_id" if options.extra else col
+for match in matches:
+if 'taxon_rank' in match and match['taxon_rank'] == col:
+return match.copy()
+if col_id in match and match[col_id]:
+return match.copy()
+return None
 #Parse Command Line
 parser = optparse.OptionParser()
-# unipept API
+# unipept API choice
-parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
+parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
-# files
+# input files
 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
 # Unipept Flags
 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
+# output fields
+parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
 # Warn vs Error Flag
 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
-# outputs
+# output files
 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
-parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptide with no matches' )
+parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
+# debug
+parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
+parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
 (options, args) = parser.parse_args()
+if options.version:
+print >> sys.stdout,"%s" % version
+sys.exit(0)
 invalid_ec = 2 if options.strict else None
 peptides = []
-pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
 ## Get peptide sequences
 if options.mzid:
 peptides += read_mzid(options.mzid)
 if options.pepxml:
 peptides += read_pepxml(options.pepxml)
 if options.tabular:
-with open(options.tabular) as fp:
+peptides += read_tabular(options.tabular,options.column)
-for i,line in enumerate(fp):
-if line.strip() == '' or line.startswith('#'):
-continue
-fields = line.rstrip('\n').split('\t')
-peptide = fields[options.column]
-if not re.match(pep_pat,peptide):
-warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec)
-peptides.append(peptide)
 if options.fasta:
-with open(options.fasta) as fp:
+peptides += read_fasta(options.fasta)
-for id, peptide in read_fasta(fp):
-if not re.match(pep_pat,peptide):
-warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec)
-peptides.append(peptide)
 if args and len(args) > 0:
 for i,peptide in enumerate(args):
 if not re.match(pep_pat,peptide):
 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
 peptides.append(peptide)
 if len(peptides) < 1:
 warn_err("No peptides input!",exit_code=1)
+column_order = pept2lca_column_order
+if options.unipept == 'pept2prot':
+column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
+else:
+if options.extra or options.names:
+column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
+else:
+column_order = pept2lca_column_order
+## map to tryptic peptides
+pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
+partToPeps = {}
+for peptide, parts in pepToParts.iteritems():
+if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
+for part in parts:
+if len(part) > 50:
+warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
+if 5 <= len(part) <= 50:
+partToPeps.setdefault(part,[]).append(peptide)
+trypticPeptides = partToPeps.keys()
 ## unipept
 post_data = []
 if options.equate_il:
 post_data.append(("equate_il","true"))
 if options.names:
 post_data.append(("extra","true"))
 post_data.append(("names","true"))
 elif options.extra:
 post_data.append(("extra","true"))
-post_data += [('input[]', x) for x in peptides]
+post_data += [('input[]', x) for x in trypticPeptides]
 headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
-resp = json.loads( urllib2.urlopen( req ).read() )
+unipept_resp = json.loads( urllib2.urlopen( req ).read() )
+unmatched_peptides = []
+peptideMatches = []
+if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
+if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
+dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
+## multiple entries per trypticPeptide for pep2prot or pep2taxa
+mapping = {}
+for match in unipept_resp:
+mapping.setdefault(match['peptide'],[]).append(match)
+for peptide in peptides:
+# Get the intersection of matches to the tryptic parts
+keyToMatch = None
+for part in pepToParts[peptide]:
+if part in mapping:
+temp = {match[dupkey] : match  for match in mapping[part]}
+if keyToMatch:
+dkeys = set(keyToMatch.keys()) - set(temp.keys())
+for k in dkeys:
+del keyToMatch[k]
+else:
+keyToMatch = temp
+## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
+if not keyToMatch:
+unmatched_peptides.append(peptide)
+else:
+for key,match in keyToMatch.iteritems():
+match['tryptic_peptide'] = match['peptide']
+match['peptide'] = peptide
+peptideMatches.append(match)
+else:
+## should be one response per trypticPeptide for pep2lca
+respMap = {v['peptide']:v for v in unipept_resp}
+## map resp back to peptides
+for peptide in peptides:
+matches = list()
+for part in pepToParts[peptide]:
+if part in respMap:
+matches.append(respMap[part])
+match = best_match(peptide,matches)
+if not match:
+unmatched_peptides.append(peptide)
+longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
+match = {'peptide' : longest_tryptic_peptide}
+match['tryptic_peptide'] = match['peptide']
+match['peptide'] = peptide
+peptideMatches.append(match)
+resp = peptideMatches
+if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
 ## output results
-if not (options.mismatch or options.json or options.tsv or options.csv):
+if not (options.unmatched or options.json or options.tsv or options.csv):
 print >> sys.stdout, str(resp)
-if options.mismatch:
+if options.unmatched:
-peptides_matched = []
+with open(options.unmatched,'w') as outputFile:
-for i,pdict in enumerate(resp):
-peptides_matched.append(pdict['peptide'])
-with open(options.mismatch,'w') as outputFile:
 for peptide in peptides:
-if not peptide in peptides_matched:
+if peptide in unmatched_peptides:
 outputFile.write("%s\n" % peptide)
 if options.json:
 with open(options.json,'w') as outputFile:
 outputFile.write(str(resp))
 if options.tsv or options.csv:
 # 'pept2lca','pept2taxa','pept2prot'
-pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
-pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
-column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order
 found_keys = set()
 results = []
 for i,pdict in enumerate(resp):
 results.append(pdict)
 found_keys |= set(pdict.keys())
 column_keys.append(col+'_id')
 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
 taxa = []
 for i,pdict in enumerate(results):
 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
-taxa.append(vals)
+if vals not in taxa:
+taxa.append(vals)
 if options.tsv:
 with open(options.tsv,'w') as outputFile:
 outputFile.write("#%s\n"% '\t'.join(column_names))
 for vals in taxa:
 outputFile.write("%s\n"% '\t'.join(vals))

Mercurial > repos > galaxyp > unipept

comparison unipept.py @ 1:0c1ee95282fa draft