diff unipept.py @ 1:0c1ee95282fa draft

Uploaded
author galaxyp
date Tue, 14 Apr 2015 16:44:22 -0400
parents 6430407e5869
children 34758ab8aaa4
line wrap: on
line diff
--- a/unipept.py	Fri Apr 03 14:55:49 2015 -0400
+++ b/unipept.py	Tue Apr 14 16:44:22 2015 -0400
@@ -31,39 +31,86 @@
     if exit_code:
       sys.exit(exit_code)
 
-def read_fasta(fp):
+pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
+pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
+pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
+pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
+pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
+
+def __main__():
+  version = '1.1'
+  pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
+
+  def read_tabular(filepath,col):
+    peptides = []
+    with open(filepath) as fp:
+      for i,line in enumerate(fp):
+        if line.strip() == '' or line.startswith('#'):
+          continue
+        fields = line.rstrip('\n').split('\t')
+        peptide = fields[col]
+        if not re.match(pep_pat,peptide):
+          warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec)
+        peptides.append(peptide)
+    return peptides
+
+  def get_fasta_entries(fp):
     name, seq = None, []
     for line in fp:
-        line = line.rstrip()
-        if line.startswith(">"):
-            if name: yield (name, ''.join(seq))
-            name, seq = line, []
-        else:
-            seq.append(line)
+      line = line.rstrip()
+      if line.startswith(">"):
+        if name: yield (name, ''.join(seq))
+        name, seq = line, []
+      else:
+        seq.append(line)
     if name: yield (name, ''.join(seq))
 
-def read_mzid(fp):
-  peptides = []
-  for event, elem in ET.iterparse(fp):
-    if event == 'end':
-      if re.search('PeptideSequence',elem.tag):
-        peptides.append(elem.text)
-  return peptides
+  def read_fasta(filepath):
+    peptides = []
+    with open(filepath) as fp:
+      for id, peptide in get_fasta_entries(fp):
+        if not re.match(pep_pat,peptide):
+          warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
+        peptides.append(peptide)
+    return peptides
+
+  def read_mzid(fp):
+    peptides = []
+    for event, elem in ET.iterparse(fp):
+      if event == 'end':
+        if re.search('PeptideSequence',elem.tag):
+          peptides.append(elem.text)
+    return peptides
 
-def read_pepxml(fp):
-  peptides = []
-  for event, elem in ET.iterparse(fp):
-    if event == 'end':
-      if re.search('search_hit',elem.tag):
-        peptides.append(elem.get('peptide'))
-  return peptides
+  def read_pepxml(fp):
+    peptides = []
+    for event, elem in ET.iterparse(fp):
+      if event == 'end':
+        if re.search('search_hit',elem.tag):
+          peptides.append(elem.get('peptide'))
+    return peptides
 
-def __main__():
+  def best_match(peptide,matches):
+    if not matches:
+      return None
+    elif len(matches) == 1:
+      return matches[0].copy()
+    else:
+      # find the most specific match (peptide is always the first column order field)
+      for col in reversed(pept2lca_extra_column_order[1:]):
+        col_id = col+"_id" if options.extra else col
+        for match in matches:
+          if 'taxon_rank' in match and match['taxon_rank'] == col:
+            return match.copy()
+          if col_id in match and match[col_id]:
+            return match.copy()
+    return None
+
   #Parse Command Line
   parser = optparse.OptionParser()
-  # unipept API
-  parser.add_option( '-A', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
-  # files
+  # unipept API choice
+  parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' )
+  # input files
   parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
   parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
   parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
@@ -73,38 +120,33 @@
   parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
   parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
   parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
+  # output fields
+  parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
   # Warn vs Error Flag
   parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
-  # outputs
+  # output files
   parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
   parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
   parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
-  parser.add_option( '-M', '--mismatch', dest='mismatch', default=None, help='Output file path for peptide with no matches' )
+  parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
+  # debug
+  parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
+  parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
   (options, args) = parser.parse_args()
+  if options.version:
+    print >> sys.stdout,"%s" % version
+    sys.exit(0)
   invalid_ec = 2 if options.strict else None
   peptides = []
-  pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
   ## Get peptide sequences
   if options.mzid:
     peptides += read_mzid(options.mzid)
   if options.pepxml:
     peptides += read_pepxml(options.pepxml)
   if options.tabular:
-    with open(options.tabular) as fp:
-      for i,line in enumerate(fp):
-        if line.strip() == '' or line.startswith('#'):
-          continue
-        fields = line.rstrip('\n').split('\t')
-        peptide = fields[options.column]
-        if not re.match(pep_pat,peptide):
-          warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,options.column,options.tabular),exit_code=invalid_ec)
-        peptides.append(peptide) 
+    peptides += read_tabular(options.tabular,options.column) 
   if options.fasta:
-    with open(options.fasta) as fp:
-      for id, peptide in read_fasta(fp):
-        if not re.match(pep_pat,peptide):
-          warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,options.fasta),exit_code=invalid_ec)
-        peptides.append(peptide) 
+    peptides += read_fasta(options.fasta) 
   if args and len(args) > 0:
     for i,peptide in enumerate(args):
       if not re.match(pep_pat,peptide):
@@ -112,6 +154,25 @@
       peptides.append(peptide) 
   if len(peptides) < 1:
     warn_err("No peptides input!",exit_code=1)
+  column_order = pept2lca_column_order
+  if options.unipept == 'pept2prot':
+    column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
+  else:
+    if options.extra or options.names:
+      column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
+    else:
+      column_order = pept2lca_column_order
+  ## map to tryptic peptides
+  pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides}
+  partToPeps = {}
+  for peptide, parts in pepToParts.iteritems():
+    if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
+    for part in parts:
+      if len(part) > 50:
+        warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
+      if 5 <= len(part) <= 50:
+        partToPeps.setdefault(part,[]).append(peptide)
+  trypticPeptides = partToPeps.keys()
   ## unipept
   post_data = []
   if options.equate_il:
@@ -121,30 +182,72 @@
     post_data.append(("names","true"))
   elif options.extra:
     post_data.append(("extra","true"))
-  post_data += [('input[]', x) for x in peptides]
+  post_data += [('input[]', x) for x in trypticPeptides]
   headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
   url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
   req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
-  resp = json.loads( urllib2.urlopen( req ).read() )
+  unipept_resp = json.loads( urllib2.urlopen( req ).read() )
+  unmatched_peptides = []
+  peptideMatches = []
+  if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
+  if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
+    dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
+    ## multiple entries per trypticPeptide for pep2prot or pep2taxa
+    mapping = {}
+    for match in unipept_resp:
+      mapping.setdefault(match['peptide'],[]).append(match)
+    for peptide in peptides:
+      # Get the intersection of matches to the tryptic parts
+      keyToMatch = None
+      for part in pepToParts[peptide]:
+        if part in mapping:
+          temp = {match[dupkey] : match  for match in mapping[part]}
+          if keyToMatch:
+            dkeys = set(keyToMatch.keys()) - set(temp.keys())
+            for k in dkeys:
+              del keyToMatch[k]
+          else:
+            keyToMatch = temp
+          ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
+      if not keyToMatch:
+        unmatched_peptides.append(peptide)
+      else:
+        for key,match in keyToMatch.iteritems():
+          match['tryptic_peptide'] = match['peptide']
+          match['peptide'] = peptide
+          peptideMatches.append(match)
+  else:
+    ## should be one response per trypticPeptide for pep2lca
+    respMap = {v['peptide']:v for v in unipept_resp}
+    ## map resp back to peptides
+    for peptide in peptides:
+      matches = list()
+      for part in pepToParts[peptide]:
+        if part in respMap:
+          matches.append(respMap[part])
+      match = best_match(peptide,matches)
+      if not match:
+        unmatched_peptides.append(peptide)
+        longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
+        match = {'peptide' : longest_tryptic_peptide}
+      match['tryptic_peptide'] = match['peptide']
+      match['peptide'] = peptide
+      peptideMatches.append(match)
+  resp = peptideMatches
+  if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
   ## output results
-  if not (options.mismatch or options.json or options.tsv or options.csv):
+  if not (options.unmatched or options.json or options.tsv or options.csv):
     print >> sys.stdout, str(resp)
-  if options.mismatch:
-    peptides_matched = []
-    for i,pdict in enumerate(resp):
-      peptides_matched.append(pdict['peptide'])
-    with open(options.mismatch,'w') as outputFile:
+  if options.unmatched:
+    with open(options.unmatched,'w') as outputFile:
       for peptide in peptides:
-        if not peptide in peptides_matched:
+        if peptide in unmatched_peptides:
           outputFile.write("%s\n" % peptide)
   if options.json:
     with open(options.json,'w') as outputFile:
       outputFile.write(str(resp))  
   if options.tsv or options.csv:
     # 'pept2lca','pept2taxa','pept2prot'
-    pept2lca_column_order = [ 'peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class_','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
-    pept2prot_column_order = [ 'peptide','uniprot_id','taxon_id','taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
-    column_order = pept2prot_column_order if options.unipept == 'pept2prot' else pept2lca_column_order
     found_keys = set()
     results = []
     for i,pdict in enumerate(resp):
@@ -179,7 +282,8 @@
     taxa = []
     for i,pdict in enumerate(results):
       vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
-      taxa.append(vals)
+      if vals not in taxa:
+        taxa.append(vals)
     if options.tsv:
       with open(options.tsv,'w') as outputFile:
         outputFile.write("#%s\n"% '\t'.join(column_names))