0
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 #
|
|
4 #------------------------------------------------------------------------------
|
|
5 # University of Minnesota
|
|
6 # Copyright 2015, Regents of the University of Minnesota
|
|
7 #------------------------------------------------------------------------------
|
|
8 # Author:
|
|
9 #
|
|
10 # James E Johnson
|
|
11 #
|
|
12 #------------------------------------------------------------------------------
|
|
13 """
|
|
14
|
|
15 import json
|
|
16 import logging
|
|
17 import optparse
|
|
18 from optparse import OptionParser
|
|
19 import os
|
|
20 import sys
|
|
21 import re
|
|
22 import urllib
|
|
23 import urllib2
|
|
24 try:
|
|
25 import xml.etree.cElementTree as ET
|
|
26 except ImportError:
|
|
27 import xml.etree.ElementTree as ET
|
|
28
|
|
def warn_err(msg, exit_code=1):
    """Report *msg* on stderr; if *exit_code* is truthy, abort the process with it.

    Passing ``exit_code=None`` (or 0) turns this into a non-fatal warning.
    """
    sys.stderr.write(msg)
    if not exit_code:
        return
    sys.exit(exit_code)
|
|
33
|
1
|
# Column layouts for the tabular/CSV outputs of each Unipept API endpoint.

# Minimal pept2lca / pept2taxa result columns.
pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name']

# Full taxonomic lineage ranks (most generic to most specific), led by the peptide.
pept2lca_extra_column_order = [
    'peptide',
    'superkingdom', 'kingdom', 'subkingdom',
    'superphylum', 'phylum', 'subphylum',
    'superclass', 'class', 'subclass', 'infraclass',
    'superorder', 'order', 'suborder', 'infraorder', 'parvorder',
    'superfamily', 'family', 'subfamily',
    'tribe', 'subtribe',
    'genus', 'subgenus',
    'species_group', 'species_subgroup', 'species', 'subspecies',
    'varietas', 'forma',
]

# Minimal columns followed by every lineage rank (peptide listed only once).
pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]

# pept2prot result columns, without and with the --extra cross-references.
pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id']
pept2prot_extra_column_order = pept2prot_column_order + [
    'taxon_name', 'ec_references', 'go_references',
    'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids',
]
|
|
39
|
|
def __main__():
    """Map peptide sequences to taxa/proteins via the Unipept REST API.

    Workflow: collect peptides from tabular/fasta/mzIdentML/pepXML inputs
    and/or positional args, split each peptide into tryptic fragments,
    query the selected Unipept endpoint (pept2lca, pept2taxa, pept2prot)
    in batches, map responses back to the original peptides, then write
    the requested json/tsv/csv/unmatched output files.

    NOTE(review): this is Python 2 code (print >> stream, dict.iteritems,
    urllib2/urllib) -- it will not run unmodified under Python 3.
    """
    version = '2.0'
    # Valid peptide: one or more upper-case amino-acid letters
    # (includes ambiguity codes B, X, Z; J, O, U are rejected).
    pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'

    def read_tabular(filepath, col):
        """Return peptide strings from column *col* (0-based) of a TAB-separated file.

        Blank lines and '#' comment lines are skipped.  An invalid peptide
        triggers warn_err with exit_code=invalid_ec: fatal under --strict,
        otherwise a warning -- note the invalid value is still appended.
        NOTE(review): the reported line number *i* is 0-based.
        """
        peptides = []
        with open(filepath) as fp:
            for i, line in enumerate(fp):
                if line.strip() == '' or line.startswith('#'):
                    continue
                fields = line.rstrip('\n').split('\t')
                peptide = fields[col]
                if not re.match(pep_pat, peptide):
                    warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide, i, col, filepath), exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def get_fasta_entries(fp):
        """Yield (header_line, sequence) pairs from an open fasta file handle.

        The header is yielded with its leading '>' intact; multi-line
        sequences are joined into a single string.
        """
        name, seq = None, []
        for line in fp:
            line = line.rstrip()
            if line.startswith(">"):
                if name: yield (name, ''.join(seq))
                name, seq = line, []
            else:
                seq.append(line)
        # Emit the final record (no trailing '>' to flush it).
        if name: yield (name, ''.join(seq))

    def read_fasta(filepath):
        """Return peptide sequences from a fasta file, validating each against pep_pat."""
        peptides = []
        with open(filepath) as fp:
            for id, peptide in get_fasta_entries(fp):
                if not re.match(pep_pat, peptide):
                    # Invalid peptides warn (or exit under --strict) but are still kept.
                    warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide, id, filepath), exit_code=invalid_ec)
                peptides.append(peptide)
        return peptides

    def read_mzid(fp):
        """Return the text of every *PeptideSequence element in an mzIdentML file.

        Uses iterparse 'end' events; the namespace-qualified tag is matched by
        substring search, so any namespace prefix is accepted.
        """
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('PeptideSequence', elem.tag):
                    peptides.append(elem.text)
        return peptides

    def read_pepxml(fp):
        """Return the 'peptide' attribute of every *search_hit element in a pepXML file."""
        peptides = []
        for event, elem in ET.iterparse(fp):
            if event == 'end':
                if re.search('search_hit', elem.tag):
                    peptides.append(elem.get('peptide'))
        return peptides

    def best_match(peptide, matches):
        """Pick the most taxon-specific match dict from *matches* (or None).

        Scans lineage ranks from most specific ('forma') to most generic and
        returns a copy of the first match assigned at that rank.  With
        --extra the rank columns are keyed '<rank>_id', otherwise '<rank>'.
        Returns a copy so callers can mutate the result safely.
        """
        if not matches:
            return None
        elif len(matches) == 1:
            return matches[0].copy()
        else:
            # find the most specific match (peptide is always the first column order field)
            for col in reversed(pept2lca_extra_column_order[1:]):
                col_id = col + "_id" if options.extra else col
                for match in matches:
                    if 'taxon_rank' in match and match['taxon_rank'] == col:
                        return match.copy()
                    if col_id in match and match[col_id]:
                        return match.copy()
        return None

    # Parse Command Line
    parser = optparse.OptionParser()
    # unipept API choice
    parser.add_option('-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca', 'pept2taxa', 'pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot')
    # input files
    parser.add_option('-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column')
    parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences')
    parser.add_option('-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences')
    parser.add_option('-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences')
    parser.add_option('-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences')
    # Unipept Flags
    parser.add_option('-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records')
    parser.add_option('-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor')
    parser.add_option('-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor')
    parser.add_option('-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request')
    # output fields
    parser.add_option('-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs')
    # Warn vs Error Flag
    parser.add_option('-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide')
    # output files
    parser.add_option('-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
    parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
    parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
    parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches')
    # debug
    parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging')
    parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit')
    (options, args) = parser.parse_args()
    if options.version:
        print >> sys.stdout, "%s" % version
        sys.exit(0)
    # invalid_ec controls warn_err for bad peptides: exit code 2 under
    # --strict, None (warn only) otherwise.  Read by the nested readers above.
    invalid_ec = 2 if options.strict else None
    peptides = []
    ## Get peptide sequences -- all input sources are additive.
    if options.mzid:
        peptides += read_mzid(options.mzid)
    if options.pepxml:
        peptides += read_pepxml(options.pepxml)
    if options.tabular:
        peptides += read_tabular(options.tabular, options.column)
    if options.fasta:
        peptides += read_fasta(options.fasta)
    # Positional args are treated as literal peptide sequences.
    if args and len(args) > 0:
        for i, peptide in enumerate(args):
            if not re.match(pep_pat, peptide):
                warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec)
            peptides.append(peptide)
    if len(peptides) < 1:
        warn_err("No peptides input!", exit_code=1)
    # Select the output column layout for the chosen endpoint/flags.
    column_order = pept2lca_column_order
    if options.unipept == 'pept2prot':
        column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
    else:
        if options.extra or options.names:
            column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
        else:
            column_order = pept2lca_column_order
    ## map to tryptic peptides
    # Split after K or R unless followed by P (trypsin cleavage rule),
    # giving peptide -> [tryptic fragments].
    pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
    partToPeps = {}
    for peptide, parts in pepToParts.iteritems():
        if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts)
        for part in parts:
            # Unipept only accepts fragments of length 5..50; longer ones
            # get a non-fatal warning, shorter ones are silently dropped.
            if len(part) > 50:
                warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide, len(part), part), exit_code=None)
            if 5 <= len(part) <= 50:
                partToPeps.setdefault(part, []).append(peptide)
    trypticPeptides = partToPeps.keys()
    ## unipept
    # Batch the fragments into POST requests of at most max_request entries.
    # NOTE(review): endpoint is plain http -- consider https.
    unipept_resp = []
    idx = range(0, len(trypticPeptides), options.max_request)
    idx.append(len(trypticPeptides))
    for i in range(len(idx) - 1):
        post_data = []
        if options.equate_il:
            post_data.append(("equate_il", "true"))
        # JSON tree output needs both extra and names regardless of flags.
        if options.names or options.json:
            post_data.append(("extra", "true"))
            post_data.append(("names", "true"))
        elif options.extra or options.json:
            post_data.append(("extra", "true"))
        post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i + 1]]]
        headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
        url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
        req = urllib2.Request(url, headers=headers, data=urllib.urlencode(post_data))
        unipept_resp += json.loads(urllib2.urlopen(req).read())
    unmatched_peptides = []
    peptideMatches = []
    if options.debug: print >> sys.stdout, "unipept response: %s\n" % str(unipept_resp)
    if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
        dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id'  ## should only keep one of these per input peptide
        ## multiple entries per trypticPeptide for pep2prot or pep2taxa
        mapping = {}
        for match in unipept_resp:
            mapping.setdefault(match['peptide'], []).append(match)
        for peptide in peptides:
            # Get the intersection of matches to the tryptic parts
            keyToMatch = None
            for part in pepToParts[peptide]:
                if part in mapping:
                    temp = {match[dupkey]: match for match in mapping[part]}
                    if keyToMatch:
                        # Intersect: drop keys not present for this fragment.
                        dkeys = set(keyToMatch.keys()) - set(temp.keys())
                        for k in dkeys:
                            del keyToMatch[k]
                    else:
                        keyToMatch = temp
                    ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
            if not keyToMatch:
                unmatched_peptides.append(peptide)
            else:
                # Rewrite each surviving match so 'peptide' is the original
                # input peptide and the fragment moves to 'tryptic_peptide'.
                for key, match in keyToMatch.iteritems():
                    match['tryptic_peptide'] = match['peptide']
                    match['peptide'] = peptide
                    peptideMatches.append(match)
    else:
        ## should be one response per trypticPeptide for pep2lca
        respMap = {v['peptide']: v for v in unipept_resp}
        ## map resp back to peptides
        for peptide in peptides:
            matches = list()
            for part in pepToParts[peptide]:
                if part in respMap:
                    matches.append(respMap[part])
            match = best_match(peptide, matches)
            if not match:
                # Record as unmatched but still emit a stub row keyed by the
                # longest tryptic fragment so the peptide appears in outputs.
                unmatched_peptides.append(peptide)
                longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
                match = {'peptide': longest_tryptic_peptide}
            match['tryptic_peptide'] = match['peptide']
            match['peptide'] = peptide
            peptideMatches.append(match)
    resp = peptideMatches
    if options.debug: print >> sys.stdout, "\nmapped response: %s\n" % str(resp)
    ## output results
    # With no output option chosen, dump the raw mapped response to stdout.
    if not (options.unmatched or options.json or options.tsv or options.csv):
        print >> sys.stdout, str(resp)
    if options.unmatched:
        with open(options.unmatched, 'w') as outputFile:
            for peptide in peptides:
                if peptide in unmatched_peptides:
                    outputFile.write("%s\n" % peptide)
    if options.json:
        if options.unipept == 'pept2prot':
            # NOTE(review): str(resp) is a Python repr, not strict JSON.
            with open(options.json, 'w') as outputFile:
                outputFile.write(str(resp))
        else:
            # Build a Unipept-treeview-style nested taxonomy tree.
            found_keys = set()
            for i, pdict in enumerate(resp):
                found_keys |= set(pdict.keys())
            # Ranks present in the response, most specific first.
            taxa_cols = []
            for col in pept2lca_extra_column_order[-1:0:-1]:
                if col + '_id' in found_keys:
                    taxa_cols.append(col)
            id_to_node = dict()

            def get_node(id, name, rank, child, seq):
                """Fetch-or-create the tree node for taxon *id* and update its counts.

                Increments 'count' on every visit, records *seq* once per node,
                bumps 'self_count' when this node is the leaf (child is None),
                and links *child* under the node exactly once.
                """
                if id not in id_to_node:
                    data = {'count': 0, 'self_count': 0, 'valid_taxon': 1, 'rank': rank, 'sequences': []}
                    node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data}
                    id_to_node[id] = node
                else:
                    node = id_to_node[id]
                node['data']['count'] += 1
                if seq is not None and seq not in node['data']['sequences']:
                    node['data']['sequences'].append(seq)
                if child is None:
                    node['data']['self_count'] += 1
                elif child['id'] not in node['kids']:
                    node['kids'].append(child['id'])
                    node['children'].append(child)
                return node
            root = get_node(1, 'root', 'no rank', None, None)
            for i, pdict in enumerate(resp):
                sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None))
                seq = sequence
                child = None
                # Walk most-specific -> most-generic, chaining each node under
                # its parent; the sequence is attached only to the leaf node.
                for col in taxa_cols:
                    col_id = col + '_id'
                    if col_id in pdict and pdict.get(col_id):
                        col_name = col if col in found_keys else col + '_name'
                        child = get_node(pdict.get(col_id, None), pdict.get(col_name, ''), col, child, seq)
                        seq = None
                if child:
                    get_node(1, 'root', 'no rank', child, None)
            with open(options.json, 'w') as outputFile:
                outputFile.write(json.dumps(root))
    if options.tsv or options.csv:
        # 'pept2lca','pept2taxa','pept2prot'
        found_keys = set()
        results = []
        for i, pdict in enumerate(resp):
            results.append(pdict)
            found_keys |= set(pdict.keys())
            # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
        # Resolve each desired column to the key actually present in the
        # response ('<col>', '<col>_id', or '<col>_name' depending on flags).
        column_names = []
        column_keys = []
        for col in column_order:
            if col in found_keys:
                column_names.append(col)
                column_keys.append(col)
            elif options.extra or options.names:
                col_id = col + '_id'
                col_name = col + '_name'
                if options.extra:
                    if col_id in found_keys:
                        column_names.append(col_id)
                        column_keys.append(col_id)
                if options.names:
                    if col_name in found_keys:
                        column_names.append(col)
                        column_keys.append(col_name)
            else:
                if col + '_name' in found_keys:
                    column_names.append(col)
                    column_keys.append(col + '_name')
                elif col + '_id' in found_keys:
                    column_names.append(col)
                    column_keys.append(col + '_id')
        # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
        # Collect unique rows (duplicates across peptides are collapsed).
        taxa = []
        for i, pdict in enumerate(results):
            vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
            if vals not in taxa:
                taxa.append(vals)
        if options.tsv:
            with open(options.tsv, 'w') as outputFile:
                outputFile.write("#%s\n" % '\t'.join(column_names))
                for vals in taxa:
                    outputFile.write("%s\n" % '\t'.join(vals))
        if options.csv:
            with open(options.csv, 'w') as outputFile:
                outputFile.write("%s\n" % ','.join(column_names))
                for vals in taxa:
                    # NOTE(review): values are quoted but embedded '"' are not
                    # escaped -- csv.writer would handle that.
                    outputFile.write("%s\n" % ','.join(['"%s"' % (v if v else '') for v in vals]))

if __name__ == "__main__": __main__()
|