comparison unipept.py @ 5:917fd3ebc223 draft

"planemo upload for repository http://unipept.ugent.be/apidocs commit dd464f03c32f657fc555081117da18ba4c091af6-dirty"
author galaxyp
date Thu, 30 Apr 2020 07:39:28 -0400
parents 4953dcd7dd39
children 9aaa46d45472
comparison
4:4953dcd7dd39 5:917fd3ebc223
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """ 2 """
3 # 3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2015, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author: 4 # Author:
9 # 5 #
10 # James E Johnson 6 # James E Johnson
11 # 7 #
12 #------------------------------------------------------------------------------ 8 #------------------------------------------------------------------------------
13 """ 9 """
14
15 import json 10 import json
16 import logging
17 import optparse 11 import optparse
18 from optparse import OptionParser
19 import os
20 import sys 12 import sys
21 import re 13 import re
22 import urllib 14 import urllib.error
23 import urllib2 15 import urllib.parse
24 16 import urllib.request
25 """ 17
26 pept2taxa json
27 pept2lca json
28 pept2prot
29 pept2ec ecjson ec
30 pept2go go
31 pept2funct go ec
32 peptinfo json ecjson ec go
33
34 """
35 18
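The removed docstring above lists which Unipept APIs the tool can call and which extra output options (json, ecjson, ec, go) each one supports. For orientation, the sketch below shows roughly what one batched call looks like on the wire; the endpoint name, parameter names and headers are taken from the request-building code further down in this file, while the peptide values are invented examples.

# Illustrative sketch only (not part of the tool): one pept2lca batch request
# built the same way the script builds it further below.
import json
import urllib.parse
import urllib.request

url = 'http://api.unipept.ugent.be/api/v1/pept2lca.json'
post_data = [('equate_il', 'true'), ('extra', 'true')]
post_data += [('input[]', p) for p in ['AEFVEVTK', 'LVTDLTK']]  # invented peptides
req = urllib.request.Request(
    url,
    headers={'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'},
    data=urllib.parse.urlencode(post_data).encode(),
    method='POST')
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))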
36 try: 19 try:
37 import xml.etree.cElementTree as ET 20 import xml.etree.cElementTree as ET
38 except ImportError: 21 except ImportError:
39 import xml.etree.ElementTree as ET 22 import xml.etree.ElementTree as ET
40 23
41 def warn_err(msg,exit_code=1): 24
25 def warn_err(msg, exit_code=1):
42 sys.stderr.write(msg) 26 sys.stderr.write(msg)
43 if exit_code: 27 if exit_code:
44 sys.exit(exit_code) 28 sys.exit(exit_code)
29
45 30
46 go_types = ['biological process', 'molecular function', 'cellular component'] 31 go_types = ['biological process', 'molecular function', 'cellular component']
32 ipr_types = ['Domain', 'Family', 'Homologous_superfamily', 'Repeat', 'Conserved_site', 'Active_site', 'Binding_site', 'PTM']
47 ec_name_dict = { 33 ec_name_dict = {
48 '1' : 'Oxidoreductase', 34 '1': 'Oxidoreductase',
49 '1.1' : 'act on the CH-OH group of donors', 35 '1.1': 'act on the CH-OH group of donors',
50 '1.2' : 'act on the aldehyde or oxo group of donors', 36 '1.2': 'act on the aldehyde or oxo group of donors',
51 '1.3' : 'act on the CH-CH group of donors', 37 '1.3': 'act on the CH-CH group of donors',
52 '1.4' : 'act on the CH-NH2 group of donors', 38 '1.4': 'act on the CH-NH2 group of donors',
53 '1.5' : 'act on CH-NH group of donors', 39 '1.5': 'act on CH-NH group of donors',
54 '1.6' : 'act on NADH or NADPH', 40 '1.6': 'act on NADH or NADPH',
55 '1.7' : 'act on other nitrogenous compounds as donors', 41 '1.7': 'act on other nitrogenous compounds as donors',
56 '1.8' : 'act on a sulfur group of donors', 42 '1.8': 'act on a sulfur group of donors',
57 '1.9' : 'act on a heme group of donors', 43 '1.9': 'act on a heme group of donors',
58 '1.10' : 'act on diphenols and related substances as donors', 44 '1.10': 'act on diphenols and related substances as donors',
59 '1.11' : 'act on peroxide as an acceptor -- peroxidases', 45 '1.11': 'act on peroxide as an acceptor -- peroxidases',
60 '1.12' : 'act on hydrogen as a donor', 46 '1.12': 'act on hydrogen as a donor',
61 '1.13' : 'act on single donors with incorporation of molecular oxygen', 47 '1.13': 'act on single donors with incorporation of molecular oxygen',
62 '1.14' : 'act on paired donors with incorporation of molecular oxygen', 48 '1.14': 'act on paired donors with incorporation of molecular oxygen',
63 '1.15' : 'act on superoxide radicals as acceptors', 49 '1.15': 'act on superoxide radicals as acceptors',
64 '1.16' : 'oxidize metal ions', 50 '1.16': 'oxidize metal ions',
65 '1.17' : 'act on CH or CH2 groups', 51 '1.17': 'act on CH or CH2 groups',
66 '1.18' : 'act on iron-sulfur proteins as donors', 52 '1.18': 'act on iron-sulfur proteins as donors',
67 '1.19' : 'act on reduced flavodoxin as donor', 53 '1.19': 'act on reduced flavodoxin as donor',
68 '1.20' : 'act on phosphorus or arsenic as donors', 54 '1.20': 'act on phosphorus or arsenic as donors',
69 '1.21' : 'act on X-H and Y-H to form an X-Y bond', 55 '1.21': 'act on X-H and Y-H to form an X-Y bond',
70 '1.97' : 'other oxidoreductases', 56 '1.97': 'other oxidoreductases',
71 '2' : 'Transferase', 57 '2': 'Transferase',
72 '2.1' : 'transfer one-carbon groups, Methylase', 58 '2.1': 'transfer one-carbon groups, Methylase',
73 '2.2' : 'transfer aldehyde or ketone groups', 59 '2.2': 'transfer aldehyde or ketone groups',
74 '2.3' : 'acyltransferases', 60 '2.3': 'acyltransferases',
75 '2.4' : 'glycosyltransferases', 61 '2.4': 'glycosyltransferases',
76 '2.5' : 'transfer alkyl or aryl groups, other than methyl groups', 62 '2.5': 'transfer alkyl or aryl groups, other than methyl groups',
77 '2.6' : 'transfer nitrogenous groups', 63 '2.6': 'transfer nitrogenous groups',
78 '2.7' : 'transfer phosphorus-containing groups', 64 '2.7': 'transfer phosphorus-containing groups',
79 '2.8' : 'transfer sulfur-containing groups', 65 '2.8': 'transfer sulfur-containing groups',
80 '2.9' : 'transfer selenium-containing groups', 66 '2.9': 'transfer selenium-containing groups',
81 '3' : 'Hydrolase', 67 '3': 'Hydrolase',
82 '3.1' : 'act on ester bonds', 68 '3.1': 'act on ester bonds',
83 '3.2' : 'act on sugars - glycosylases', 69 '3.2': 'act on sugars - glycosylases',
84 '3.3' : 'act on ether bonds', 70 '3.3': 'act on ether bonds',
85 '3.4' : 'act on peptide bonds - Peptidase', 71 '3.4': 'act on peptide bonds - Peptidase',
86 '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds', 72 '3.5': 'act on carbon-nitrogen bonds, other than peptide bonds',
87 '3.6' : 'act on acid anhydrides', 73 '3.6': 'act on acid anhydrides',
88 '3.7' : 'act on carbon-carbon bonds', 74 '3.7': 'act on carbon-carbon bonds',
89 '3.8' : 'act on halide bonds', 75 '3.8': 'act on halide bonds',
90 '3.9' : 'act on phosphorus-nitrogen bonds', 76 '3.9': 'act on phosphorus-nitrogen bonds',
91 '3.10' : 'act on sulfur-nitrogen bonds', 77 '3.10': 'act on sulfur-nitrogen bonds',
92 '3.11' : 'act on carbon-phosphorus bonds', 78 '3.11': 'act on carbon-phosphorus bonds',
93 '3.12' : 'act on sulfur-sulfur bonds', 79 '3.12': 'act on sulfur-sulfur bonds',
94 '3.13' : 'act on carbon-sulfur bonds', 80 '3.13': 'act on carbon-sulfur bonds',
95 '4' : 'Lyase', 81 '4': 'Lyase',
96 '4.1' : 'carbon-carbon lyases', 82 '4.1': 'carbon-carbon lyases',
97 '4.2' : 'carbon-oxygen lyases', 83 '4.2': 'carbon-oxygen lyases',
98 '4.3' : 'carbon-nitrogen lyases', 84 '4.3': 'carbon-nitrogen lyases',
99 '4.4' : 'carbon-sulfur lyases', 85 '4.4': 'carbon-sulfur lyases',
100 '4.5' : 'carbon-halide lyases', 86 '4.5': 'carbon-halide lyases',
101 '4.6' : 'phosphorus-oxygen lyases', 87 '4.6': 'phosphorus-oxygen lyases',
102 '5' : 'Isomerase', 88 '5': 'Isomerase',
103 '5.1' : 'racemases and epimerases', 89 '5.1': 'racemases and epimerases',
104 '5.2' : 'cis-trans-isomerases', 90 '5.2': 'cis-trans-isomerases',
105 '5.3' : 'intramolecular oxidoreductases', 91 '5.3': 'intramolecular oxidoreductases',
106 '5.4' : 'intramolecular transferases -- mutases', 92 '5.4': 'intramolecular transferases -- mutases',
107 '5.5' : 'intramolecular lyases', 93 '5.5': 'intramolecular lyases',
108 '5.99' : 'other isomerases', 94 '5.99': 'other isomerases',
109 '6' : 'Ligase', 95 '6': 'Ligase',
110 '6.1' : 'form carbon-oxygen bonds', 96 '6.1': 'form carbon-oxygen bonds',
111 '6.2' : 'form carbon-sulfur bonds', 97 '6.2': 'form carbon-sulfur bonds',
112 '6.3' : 'form carbon-nitrogen bonds', 98 '6.3': 'form carbon-nitrogen bonds',
113 '6.4' : 'form carbon-carbon bonds', 99 '6.4': 'form carbon-carbon bonds',
114 '6.5' : 'form phosphoric ester bonds', 100 '6.5': 'form phosphoric ester bonds',
115 '6.6' : 'form nitrogen-metal bonds', 101 '6.6': 'form nitrogen-metal bonds',
116 } 102 }
117 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] 103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name']
118 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] 104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma']
119 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] 105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
120 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] 106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id']
121 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] 107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids']
122 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] 108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
123 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] 109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
124 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] 110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
125 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']] 111 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
126 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go'] 112 pept2interpro_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count']]
113 pept2interpro_extra_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count', 'type', 'name']]
114 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go', 'ipr']
115
127 116
128 def __main__(): 117 def __main__():
129 version = '2.0' 118 version = '4.3'
130 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' 119 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
131 120
132 def read_tabular(filepath,col): 121 def read_tabular(filepath, col):
122 peptides = []
123 with open(filepath) as fp:
124 for i, line in enumerate(fp):
125 if line.strip() == '' or line.startswith('#'):
126 continue
127 fields = line.rstrip('\n').split('\t')
128 peptide = fields[col]
129 if not re.match(pep_pat, peptide):
130 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide, i, col, filepath), exit_code=invalid_ec)
131 peptides.append(peptide)
132 return peptides
133
134 def get_fasta_entries(fp):
135 name, seq = None, []
136 for line in fp:
137 line = line.rstrip()
138 if line.startswith(">"):
139 if name:
140 yield (name, ''.join(seq))
141 name, seq = line, []
142 else:
143 seq.append(line)
144 if name:
145 yield (name, ''.join(seq))
146
147 def read_fasta(filepath):
148 peptides = []
149 with open(filepath) as fp:
150 for id, peptide in get_fasta_entries(fp):
151 if not re.match(pep_pat, peptide):
152 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide, id, filepath), exit_code=invalid_ec)
153 peptides.append(peptide)
154 return peptides
155
156 def read_mzid(fp):
157 peptides = []
158 for event, elem in ET.iterparse(fp):
159 if event == 'end':
160 if re.search('PeptideSequence', elem.tag):
161 peptides.append(elem.text)
162 return peptides
163
164 def read_pepxml(fp):
165 peptides = []
166 for event, elem in ET.iterparse(fp):
167 if event == 'end':
168 if re.search('search_hit', elem.tag):
169 peptides.append(elem.get('peptide'))
170 return peptides
171
172 def best_match(peptide, matches):
173 if not matches:
174 return None
175 elif len(matches) == 1:
176 return matches[0].copy()
177 elif 'taxon_rank' in matches[0]:
178 # find the most specific match (peptide is always the first column order field)
179 for col in reversed(pept2lca_extra_column_order[1:]):
180 col_id = col + "_id" if options.extra else col
181 for match in matches:
182 if 'taxon_rank' in match and match['taxon_rank'] == col:
183 return match.copy()
184 if col_id in match and match[col_id]:
185 return match.copy()
186 else:
187 return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy()
188 return None
189
190 def get_taxon_json(resp):
191 found_keys = set()
192 for i, pdict in enumerate(resp):
193 found_keys |= set(pdict.keys())
194 taxa_cols = []
195 for col in pept2lca_extra_column_order[-1:0:-1]:
196 if col + '_id' in found_keys:
197 taxa_cols.append(col)
198 id_to_node = dict()
199
200 def get_node(id, name, rank, child, seq):
201 if id not in id_to_node:
202 data = {'count': 0, 'self_count': 0, 'valid_taxon': 1, 'rank': rank, 'sequences': []}
203 node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data}
204 id_to_node[id] = node
205 else:
206 node = id_to_node[id]
207 node['data']['count'] += 1
208 if seq is not None and seq not in node['data']['sequences']:
209 node['data']['sequences'].append(seq)
210 if child is None:
211 node['data']['self_count'] += 1
212 elif child['id'] not in node['kids']:
213 node['kids'].append(child['id'])
214 node['children'].append(child)
215 return node
216 root = get_node(1, 'root', 'no rank', None, None)
217 for i, pdict in enumerate(resp):
218 sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None))
219 seq = sequence
220 child = None
221 for col in taxa_cols:
222 col_id = col + '_id'
223 if col_id in pdict and pdict.get(col_id):
224 col_name = col if col in found_keys else col + '_name'
225 child = get_node(pdict.get(col_id, None), pdict.get(col_name, ''), col, child, seq)
226 seq = None
227 if child is not None:
228 get_node(1, 'root', 'no rank', child, None)
229 return root
230
231 def get_ec_json(resp):
232 ecMap = dict()
233 for pdict in resp:
234 if 'ec' in pdict:
235 for ec in pdict['ec']:
236 ec_number = ec['ec_number']
237 if ec_number not in ecMap:
238 ecMap[ec_number] = []
239 ecMap[ec_number].append(pdict)
240
241 def get_ids(ec):
242 ids = []
243 i = len(ec)
244 while i >= 0:
245 ids.append(ec[:i])
246 i = ec.rfind('.', 0, i - 1)
247 return ids
248 id_to_node = dict()
249
250 def get_node(id, name, child, seq):
251 if id not in id_to_node:
252 data = {'count': 0, 'self_count': 0, 'sequences': []}
253 node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data}
254 id_to_node[id] = node
255 else:
256 node = id_to_node[id]
257 node['data']['count'] += 1
258 if seq is not None and seq not in node['data']['sequences']:
259 node['data']['sequences'].append(seq)
260 if child is None:
261 node['data']['self_count'] += 1
262 elif child['id'] not in node['kids']:
263 node['kids'].append(child['id'])
264 node['children'].append(child)
265 return node
266 root = get_node(0, '-.-.-.-', None, None)
267 for i in range(1, 7):
268 child = get_node(str(i), '%s\n%s' % (str(i), ec_name_dict[str(i)]), None, None)
269 get_node(0, '-.-.-.-', child, None)
270 for i, pdict in enumerate(resp):
271 sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None))
272 seq = sequence
273 if 'ec' in pdict:
274 for ec in pdict['ec']:
275 child = None
276 ec_number = ec['ec_number']
277 for ec_id in get_ids(ec_number):
278 ec_name = str(ec_id)
279 child = get_node(ec_id, ec_name, child, seq)
280 seq = None
281 if child:
282 get_node(0, '-.-.-.-', child, None)
283 return root
284
285 def get_taxon_dict(resp, column_order, extra=False, names=False):
286 found_keys = set()
287 results = []
288 for i, pdict in enumerate(resp):
289 results.append(pdict)
290 found_keys |= set(pdict.keys())
291 # print >> sys.stderr, "%s\n%s" % (pdict.keys(), found_keys)
292 column_names = []
293 column_keys = []
294 for col in column_order:
295 if col in found_keys:
296 column_names.append(col)
297 column_keys.append(col)
298 elif names:
299 col_id = col + '_id'
300 col_name = col + '_name'
301 if extra:
302 if col_id in found_keys:
303 column_names.append(col_id)
304 column_keys.append(col_id)
305 if names:
306 if col_name in found_keys:
307 column_names.append(col)
308 column_keys.append(col_name)
309 else:
310 if col + '_name' in found_keys:
311 column_names.append(col)
312 column_keys.append(col + '_name')
313 elif col + '_id' in found_keys:
314 column_names.append(col)
315 column_keys.append(col + '_id')
316 # print >> sys.stderr, "%s\n%s" % (column_names, column_keys)
317 taxa = dict() # peptide: [taxonomy]
318 for i, pdict in enumerate(results):
319 peptide = pdict['peptide'] if 'peptide' in pdict else None
320 if peptide and peptide not in taxa:
321 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
322 taxa[peptide] = vals
323 return (taxa, column_names)
324
325 def get_ec_dict(resp, extra=False):
326 ec_cols = ['ec_numbers', 'ec_protein_counts']
327 if extra:
328 ec_cols.append('ec_names')
329 ec_dict = dict()
330 for i, pdict in enumerate(resp):
331 peptide = pdict['peptide']
332 ec_numbers = []
333 protein_counts = []
334 ec_names = []
335 if 'ec' in pdict:
336 for ec in pdict['ec']:
337 ec_numbers.append(ec['ec_number'])
338 protein_counts.append(str(ec['protein_count']))
339 if extra:
340 ec_names.append(ec['name'] if 'name' in ec else '')
341 vals = [','.join(ec_numbers), ','.join(protein_counts)]
342 if extra:
343 vals.append(','.join(ec_names))
344 ec_dict[peptide] = vals
345 return (ec_dict, ec_cols)
346
347 def get_go_dict(resp, extra=False):
348 go_cols = ['go_terms', 'go_protein_counts']
349 if extra:
350 go_cols.append('go_names')
351 go_dict = dict()
352 for i, pdict in enumerate(resp):
353 peptide = pdict['peptide']
354 go_terms = []
355 protein_counts = []
356 go_names = []
357 if 'go' in pdict:
358 for go in pdict['go']:
359 if 'go_term' in go:
360 go_terms.append(go['go_term'])
361 protein_counts.append(str(go['protein_count']))
362 if extra:
363 go_names.append(go['name'] if 'name' in go else '')
364 else:
365 for go_type in go_types:
366 if go_type in go:
367 for _go in go[go_type]:
368 go_terms.append(_go['go_term'])
369 protein_counts.append(str(_go['protein_count']))
370 if extra:
371 go_names.append(_go['name'] if 'name' in _go else '')
372 vals = [','.join(go_terms), ','.join(protein_counts)]
373 if extra:
374 vals.append(','.join(go_names))
375 go_dict[peptide] = vals
376 return (go_dict, go_cols)
377
378 def get_ipr_dict(resp, extra=False):
379 ipr_cols = ['ipr_codes', 'ipr_protein_counts']
380 if extra:
381 ipr_cols.append('ipr_types')
382 ipr_cols.append('ipr_names')
383 ipr_dict = dict()
384 for i, pdict in enumerate(resp):
385 peptide = pdict['peptide']
386 ipr_codes = []
387 protein_counts = []
388 ipr_names = []
389 ipr_type_vals = []  # per-peptide type values; the module-level ipr_types list holds the InterPro categories
390 if 'ipr' in pdict:
391 for ipr in pdict['ipr']:
392 if 'code' in ipr:
393 ipr_codes.append(ipr['code'])
394 protein_counts.append(str(ipr['protein_count']))
395 if extra:
396 ipr_type_vals.append(ipr['type'] if 'type' in ipr else '')
397 ipr_names.append(ipr['name'] if 'name' in ipr else '')
398 else:
399 for ipr_type in ipr_types:
400 if ipr_type in ipr:
401 for _ipr in ipr[ipr_type]:
402 ipr_codes.append(_ipr['code'])
403 protein_counts.append(str(_ipr['protein_count']))
404 if extra:
405 ipr_type_vals.append(ipr_type)
406 ipr_names.append(_ipr['name'] if 'name' in _ipr else '')
407 vals = [','.join(ipr_codes), ','.join(protein_counts)]
408 if extra:
409 vals.append(','.join(ipr_type_vals))
410 vals.append(','.join(ipr_names))
411 ipr_dict[peptide] = vals
412 return (ipr_dict, ipr_cols)
413
414 def write_ec_table(outfile, resp, column_order):
415 with open(outfile, 'w') as fh:
416 for i, pdict in enumerate(resp):
417 if 'ec' in pdict:
418 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
419 for ec in pdict['ec']:
420 vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
421 fh.write('%s\n' % '\t'.join(tvals + vals))
422
423 def write_go_table(outfile, resp, column_order):
424 with open(outfile, 'w') as fh:
425 for i, pdict in enumerate(resp):
426 if 'go' in pdict:
427 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
428 for go in pdict['go']:
429 if 'go_term' in go:
430 vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
431 fh.write('%s\n' % '\t'.join(tvals + vals))
432 else:
433 for go_type in go_types:
434 if go_type in go:
435 for _go in go[go_type]:
436 vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
437 vals.append(go_type)
438 fh.write('%s\n' % '\t'.join(tvals + vals))
439
440 def write_ipr_table(outfile, resp, column_order):
441 with open(outfile, 'w') as fh:
442 for i, pdict in enumerate(resp):
443 if 'ipr' in pdict:
444 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
445 for ipr in pdict['ipr']:
446 if 'code' in ipr:
447 vals = [str(ipr[x]) if x in ipr and ipr[x] else '' for x in column_order[-1]]
448 fh.write('%s\n' % '\t'.join(tvals + vals))
449 else:
450 for ipr_type in ipr_types:
451 if ipr_type in ipr:
452 for _ipr in ipr[ipr_type]:
453 vals = [str(_ipr[x]) if x in _ipr and _ipr[x] else '' for x in column_order[-1]]
454 vals.append(ipr_type)
455 fh.write('%s\n' % '\t'.join(tvals + vals))
456
457 # Parse Command Line
458 parser = optparse.OptionParser()
459 # unipept API choice
460 parser.add_option('-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca', 'pept2taxa', 'pept2prot', 'pept2ec', 'pept2go', 'pept2interpro', 'pept2funct', 'peptinfo'],
461 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2interpro, pept2funct, or peptinfo')
462 # input files
463 parser.add_option('-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column')
464 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences')
465 parser.add_option('-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences')
466 parser.add_option('-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file containing peptide sequences')
467 parser.add_option('-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences')
468 # Unipept Flags
469 parser.add_option('-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records')
470 parser.add_option('-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor')
471 parser.add_option('-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor')
472 parser.add_option('-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namespace: biological process, molecular function, cellular component')
473 parser.add_option('-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request')
474
475 # output fields
476 parser.add_option('-A', '--allfields', dest='allfields', action='store_true', default=False, help='include fields: taxon_rank, taxon_id, taxon_name in csv and tsv outputs')
477 # Warn vs Error Flag
478 parser.add_option('-S', '--strict', dest='strict', action='store_true', default=False, help='Exit with an error on an invalid peptide (otherwise only warn)')
479 # output files
480 parser.add_option('-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
481 parser.add_option('-j', '--ec_json', dest='ec_json', default=None, help='Output file path for EC json formatted results')
482 parser.add_option('-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
483 parser.add_option('-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
484 parser.add_option('-I', '--ipr_tsv', dest='ipr_tsv', default=None, help='Output file path for InterPro TAB-separated-values (.tsv) formatted results')
485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptides with no matches')
489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/')
490 # debug
491 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST')
492 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on debugging')
493 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
494 (options, args) = parser.parse_args()
495 if options.version:
496 print('%s' % version)
497 sys.exit(0)
498 invalid_ec = 2 if options.strict else None
133 peptides = [] 499 peptides = []
134 with open(filepath) as fp: 500 # Get peptide sequences
135 for i,line in enumerate(fp): 501 if options.mzid:
136 if line.strip() == '' or line.startswith('#'): 502 peptides += read_mzid(options.mzid)
137 continue 503 if options.pepxml:
138 fields = line.rstrip('\n').split('\t') 504 peptides += read_pepxml(options.pepxml)
139 peptide = fields[col] 505 if options.tabular:
140 if not re.match(pep_pat,peptide): 506 peptides += read_tabular(options.tabular, options.column)
141 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec) 507 if options.fasta:
142 peptides.append(peptide) 508 peptides += read_fasta(options.fasta)
143 return peptides 509 if args and len(args) > 0:
144 510 for i, peptide in enumerate(args):
145 def get_fasta_entries(fp): 511 if not re.match(pep_pat, peptide):
146 name, seq = None, [] 512 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec)
147 for line in fp: 513 peptides.append(peptide)
148 line = line.rstrip() 514 if len(peptides) < 1:
149 if line.startswith(">"): 515 warn_err("No peptides input!", exit_code=1)
150 if name: yield (name, ''.join(seq)) 516 column_order = pept2lca_column_order
151 name, seq = line, [] 517 if options.unipept == 'pept2prot':
152 else: 518 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
153 seq.append(line)
154 if name: yield (name, ''.join(seq))
155
156 def read_fasta(filepath):
157 peptides = []
158 with open(filepath) as fp:
159 for id, peptide in get_fasta_entries(fp):
160 if not re.match(pep_pat,peptide):
161 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec)
162 peptides.append(peptide)
163 return peptides
164
165 def read_mzid(fp):
166 peptides = []
167 for event, elem in ET.iterparse(fp):
168 if event == 'end':
169 if re.search('PeptideSequence',elem.tag):
170 peptides.append(elem.text)
171 return peptides
172
173 def read_pepxml(fp):
174 peptides = []
175 for event, elem in ET.iterparse(fp):
176 if event == 'end':
177 if re.search('search_hit',elem.tag):
178 peptides.append(elem.get('peptide'))
179 return peptides
180
181 def best_match(peptide,matches):
182 if not matches:
183 return None
184 elif len(matches) == 1:
185 return matches[0].copy()
186 elif 'taxon_rank' in matches[0]:
187 # find the most specific match (peptide is always the first column order field)
188 for col in reversed(pept2lca_extra_column_order[1:]):
189 col_id = col+"_id" if options.extra else col
190 for match in matches:
191 if 'taxon_rank' in match and match['taxon_rank'] == col:
192 return match.copy()
193 if col_id in match and match[col_id]:
194 return match.copy()
195 else: 519 else:
196 return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy() 520 if options.extra or options.names:
197 return None 521 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
198 522 else:
199 def get_taxon_json(resp): 523 column_order = pept2lca_column_order
200 found_keys = set() 524 # map to tryptic peptides
201 for i,pdict in enumerate(resp): 525 pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
202 found_keys |= set(pdict.keys()) 526 partToPeps = {}
203 taxa_cols = [] 527 for peptide, parts in pepToParts.items():
204 for col in pept2lca_extra_column_order[-1:0:-1]: 528 if options.debug:
205 if col+'_id' in found_keys: 529 print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr)
206 taxa_cols.append(col) 530 for part in parts:
207 id_to_node = dict() 531 if len(part) > 50:
208 def get_node(id,name,rank,child,seq): 532 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide, len(part), part), exit_code=None)
209 if id not in id_to_node: 533 if 5 <= len(part) <= 50:
210 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } 534 partToPeps.setdefault(part, []).append(peptide)
211 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } 535 trypticPeptides = list(partToPeps.keys())
212 id_to_node[id] = node 536 # unipept
213 else: 537 unipept_resp = []
214 node = id_to_node[id] 538 idx = list(range(0, len(trypticPeptides), options.max_request))
215 node['data']['count'] += 1 539 idx.append(len(trypticPeptides))
216 if seq is not None and seq not in node['data']['sequences']: 540 for i in range(len(idx) - 1):
217 node['data']['sequences'].append(seq) 541 post_data = []
218 if child is None: 542 if options.equate_il:
219 node['data']['self_count'] += 1 543 post_data.append(('equate_il', 'true'))
220 elif child['id'] not in node['kids']: 544 if options.names or options.json:
221 node['kids'].append(child['id']) 545 post_data.append(('extra', 'true'))
222 node['children'].append(child) 546 post_data.append(('names', 'true'))
223 return node 547 elif options.extra or options.json:
224 root = get_node(1,'root','no rank',None,None) 548 post_data.append(('extra', 'true'))
225 for i,pdict in enumerate(resp): 549 if options.domains:
226 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) 550 post_data.append(('domains', 'true'))
227 seq = sequence 551 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i + 1]]]
228 child = None 552 if options.debug:
229 for col in taxa_cols: 553 print('post_data: %s\n' % (str(post_data)), file=sys.stderr)
230 col_id = col+'_id' 554 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
231 if col_id in pdict and pdict.get(col_id): 555 url = '%s/%s' % (options.url.rstrip('/'), options.unipept)
232 col_name = col if col in found_keys else col+'_name' 556 if options.get:
233 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) 557 params = '&'.join(["%s=%s" % (i[0], i[1]) for i in post_data])
234 seq = None 558 url = '%s.json?%s' % (url, params)
235 if child: 559 req = urllib.request.Request(url)
236 get_node(1,'root','no rank',child,None) 560 else:
237 return root 561 url = '%s.json' % (url)
238 562 data = urllib.parse.urlencode(post_data).encode()
239 def get_ec_json(resp): 563 params = '&'.join(["%s=%s" % (i[0], i[1]) for i in post_data])
240 ecMap = dict() 564 if options.debug:
241 for pdict in resp: 565 print('data:\n%s\n' % (data), file=sys.stderr)
242 if 'ec' in pdict: 566 req = urllib.request.Request(url, headers=headers, data=urllib.parse.urlencode(post_data).encode(), method='POST')
243 for ec in pdict['ec']: 567 if options.debug:
244 ec_number = ec['ec_number'] 568 print("url: %s\n" % (str(url)), file=sys.stderr)
245 if ec_number not in ecMap: 569 try:
246 ecMap[ec_number] = [] 570 resp = urllib.request.urlopen(req)
247 ecMap[ec_number].append(pdict) 571 rdata = resp.read()
248 def get_ids(ec): 572 if options.debug:
249 ids = [] 573 print("%s %s\n" % (url, str(resp.getcode())), file=sys.stderr)
250 i = len(ec) 574 if resp.getcode() == 200:
251 while i >= 0: 575 if options.debug:
252 ids.append(ec[:i]) 576 print("rdata: \n%s\n\n" % (rdata), file=sys.stderr)
253 i = ec.rfind('.',0,i - 1) 577 unipept_resp += json.loads(rdata)
254 return ids 578 # unipept_resp += json.loads(urllib.request.urlopen(req).read())
255 id_to_node = dict() 579 except Exception as e:
256 def get_node(id,name,child,seq): 580 warn_err('HTTP Error %s\n' % (str(e)), exit_code=None)
257 if id not in id_to_node: 581 unmatched_peptides = []
258 data = {'count' : 0, 'self_count' : 0, 'sequences' : [] } 582 peptideMatches = []
259 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } 583 if options.debug:
260 id_to_node[id] = node 584 print("unipept response: %s\n" % str(unipept_resp), file=sys.stderr)
261 else: 585 if options.unipept in ['pept2prot', 'pept2taxa']:
262 node = id_to_node[id] 586 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' # should only keep one of these per input peptide
263 node['data']['count'] += 1 587 # multiple entries per trypticPeptide for pep2prot or pep2taxa
264 if seq is not None and seq not in node['data']['sequences']: 588 mapping = {}
265 node['data']['sequences'].append(seq) 589 for match in unipept_resp:
266 if child is None: 590 mapping.setdefault(match['peptide'], []).append(match)
267 node['data']['self_count'] += 1 591 for peptide in peptides:
268 elif child['id'] not in node['kids']: 592 # Get the intersection of matches to the tryptic parts
269 node['kids'].append(child['id']) 593 keyToMatch = None
270 node['children'].append(child) 594 for part in pepToParts[peptide]:
271 return node 595 if part in mapping:
272 root = get_node(0,'-.-.-.-',None,None) 596 temp = {match[dupkey]: match for match in mapping[part]}
273 for i in range(1,7): 597 if keyToMatch:
274 child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None) 598 dkeys = set(keyToMatch.keys()) - set(temp.keys())
275 get_node(0,'-.-.-.-',child,None) 599 for k in dkeys:
276 for i,pdict in enumerate(resp): 600 del keyToMatch[k]
277 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) 601 else:
278 seq = sequence 602 keyToMatch = temp
279 if 'ec' in pdict: 603 # keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp
280 for ec in pdict['ec']: 604 if not keyToMatch:
281 child = None 605 unmatched_peptides.append(peptide)
282 protein_count = ec['protein_count']
283 ec_number = ec['ec_number']
284 for ec_id in get_ids(ec_number):
285 ec_name = str(ec_id)
286 ## if len(ec_id) == 3:
287 ## ec_name = '%s\n%s\n%s' %(str(ec_id), ec_name_dict[str(ec_id[0])], ec_name_dict[str(ec_id)])
288 child = get_node(ec_id,ec_name,child,seq)
289 seq = None
290 if child:
291 get_node(0,'-.-.-.-',child,None)
292 return root
293
294 def get_taxon_dict(resp, column_order, extra=False, names=False):
295 found_keys = set()
296 results = []
297 for i,pdict in enumerate(resp):
298 results.append(pdict)
299 found_keys |= set(pdict.keys())
300 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
301 column_names = []
302 column_keys = []
303 for col in column_order:
304 if col in found_keys:
305 column_names.append(col)
306 column_keys.append(col)
307 elif names:
308 col_id = col+'_id'
309 col_name = col+'_name'
310 if extra:
311 if col_id in found_keys:
312 column_names.append(col_id)
313 column_keys.append(col_id)
314 if names:
315 if col_name in found_keys:
316 column_names.append(col)
317 column_keys.append(col_name)
318 else:
319 if col+'_name' in found_keys:
320 column_names.append(col)
321 column_keys.append(col+'_name')
322 elif col+'_id' in found_keys:
323 column_names.append(col)
324 column_keys.append(col+'_id')
325 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
326 taxa = dict() ## peptide : [taxonomy]
327 for i,pdict in enumerate(results):
328 peptide = pdict['peptide'] if 'peptide' in pdict else None
329 if peptide and peptide not in taxa:
330 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
331 taxa[peptide] = vals
332 return (taxa,column_names)
333
334 def get_ec_dict(resp, extra=False):
335 ec_cols = ['ec_numbers', 'ec_protein_counts']
336 if extra:
337 ec_cols.append('ec_names')
338 ec_dict = dict()
339 for i,pdict in enumerate(resp):
340 peptide = pdict['peptide']
341 ec_numbers = []
342 protein_counts = []
343 ec_names = []
344 if 'ec' in pdict:
345 for ec in pdict['ec']:
346 ec_numbers.append(ec['ec_number'])
347 protein_counts.append(str(ec['protein_count']))
348 if extra:
349 ec_names.append(ec['name'] if 'name' in ec else '')
350 vals = [','.join(ec_numbers),','.join(protein_counts)]
351 if extra:
352 vals.append(','.join(ec_names))
353 ec_dict[peptide] = vals
354 return (ec_dict, ec_cols)
355
356 def get_go_dict(resp, extra=False):
357 go_cols = ['go_terms', 'go_protein_counts']
358 if extra:
359 go_cols.append('go_names')
360 go_dict = dict()
361 for i,pdict in enumerate(resp):
362 peptide = pdict['peptide']
363 go_terms = []
364 protein_counts = []
365 go_names = []
366 if 'go' in pdict:
367 for go in pdict['go']:
368 if 'go_term' in go:
369 go_terms.append(go['go_term'])
370 protein_counts.append(str(go['protein_count']))
371 if extra:
372 go_names.append(go['name'] if 'name' in go else '')
373 else:
374 for go_type in go_types:
375 if go_type in go:
376 for _go in go[go_type]:
377 go_terms.append(_go['go_term'])
378 protein_counts.append(str(_go['protein_count']))
379 if extra:
380 go_names.append(_go['name'] if 'name' in _go else '')
381 vals = [','.join(go_terms),','.join(protein_counts)]
382 if extra:
383 vals.append(','.join(go_names))
384 go_dict[peptide] = vals
385 return (go_dict, go_cols)
386
387 def write_ec_table(outfile, resp, column_order):
388 with open(outfile,'w') as fh:
389 for i,pdict in enumerate(resp):
390 if 'ec' in pdict:
391 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
392 for ec in pdict['ec']:
393 vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
394 fh.write('%s\n' % '\t'.join(tvals + vals))
395
396 def write_go_table(outfile, resp, column_order):
397 with open(outfile,'w') as fh:
398 for i,pdict in enumerate(resp):
399 if 'go' in pdict:
400 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
401 for go in pdict['go']:
402 if 'go_term' in go:
403 vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
404 fh.write('%s\n' % '\t'.join(tvals + vals))
405 else: 606 else:
406 for go_type in go_types: 607 for key, match in keyToMatch.items():
407 if go_type in go: 608 match['tryptic_peptide'] = match['peptide']
408 for _go in go[go_type]: 609 match['peptide'] = peptide
409 vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]] 610 peptideMatches.append(match)
410 vals.append(go_type) 611 elif options.unipept in ['pept2lca', 'peptinfo']:
411 fh.write('%s\n' % '\t'.join(tvals + vals)) 612 # should be one response per trypticPeptide for pep2lca
412 613 respMap = {v['peptide']: v for v in unipept_resp}
413 #Parse Command Line 614 # map resp back to peptides
414 parser = optparse.OptionParser() 615 for peptide in peptides:
415 # unipept API choice 616 matches = list()
416 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'], 617 for part in pepToParts[peptide]:
417 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' ) 618 if part in respMap:
418 # input files 619 matches.append(respMap[part])
419 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) 620 match = best_match(peptide, matches)
420 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) 621 if not match:
421 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) 622 unmatched_peptides.append(peptide)
422 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) 623 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
423 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) 624 match = {'peptide': longest_tryptic_peptide}
424 # Unipept Flags 625 match['tryptic_peptide'] = match['peptide']
425 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) 626 match['peptide'] = peptide
426 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) 627 peptideMatches.append(match)
427 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
428 parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' )
429 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
430
431 # output fields
432 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
433 # Warn vs Error Flag
434 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
435 # output files
436 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
437 parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results')
438 parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
439 parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
440 parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
441 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
442 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
443 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
444 parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' )
445 # debug
446 parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
447 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
448 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
449 (options, args) = parser.parse_args()
450 if options.version:
451 print >> sys.stdout,"%s" % version
452 sys.exit(0)
453 invalid_ec = 2 if options.strict else None
454 peptides = []
455 ## Get peptide sequences
456 if options.mzid:
457 peptides += read_mzid(options.mzid)
458 if options.pepxml:
459 peptides += read_pepxml(options.pepxml)
460 if options.tabular:
461 peptides += read_tabular(options.tabular,options.column)
462 if options.fasta:
463 peptides += read_fasta(options.fasta)
464 if args and len(args) > 0:
465 for i,peptide in enumerate(args):
466 if not re.match(pep_pat,peptide):
467 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide,i),exit_code=invalid_ec)
468 peptides.append(peptide)
469 if len(peptides) < 1:
470 warn_err("No peptides input!",exit_code=1)
471 column_order = pept2lca_column_order
472 if options.unipept == 'pept2prot':
473 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
474 else:
475 if options.extra or options.names:
476 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
477 else: 628 else:
478 column_order = pept2lca_column_order 629 respMap = {v['peptide']: v for v in unipept_resp}
479 ## map to tryptic peptides 630 # map resp back to peptides
480 pepToParts = {p: re.split("\n", re.sub(r'(?<=[RK])(?=[^P])','\n', p)) for p in peptides} 631 for peptide in peptides:
481 partToPeps = {} 632 matches = list()
482 for peptide, parts in pepToParts.iteritems(): 633 for part in pepToParts[peptide]:
483 if options.debug: print >> sys.stdout, "peptide: %s\ttryptic: %s\n" % (peptide, parts) 634 if part in respMap and 'total_protein_count' in respMap[part]:
484 for part in parts: 635 matches.append(respMap[part])
485 if len(part) > 50: 636 match = best_match(peptide, matches)
486 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None) 637 if not match:
487 if 5 <= len(part) <= 50: 638 unmatched_peptides.append(peptide)
488 partToPeps.setdefault(part,[]).append(peptide) 639 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
489 trypticPeptides = partToPeps.keys() 640 match = {'peptide': longest_tryptic_peptide}
490 ## unipept 641 match['tryptic_peptide'] = match['peptide']
491 unipept_resp = [] 642 match['peptide'] = peptide
492 idx = range(0,len(trypticPeptides),options.max_request) 643 peptideMatches.append(match)
493 idx.append(len(trypticPeptides)) 644 resp = peptideMatches
494 for i in range(len(idx)-1): 645 if options.debug:
495 post_data = [] 646 print("\nmapped response: %s\n" % str(resp), file=sys.stderr)
496 if options.equate_il: 647 # output results
497 post_data.append(("equate_il","true")) 648 if not (options.unmatched or options.json or options.tsv or options.csv):
498 if options.names or options.json: 649 print(str(resp))
499 post_data.append(("extra","true")) 650 if options.unmatched:
500 post_data.append(("names","true")) 651 with open(options.unmatched, 'w') as outputFile:
501 elif options.extra or options.json: 652 for peptide in peptides:
502 post_data.append(("extra","true")) 653 if peptide in unmatched_peptides:
503 if options.domains: 654 outputFile.write("%s\n" % peptide)
504 post_data.append(("domains","true")) 655 if options.json:
505 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] 656 if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
506 if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data)) 657 root = get_taxon_json(resp)
507 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} 658 with open(options.json, 'w') as outputFile:
508 ## headers = {'Accept': 'application/json'} 659 outputFile.write(json.dumps(root))
509 url = '%s/%s' % (options.url.rstrip('/'),options.unipept) 660 elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2interpro', 'pept2funct']:
510 if options.get: 661 with open(options.json, 'w') as outputFile:
511 params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data]) 662 outputFile.write(str(resp))
512 url = '%s.json?%s' % (url,params) 663 if options.ec_json:
513 req = urllib2.Request( url ) 664 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
514 else: 665 root = get_ec_json(resp)
515 url = '%s.json' % (url) 666 with open(options.ec_json, 'w') as outputFile:
516 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) 667 outputFile.write(json.dumps(root))
517 if options.debug: print >> sys.stdout, "url: %s\n" % (str(url)) 668 if options.tsv or options.csv:
518 try: 669 rows = []
519 resp = urllib2.urlopen( req ) 670 column_names = None
520 if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode())) 671 if options.unipept in ['pept2ec', 'pept2go', 'pept2interpro', 'pept2funct', 'peptinfo']:
521 if resp.getcode() == 200: 672 taxa = None
522 unipept_resp += json.loads( urllib2.urlopen( req ).read() ) 673 ec_dict = None
523 except Exception, e: 674 go_dict = None
524 warn_err('HTTP Error %s\n' % (str(e)),exit_code=None) 675 ipr_dict = None
525 unmatched_peptides = [] 676 if options.unipept in ['peptinfo']:
526 peptideMatches = [] 677 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
527 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) 678 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
528 if options.unipept in ['pept2prot', 'pept2taxa']: 679 (ec_dict, ec_cols) = get_ec_dict(resp, extra=options.extra)
529 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide 680 if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
530 ## multiple entries per trypticPeptide for pep2prot or pep2taxa 681 (go_dict, go_cols) = get_go_dict(resp, extra=options.extra)
531 mapping = {} 682 if options.unipept in ['pept2interpro', 'pept2funct', 'peptinfo']:
532 for match in unipept_resp: 683 (ipr_dict, ipr_cols) = get_ipr_dict(resp, extra=options.extra)
533 mapping.setdefault(match['peptide'],[]).append(match) 684 for i, pdict in enumerate(resp):
534 for peptide in peptides: 685 peptide = pdict['peptide']
535 # Get the intersection of matches to the tryptic parts 686 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
536 keyToMatch = None 687 column_names = ['peptide', 'total_protein_count']
537 for part in pepToParts[peptide]: 688 vals = [peptide, total_protein_count]
538 if part in mapping: 689 if ec_dict:
539 temp = {match[dupkey] : match for match in mapping[part]} 690 vals += ec_dict[peptide]
540 if keyToMatch: 691 column_names += ec_cols
541 dkeys = set(keyToMatch.keys()) - set(temp.keys()) 692 if go_dict:
542 for k in dkeys: 693 vals += go_dict[peptide]
543 del keyToMatch[k] 694 column_names += go_cols
544 else: 695 if ipr_dict:
545 keyToMatch = temp 696 vals += ipr_dict[peptide]
546 ## keyToMatch = keyToMatch.fromkeys([x for x in keyToMatch if x in temp]) if keyToMatch else temp 697 column_names += ipr_cols
547 if not keyToMatch: 698 if taxa:
548 unmatched_peptides.append(peptide) 699 vals += taxa[peptide][1:]
549 else: 700 column_names += taxon_cols[1:]
550 for key,match in keyToMatch.iteritems(): 701 rows.append(vals)
551 match['tryptic_peptide'] = match['peptide'] 702 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
552 match['peptide'] = peptide 703 (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
553 peptideMatches.append(match) 704 column_names = taxon_cols
554 elif options.unipept in ['pept2lca', 'peptinfo']: 705 rows = list(taxa.values())
555 ## should be one response per trypticPeptide for pep2lca 706 for peptide, vals in taxa.items():
556 respMap = {v['peptide']:v for v in unipept_resp} 707 rows.append(vals)
557 ## map resp back to peptides 708 if options.tsv:
558 for peptide in peptides: 709 with open(options.tsv, 'w') as outputFile:
559 matches = list() 710 if column_names:
560 for part in pepToParts[peptide]: 711 outputFile.write("#%s\n" % '\t'.join(column_names))
561 if part in respMap: 712 for vals in rows:
562 matches.append(respMap[part]) 713 outputFile.write("%s\n" % '\t'.join(vals))
563 match = best_match(peptide,matches) 714 if options.csv:
564 if not match: 715 with open(options.csv, 'w') as outputFile:
565 unmatched_peptides.append(peptide) 716 if column_names:
566 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] 717 outputFile.write("%s\n" % ','.join(column_names))
567 match = {'peptide' : longest_tryptic_peptide} 718 for vals in rows:
568 match['tryptic_peptide'] = match['peptide'] 719 outputFile.write("%s\n" % ','.join(['"%s"' % (v if v else '') for v in vals]))
569 match['peptide'] = peptide 720 if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
570 peptideMatches.append(match) 721 column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
571 else: 722 write_ec_table(options.ec_tsv, resp, column_order)
572 respMap = {v['peptide']:v for v in unipept_resp} 723 if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
573 ## map resp back to peptides 724 column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
574 for peptide in peptides: 725 write_go_table(options.go_tsv, resp, column_order)
575 matches = list() 726 if options.ipr_tsv and options.unipept in ['pept2interpro', 'pept2funct', 'peptinfo']:
576 for part in pepToParts[peptide]: 727 column_order = pept2interpro_extra_column_order if options.extra else pept2interpro_column_order
577 if part in respMap and 'total_protein_count' in respMap[part]: 728 write_ipr_table(options.ipr_tsv, resp, column_order)
578 matches.append(respMap[part]) 729
579 match = best_match(peptide,matches) 730
580 if not match: 731 if __name__ == "__main__":
581 unmatched_peptides.append(peptide) 732 __main__()
582 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
583 match = {'peptide' : longest_tryptic_peptide}
584 match['tryptic_peptide'] = match['peptide']
585 match['peptide'] = peptide
586 peptideMatches.append(match)
587 resp = peptideMatches
588 if options.debug: print >> sys.stdout,"\nmapped response: %s\n" % str(resp)
589 ## output results
590 if not (options.unmatched or options.json or options.tsv or options.csv):
591 print >> sys.stdout, str(resp)
592 if options.unmatched:
593 with open(options.unmatched,'w') as outputFile:
594 for peptide in peptides:
595 if peptide in unmatched_peptides:
596 outputFile.write("%s\n" % peptide)
597 if options.json:
598 if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
599 root = get_taxon_json(resp)
600 with open(options.json,'w') as outputFile:
601 outputFile.write(json.dumps(root))
602 elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
603 with open(options.json,'w') as outputFile:
604 outputFile.write(str(resp))
605 if options.ec_json:
606 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
607 root = get_ec_json(resp)
608 with open(options.ec_json,'w') as outputFile:
609 outputFile.write(json.dumps(root))
610 if options.tsv or options.csv:
611 rows = []
612 column_names = None
613 if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
614 taxa = None
615 ec_dict = None
616 go_dict = None
617 if options.unipept in ['peptinfo']:
618 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
619 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
620 (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
621 if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
622 (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
623 for i,pdict in enumerate(resp):
624 peptide = pdict['peptide']
625 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
626 column_names = ['peptide', 'total_protein_count']
627 vals = [peptide,total_protein_count]
628 if ec_dict:
629 vals += ec_dict[peptide]
630 column_names += ec_cols
631 if go_dict:
632 vals += go_dict[peptide]
633 column_names += go_cols
634 if taxa:
635 vals += taxa[peptide][1:]
636 column_names += taxon_cols[1:]
637 rows.append(vals)
638 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
639 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
640 column_names = taxon_cols
641 rows = taxa.values()
642 for peptide,vals in taxa.iteritems():
643 rows.append(vals)
644 if options.tsv:
645 with open(options.tsv,'w') as outputFile:
646 if column_names:
647 outputFile.write("#%s\n"% '\t'.join(column_names))
648 for vals in rows:
649 outputFile.write("%s\n"% '\t'.join(vals))
650 if options.csv:
651 with open(options.csv,'w') as outputFile:
652 if column_names:
653 outputFile.write("%s\n"% ','.join(column_names))
654 for vals in rows:
655 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))
656 if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
657 column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
658 write_ec_table(options.ec_tsv, resp, column_order)
659 if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
660 column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
661 write_go_table(options.go_tsv, resp, column_order)
662
663 if __name__ == "__main__" : __main__()
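One step of the script that is easy to misread is the tryptic mapping, pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}: each input peptide is cleaved after K or R unless the next residue is P, and only fragments of 5 to 50 residues are sent to Unipept. A minimal standalone sketch of that split, with invented example peptides:

import re


def tryptic_parts(peptide):
    # break after K or R when the following residue is not P, then split on the breaks
    return re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', peptide))


print(tryptic_parts('AEFVEVTKLVTDLTK'))  # ['AEFVEVTK', 'LVTDLTK']
print(tryptic_parts('AKPLR'))            # ['AKPLR'] -- K followed by P is not cleaved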