comparison unipept.py @ 4:4953dcd7dd39 draft

planemo upload for repository http://unipept.ugent.be/apidocs commit e91b0fe16bf468b34884508652359b91847d1f95-dirty
author galaxyp
date Wed, 23 Jan 2019 09:16:38 -0500
parents 34758ab8aaa4
children 917fd3ebc223
comparison
equal deleted inserted replaced
3:34758ab8aaa4 4:4953dcd7dd39
19 import os 19 import os
20 import sys 20 import sys
21 import re 21 import re
22 import urllib 22 import urllib
23 import urllib2 23 import urllib2
24
25 """
26 pept2taxa json
27 pept2lca json
28 pept2prot
29 pept2ec ecjson ec
30 pept2go go
31 pept2funct go ec
32 peptinfo json ecjson ec go
33
34 """
35
24 try: 36 try:
25 import xml.etree.cElementTree as ET 37 import xml.etree.cElementTree as ET
26 except ImportError: 38 except ImportError:
27 import xml.etree.ElementTree as ET 39 import xml.etree.ElementTree as ET
28 40
29 def warn_err(msg,exit_code=1): 41 def warn_err(msg,exit_code=1):
30 sys.stderr.write(msg) 42 sys.stderr.write(msg)
31 if exit_code: 43 if exit_code:
32 sys.exit(exit_code) 44 sys.exit(exit_code)
33 45
46 go_types = ['biological process', 'molecular function', 'cellular component']
47 ec_name_dict = {
48 '1' : 'Oxidoreductase',
49 '1.1' : 'act on the CH-OH group of donors',
50 '1.2' : 'act on the aldehyde or oxo group of donors',
51 '1.3' : 'act on the CH-CH group of donors',
52 '1.4' : 'act on the CH-NH2 group of donors',
53 '1.5' : 'act on CH-NH group of donors',
54 '1.6' : 'act on NADH or NADPH',
55 '1.7' : 'act on other nitrogenous compounds as donors',
56 '1.8' : 'act on a sulfur group of donors',
57 '1.9' : 'act on a heme group of donors',
58 '1.10' : 'act on diphenols and related substances as donors',
59 '1.11' : 'act on peroxide as an acceptor -- peroxidases',
60 '1.12' : 'act on hydrogen as a donor',
61 '1.13' : 'act on single donors with incorporation of molecular oxygen',
62 '1.14' : 'act on paired donors with incorporation of molecular oxygen',
63 '1.15' : 'act on superoxide radicals as acceptors',
64 '1.16' : 'oxidize metal ions',
65 '1.17' : 'act on CH or CH2 groups',
66 '1.18' : 'act on iron-sulfur proteins as donors',
67 '1.19' : 'act on reduced flavodoxin as donor',
68 '1.20' : 'act on phosphorus or arsenic as donors',
69 '1.21' : 'act on X-H and Y-H to form an X-Y bond',
70 '1.97' : 'other oxidoreductases',
71 '2' : 'Transferase',
72 '2.1' : 'transfer one-carbon groups, Methylase',
73 '2.2' : 'transfer aldehyde or ketone groups',
74 '2.3' : 'acyltransferases',
75 '2.4' : 'glycosyltransferases',
76 '2.5' : 'transfer alkyl or aryl groups, other than methyl groups',
77 '2.6' : 'transfer nitrogenous groups',
78 '2.7' : 'transfer phosphorus-containing groups',
79 '2.8' : 'transfer sulfur-containing groups',
80 '2.9' : 'transfer selenium-containing groups',
81 '3' : 'Hydrolase',
82 '3.1' : 'act on ester bonds',
83 '3.2' : 'act on sugars - glycosylases',
84 '3.3' : 'act on ether bonds',
85 '3.4' : 'act on peptide bonds - Peptidase',
86 '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds',
87 '3.6' : 'act on acid anhydrides',
88 '3.7' : 'act on carbon-carbon bonds',
89 '3.8' : 'act on halide bonds',
90 '3.9' : 'act on phosphorus-nitrogen bonds',
91 '3.10' : 'act on sulfur-nitrogen bonds',
92 '3.11' : 'act on carbon-phosphorus bonds',
93 '3.12' : 'act on sulfur-sulfur bonds',
94 '3.13' : 'act on carbon-sulfur bonds',
95 '4' : 'Lyase',
96 '4.1' : 'carbon-carbon lyases',
97 '4.2' : 'carbon-oxygen lyases',
98 '4.3' : 'carbon-nitrogen lyases',
99 '4.4' : 'carbon-sulfur lyases',
100 '4.5' : 'carbon-halide lyases',
101 '4.6' : 'phosphorus-oxygen lyases',
102 '5' : 'Isomerase',
103 '5.1' : 'racemases and epimerases',
104 '5.2' : 'cis-trans-isomerases',
105 '5.3' : 'intramolecular oxidoreductases',
106 '5.4' : 'intramolecular transferases -- mutases',
107 '5.5' : 'intramolecular lyases',
108 '5.99' : 'other isomerases',
109 '6' : 'Ligase',
110 '6.1' : 'form carbon-oxygen bonds',
111 '6.2' : 'form carbon-sulfur bonds',
112 '6.3' : 'form carbon-nitrogen bonds',
113 '6.4' : 'form carbon-carbon bonds',
114 '6.5' : 'form phosphoric ester bonds',
115 '6.6' : 'form nitrogen-metal bonds',
116 }
34 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] 117 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name']
35 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] 118 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ]
36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] 119 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] 120 pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] 121 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
122 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']]
123 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']]
124 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']]
125 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']]
126 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go']
39 127
40 def __main__(): 128 def __main__():
41 version = '2.0' 129 version = '2.0'
42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' 130 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
43 131
93 def best_match(peptide,matches): 181 def best_match(peptide,matches):
94 if not matches: 182 if not matches:
95 return None 183 return None
96 elif len(matches) == 1: 184 elif len(matches) == 1:
97 return matches[0].copy() 185 return matches[0].copy()
98 else: 186 elif 'taxon_rank' in matches[0]:
99 # find the most specific match (peptide is always the first column order field) 187 # find the most specific match (peptide is always the first column order field)
100 for col in reversed(pept2lca_extra_column_order[1:]): 188 for col in reversed(pept2lca_extra_column_order[1:]):
101 col_id = col+"_id" if options.extra else col 189 col_id = col+"_id" if options.extra else col
102 for match in matches: 190 for match in matches:
103 if 'taxon_rank' in match and match['taxon_rank'] == col: 191 if 'taxon_rank' in match and match['taxon_rank'] == col:
104 return match.copy() 192 return match.copy()
105 if col_id in match and match[col_id]: 193 if col_id in match and match[col_id]:
106 return match.copy() 194 return match.copy()
195 else:
196 return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy()
107 return None 197 return None
198
199 def get_taxon_json(resp):
200 found_keys = set()
201 for i,pdict in enumerate(resp):
202 found_keys |= set(pdict.keys())
203 taxa_cols = []
204 for col in pept2lca_extra_column_order[-1:0:-1]:
205 if col+'_id' in found_keys:
206 taxa_cols.append(col)
207 id_to_node = dict()
208 def get_node(id,name,rank,child,seq):
209 if id not in id_to_node:
210 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] }
211 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
212 id_to_node[id] = node
213 else:
214 node = id_to_node[id]
215 node['data']['count'] += 1
216 if seq is not None and seq not in node['data']['sequences']:
217 node['data']['sequences'].append(seq)
218 if child is None:
219 node['data']['self_count'] += 1
220 elif child['id'] not in node['kids']:
221 node['kids'].append(child['id'])
222 node['children'].append(child)
223 return node
224 root = get_node(1,'root','no rank',None,None)
225 for i,pdict in enumerate(resp):
226 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
227 seq = sequence
228 child = None
229 for col in taxa_cols:
230 col_id = col+'_id'
231 if col_id in pdict and pdict.get(col_id):
232 col_name = col if col in found_keys else col+'_name'
233 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
234 seq = None
235 if child:
236 get_node(1,'root','no rank',child,None)
237 return root
238
239 def get_ec_json(resp):
240 ecMap = dict()
241 for pdict in resp:
242 if 'ec' in pdict:
243 for ec in pdict['ec']:
244 ec_number = ec['ec_number']
245 if ec_number not in ecMap:
246 ecMap[ec_number] = []
247 ecMap[ec_number].append(pdict)
248 def get_ids(ec):
249 ids = []
250 i = len(ec)
251 while i >= 0:
252 ids.append(ec[:i])
253 i = ec.rfind('.',0,i - 1)
254 return ids
255 id_to_node = dict()
256 def get_node(id,name,child,seq):
257 if id not in id_to_node:
258 data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
259 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
260 id_to_node[id] = node
261 else:
262 node = id_to_node[id]
263 node['data']['count'] += 1
264 if seq is not None and seq not in node['data']['sequences']:
265 node['data']['sequences'].append(seq)
266 if child is None:
267 node['data']['self_count'] += 1
268 elif child['id'] not in node['kids']:
269 node['kids'].append(child['id'])
270 node['children'].append(child)
271 return node
272 root = get_node(0,'-.-.-.-',None,None)
273 for i in range(1,7):
274 child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
275 get_node(0,'-.-.-.-',child,None)
276 for i,pdict in enumerate(resp):
277 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
278 seq = sequence
279 if 'ec' in pdict:
280 for ec in pdict['ec']:
281 child = None
282 protein_count = ec['protein_count']
283 ec_number = ec['ec_number']
284 for ec_id in get_ids(ec_number):
285 ec_name = str(ec_id)
286 ## if len(ec_id) == 3:
287 ## ec_name = '%s\n%s\n%s' %(str(ec_id), ec_name_dict[str(ec_id[0])], ec_name_dict[str(ec_id)])
288 child = get_node(ec_id,ec_name,child,seq)
289 seq = None
290 if child:
291 get_node(0,'-.-.-.-',child,None)
292 return root
293
294 def get_taxon_dict(resp, column_order, extra=False, names=False):
295 found_keys = set()
296 results = []
297 for i,pdict in enumerate(resp):
298 results.append(pdict)
299 found_keys |= set(pdict.keys())
300 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
301 column_names = []
302 column_keys = []
303 for col in column_order:
304 if col in found_keys:
305 column_names.append(col)
306 column_keys.append(col)
307 elif names:
308 col_id = col+'_id'
309 col_name = col+'_name'
310 if extra:
311 if col_id in found_keys:
312 column_names.append(col_id)
313 column_keys.append(col_id)
314 if names:
315 if col_name in found_keys:
316 column_names.append(col)
317 column_keys.append(col_name)
318 else:
319 if col+'_name' in found_keys:
320 column_names.append(col)
321 column_keys.append(col+'_name')
322 elif col+'_id' in found_keys:
323 column_names.append(col)
324 column_keys.append(col+'_id')
325 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
326 taxa = dict() ## peptide : [taxonomy]
327 for i,pdict in enumerate(results):
328 peptide = pdict['peptide'] if 'peptide' in pdict else None
329 if peptide and peptide not in taxa:
330 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
331 taxa[peptide] = vals
332 return (taxa,column_names)
333
334 def get_ec_dict(resp, extra=False):
335 ec_cols = ['ec_numbers', 'ec_protein_counts']
336 if extra:
337 ec_cols.append('ec_names')
338 ec_dict = dict()
339 for i,pdict in enumerate(resp):
340 peptide = pdict['peptide']
341 ec_numbers = []
342 protein_counts = []
343 ec_names = []
344 if 'ec' in pdict:
345 for ec in pdict['ec']:
346 ec_numbers.append(ec['ec_number'])
347 protein_counts.append(str(ec['protein_count']))
348 if extra:
349 ec_names.append(ec['name'] if 'name' in ec else '')
350 vals = [','.join(ec_numbers),','.join(protein_counts)]
351 if extra:
352 vals.append(','.join(ec_names))
353 ec_dict[peptide] = vals
354 return (ec_dict, ec_cols)
355
356 def get_go_dict(resp, extra=False):
357 go_cols = ['go_terms', 'go_protein_counts']
358 if extra:
359 go_cols.append('go_names')
360 go_dict = dict()
361 for i,pdict in enumerate(resp):
362 peptide = pdict['peptide']
363 go_terms = []
364 protein_counts = []
365 go_names = []
366 if 'go' in pdict:
367 for go in pdict['go']:
368 if 'go_term' in go:
369 go_terms.append(go['go_term'])
370 protein_counts.append(str(go['protein_count']))
371 if extra:
372 go_names.append(go['name'] if 'name' in go else '')
373 else:
374 for go_type in go_types:
375 if go_type in go:
376 for _go in go[go_type]:
377 go_terms.append(_go['go_term'])
378 protein_counts.append(str(_go['protein_count']))
379 if extra:
380 go_names.append(_go['name'] if 'name' in _go else '')
381 vals = [','.join(go_terms),','.join(protein_counts)]
382 if extra:
383 vals.append(','.join(go_names))
384 go_dict[peptide] = vals
385 return (go_dict, go_cols)
386
387 def write_ec_table(outfile, resp, column_order):
388 with open(outfile,'w') as fh:
389 for i,pdict in enumerate(resp):
390 if 'ec' in pdict:
391 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
392 for ec in pdict['ec']:
393 vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]]
394 fh.write('%s\n' % '\t'.join(tvals + vals))
395
396 def write_go_table(outfile, resp, column_order):
397 with open(outfile,'w') as fh:
398 for i,pdict in enumerate(resp):
399 if 'go' in pdict:
400 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]]
401 for go in pdict['go']:
402 if 'go_term' in go:
403 vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]]
404 fh.write('%s\n' % '\t'.join(tvals + vals))
405 else:
406 for go_type in go_types:
407 if go_type in go:
408 for _go in go[go_type]:
409 vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]]
410 vals.append(go_type)
411 fh.write('%s\n' % '\t'.join(tvals + vals))
108 412
109 #Parse Command Line 413 #Parse Command Line
110 parser = optparse.OptionParser() 414 parser = optparse.OptionParser()
111 # unipept API choice 415 # unipept API choice
112 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) 416 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'],
417 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' )
113 # input files 418 # input files
114 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) 419 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' )
115 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) 420 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' )
116 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) 421 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' )
117 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) 422 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' )
118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) 423 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
119 # Unipept Flags 424 # Unipept Flags
120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) 425 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) 426 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) 427 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
428 parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' )
123 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) 429 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
124 430
125 # output fields 431 # output fields
126 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) 432 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
127 # Warn vs Error Flag 433 # Warn vs Error Flag
128 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) 434 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
129 # output files 435 # output files
130 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') 436 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
437 parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results')
438 parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
439 parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
440 parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
131 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') 441 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
132 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') 442 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
133 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) 443 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' )
444 parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' )
134 # debug 445 # debug
446 parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' )
135 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) 447 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' )
136 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) 448 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' )
137 (options, args) = parser.parse_args() 449 (options, args) = parser.parse_args()
138 if options.version: 450 if options.version:
139 print >> sys.stdout,"%s" % version 451 print >> sys.stdout,"%s" % version
186 if options.names or options.json: 498 if options.names or options.json:
187 post_data.append(("extra","true")) 499 post_data.append(("extra","true"))
188 post_data.append(("names","true")) 500 post_data.append(("names","true"))
189 elif options.extra or options.json: 501 elif options.extra or options.json:
190 post_data.append(("extra","true")) 502 post_data.append(("extra","true"))
503 if options.domains:
504 post_data.append(("domains","true"))
191 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] 505 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
506 if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data))
192 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} 507 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
193 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept 508 ## headers = {'Accept': 'application/json'}
194 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) 509 url = '%s/%s' % (options.url.rstrip('/'),options.unipept)
195 unipept_resp += json.loads( urllib2.urlopen( req ).read() ) 510 if options.get:
511 params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data])
512 url = '%s.json?%s' % (url,params)
513 req = urllib2.Request( url )
514 else:
515 url = '%s.json' % (url)
516 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
517 if options.debug: print >> sys.stdout, "url: %s\n" % (str(url))
518 try:
519 resp = urllib2.urlopen( req )
520 if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode()))
521 if resp.getcode() == 200:
522 unipept_resp += json.loads( urllib2.urlopen( req ).read() )
523 except Exception, e:
524 warn_err('HTTP Error %s\n' % (str(e)),exit_code=None)
196 unmatched_peptides = [] 525 unmatched_peptides = []
197 peptideMatches = [] 526 peptideMatches = []
198 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) 527 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
199 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': 528 if options.unipept in ['pept2prot', 'pept2taxa']:
200 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide 529 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
201 ## multiple entries per trypticPeptide for pep2prot or pep2taxa 530 ## multiple entries per trypticPeptide for pep2prot or pep2taxa
202 mapping = {} 531 mapping = {}
203 for match in unipept_resp: 532 for match in unipept_resp:
204 mapping.setdefault(match['peptide'],[]).append(match) 533 mapping.setdefault(match['peptide'],[]).append(match)
220 else: 549 else:
221 for key,match in keyToMatch.iteritems(): 550 for key,match in keyToMatch.iteritems():
222 match['tryptic_peptide'] = match['peptide'] 551 match['tryptic_peptide'] = match['peptide']
223 match['peptide'] = peptide 552 match['peptide'] = peptide
224 peptideMatches.append(match) 553 peptideMatches.append(match)
225 else: 554 elif options.unipept in ['pept2lca', 'peptinfo']:
226 ## should be one response per trypticPeptide for pep2lca 555 ## should be one response per trypticPeptide for pep2lca
227 respMap = {v['peptide']:v for v in unipept_resp} 556 respMap = {v['peptide']:v for v in unipept_resp}
228 ## map resp back to peptides 557 ## map resp back to peptides
229 for peptide in peptides: 558 for peptide in peptides:
230 matches = list() 559 matches = list()
231 for part in pepToParts[peptide]: 560 for part in pepToParts[peptide]:
232 if part in respMap: 561 if part in respMap:
562 matches.append(respMap[part])
563 match = best_match(peptide,matches)
564 if not match:
565 unmatched_peptides.append(peptide)
566 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
567 match = {'peptide' : longest_tryptic_peptide}
568 match['tryptic_peptide'] = match['peptide']
569 match['peptide'] = peptide
570 peptideMatches.append(match)
571 else:
572 respMap = {v['peptide']:v for v in unipept_resp}
573 ## map resp back to peptides
574 for peptide in peptides:
575 matches = list()
576 for part in pepToParts[peptide]:
577 if part in respMap and 'total_protein_count' in respMap[part]:
233 matches.append(respMap[part]) 578 matches.append(respMap[part])
234 match = best_match(peptide,matches) 579 match = best_match(peptide,matches)
235 if not match: 580 if not match:
236 unmatched_peptides.append(peptide) 581 unmatched_peptides.append(peptide)
237 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] 582 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
248 with open(options.unmatched,'w') as outputFile: 593 with open(options.unmatched,'w') as outputFile:
249 for peptide in peptides: 594 for peptide in peptides:
250 if peptide in unmatched_peptides: 595 if peptide in unmatched_peptides:
251 outputFile.write("%s\n" % peptide) 596 outputFile.write("%s\n" % peptide)
252 if options.json: 597 if options.json:
253 if options.unipept == 'pept2prot': 598 if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
599 root = get_taxon_json(resp)
600 with open(options.json,'w') as outputFile:
601 outputFile.write(json.dumps(root))
602 elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
254 with open(options.json,'w') as outputFile: 603 with open(options.json,'w') as outputFile:
255 outputFile.write(str(resp)) 604 outputFile.write(str(resp))
256 else: 605 if options.ec_json:
257 found_keys = set() 606 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
607 root = get_ec_json(resp)
608 with open(options.ec_json,'w') as outputFile:
609 outputFile.write(json.dumps(root))
610 if options.tsv or options.csv:
611 rows = []
612 column_names = None
613 if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
614 taxa = None
615 ec_dict = None
616 go_dict = None
617 if options.unipept in ['peptinfo']:
618 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
619 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
620 (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra)
621 if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
622 (go_dict,go_cols) = get_go_dict(resp, extra=options.extra)
258 for i,pdict in enumerate(resp): 623 for i,pdict in enumerate(resp):
259 found_keys |= set(pdict.keys()) 624 peptide = pdict['peptide']
260 taxa_cols = [] 625 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
261 for col in pept2lca_extra_column_order[-1:0:-1]: 626 column_names = ['peptide', 'total_protein_count']
262 if col+'_id' in found_keys: 627 vals = [peptide,total_protein_count]
263 taxa_cols.append(col) 628 if ec_dict:
264 id_to_node = dict() 629 vals += ec_dict[peptide]
265 def get_node(id,name,rank,child,seq): 630 column_names += ec_cols
266 if id not in id_to_node: 631 if go_dict:
267 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } 632 vals += go_dict[peptide]
268 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } 633 column_names += go_cols
269 id_to_node[id] = node 634 if taxa:
270 else: 635 vals += taxa[peptide][1:]
271 node = id_to_node[id] 636 column_names += taxon_cols[1:]
272 node['data']['count'] += 1 637 rows.append(vals)
273 if seq is not None and seq not in node['data']['sequences']: 638 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
274 node['data']['sequences'].append(seq) 639 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
275 if child is None: 640 column_names = taxon_cols
276 node['data']['self_count'] += 1 641 rows = taxa.values()
277 elif child['id'] not in node['kids']: 642 for peptide,vals in taxa.iteritems():
278 node['kids'].append(child['id']) 643 rows.append(vals)
279 node['children'].append(child)
280 return node
281 root = get_node(1,'root','no rank',None,None)
282 for i,pdict in enumerate(resp):
283 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
284 seq = sequence
285 child = None
286 for col in taxa_cols:
287 col_id = col+'_id'
288 if col_id in pdict and pdict.get(col_id):
289 col_name = col if col in found_keys else col+'_name'
290 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
291 seq = None
292 if child:
293 get_node(1,'root','no rank',child,None)
294 with open(options.json,'w') as outputFile:
295 outputFile.write(json.dumps(root))
296 if options.tsv or options.csv:
297 # 'pept2lca','pept2taxa','pept2prot'
298 found_keys = set()
299 results = []
300 for i,pdict in enumerate(resp):
301 results.append(pdict)
302 found_keys |= set(pdict.keys())
303 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys)
304 column_names = []
305 column_keys = []
306 for col in column_order:
307 if col in found_keys:
308 column_names.append(col)
309 column_keys.append(col)
310 elif options.extra or options.names:
311 col_id = col+'_id'
312 col_name = col+'_name'
313 if options.extra:
314 if col_id in found_keys:
315 column_names.append(col_id)
316 column_keys.append(col_id)
317 if options.names:
318 if col_name in found_keys:
319 column_names.append(col)
320 column_keys.append(col_name)
321 else:
322 if col+'_name' in found_keys:
323 column_names.append(col)
324 column_keys.append(col+'_name')
325 elif col+'_id' in found_keys:
326 column_names.append(col)
327 column_keys.append(col+'_id')
328 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys)
329 taxa = []
330 for i,pdict in enumerate(results):
331 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys]
332 if vals not in taxa:
333 taxa.append(vals)
334 if options.tsv: 644 if options.tsv:
335 with open(options.tsv,'w') as outputFile: 645 with open(options.tsv,'w') as outputFile:
336 outputFile.write("#%s\n"% '\t'.join(column_names)) 646 if column_names:
337 for vals in taxa: 647 outputFile.write("#%s\n"% '\t'.join(column_names))
648 for vals in rows:
338 outputFile.write("%s\n"% '\t'.join(vals)) 649 outputFile.write("%s\n"% '\t'.join(vals))
339 if options.csv: 650 if options.csv:
340 with open(options.csv,'w') as outputFile: 651 with open(options.csv,'w') as outputFile:
341 outputFile.write("%s\n"% ','.join(column_names)) 652 if column_names:
342 for vals in taxa: 653 outputFile.write("%s\n"% ','.join(column_names))
654 for vals in rows:
343 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals])) 655 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals]))
656 if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
657 column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
658 write_ec_table(options.ec_tsv, resp, column_order)
659 if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
660 column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
661 write_go_table(options.go_tsv, resp, column_order)
344 662
345 if __name__ == "__main__" : __main__() 663 if __name__ == "__main__" : __main__()