Mercurial > repos > galaxyp > unipept
comparison unipept.py @ 4:4953dcd7dd39 draft
planemo upload for repository http://unipept.ugent.be/apidocs commit e91b0fe16bf468b34884508652359b91847d1f95-dirty
author | galaxyp |
---|---|
date | Wed, 23 Jan 2019 09:16:38 -0500 |
parents | 34758ab8aaa4 |
children | 917fd3ebc223 |
comparison
equal
deleted
inserted
replaced
3:34758ab8aaa4 | 4:4953dcd7dd39 |
---|---|
19 import os | 19 import os |
20 import sys | 20 import sys |
21 import re | 21 import re |
22 import urllib | 22 import urllib |
23 import urllib2 | 23 import urllib2 |
24 | |
25 """ | |
26 pept2taxa json | |
27 pept2lca json | |
28 pept2prot | |
29 pept2ec ecjson ec | |
30 pept2go go | |
31 pept2funct go ec | |
32 peptinfo json ecjson ec go | |
33 | |
34 """ | |
35 | |
24 try: | 36 try: |
25 import xml.etree.cElementTree as ET | 37 import xml.etree.cElementTree as ET |
26 except ImportError: | 38 except ImportError: |
27 import xml.etree.ElementTree as ET | 39 import xml.etree.ElementTree as ET |
28 | 40 |
29 def warn_err(msg,exit_code=1): | 41 def warn_err(msg,exit_code=1): |
30 sys.stderr.write(msg) | 42 sys.stderr.write(msg) |
31 if exit_code: | 43 if exit_code: |
32 sys.exit(exit_code) | 44 sys.exit(exit_code) |
33 | 45 |
46 go_types = ['biological process', 'molecular function', 'cellular component'] | |
47 ec_name_dict = { | |
48 '1' : 'Oxidoreductase', | |
49 '1.1' : 'act on the CH-OH group of donors', | |
50 '1.2' : 'act on the aldehyde or oxo group of donors', | |
51 '1.3' : 'act on the CH-CH group of donors', | |
52 '1.4' : 'act on the CH-NH2 group of donors', | |
53 '1.5' : 'act on CH-NH group of donors', | |
54 '1.6' : 'act on NADH or NADPH', | |
55 '1.7' : 'act on other nitrogenous compounds as donors', | |
56 '1.8' : 'act on a sulfur group of donors', | |
57 '1.9' : 'act on a heme group of donors', | |
58 '1.10' : 'act on diphenols and related substances as donors', | |
59 '1.11' : 'act on peroxide as an acceptor -- peroxidases', | |
60 '1.12' : 'act on hydrogen as a donor', | |
61 '1.13' : 'act on single donors with incorporation of molecular oxygen', | |
62 '1.14' : 'act on paired donors with incorporation of molecular oxygen', | |
63 '1.15' : 'act on superoxide radicals as acceptors', | |
64 '1.16' : 'oxidize metal ions', | |
65 '1.17' : 'act on CH or CH2 groups', | |
66 '1.18' : 'act on iron-sulfur proteins as donors', | |
67 '1.19' : 'act on reduced flavodoxin as donor', | |
68 '1.20' : 'act on phosphorus or arsenic as donors', | |
69 '1.21' : 'act on X-H and Y-H to form an X-Y bond', | |
70 '1.97' : 'other oxidoreductases', | |
71 '2' : 'Transferase', | |
72 '2.1' : 'transfer one-carbon groups, Methylase', | |
73 '2.2' : 'transfer aldehyde or ketone groups', | |
74 '2.3' : 'acyltransferases', | |
75 '2.4' : 'glycosyltransferases', | |
76 '2.5' : 'transfer alkyl or aryl groups, other than methyl groups', | |
77 '2.6' : 'transfer nitrogenous groups', | |
78 '2.7' : 'transfer phosphorus-containing groups', | |
79 '2.8' : 'transfer sulfur-containing groups', | |
80 '2.9' : 'transfer selenium-containing groups', | |
81 '3' : 'Hydrolase', | |
82 '3.1' : 'act on ester bonds', | |
83 '3.2' : 'act on sugars - glycosylases', | |
84 '3.3' : 'act on ether bonds', | |
85 '3.4' : 'act on peptide bonds - Peptidase', | |
86 '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds', | |
87 '3.6' : 'act on acid anhydrides', | |
88 '3.7' : 'act on carbon-carbon bonds', | |
89 '3.8' : 'act on halide bonds', | |
90 '3.9' : 'act on phosphorus-nitrogen bonds', | |
91 '3.10' : 'act on sulfur-nitrogen bonds', | |
92 '3.11' : 'act on carbon-phosphorus bonds', | |
93 '3.12' : 'act on sulfur-sulfur bonds', | |
94 '3.13' : 'act on carbon-sulfur bonds', | |
95 '4' : 'Lyase', | |
96 '4.1' : 'carbon-carbon lyases', | |
97 '4.2' : 'carbon-oxygen lyases', | |
98 '4.3' : 'carbon-nitrogen lyases', | |
99 '4.4' : 'carbon-sulfur lyases', | |
100 '4.5' : 'carbon-halide lyases', | |
101 '4.6' : 'phosphorus-oxygen lyases', | |
102 '5' : 'Isomerase', | |
103 '5.1' : 'racemases and epimerases', | |
104 '5.2' : 'cis-trans-isomerases', | |
105 '5.3' : 'intramolecular oxidoreductases', | |
106 '5.4' : 'intramolecular transferases -- mutases', | |
107 '5.5' : 'intramolecular lyases', | |
108 '5.99' : 'other isomerases', | |
109 '6' : 'Ligase', | |
110 '6.1' : 'form carbon-oxygen bonds', | |
111 '6.2' : 'form carbon-sulfur bonds', | |
112 '6.3' : 'form carbon-nitrogen bonds', | |
113 '6.4' : 'form carbon-carbon bonds', | |
114 '6.5' : 'form phosphoric ester bonds', | |
115 '6.6' : 'form nitrogen-metal bonds', | |
116 } | |
34 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] | 117 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] |
35 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] | 118 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] |
36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] | 119 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] |
37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] | 120 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] |
38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] | 121 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] |
122 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] | |
123 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] | |
124 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] | |
125 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']] | |
126 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go'] | |
39 | 127 |
40 def __main__(): | 128 def __main__(): |
41 version = '2.0' | 129 version = '2.0' |
42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' | 130 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' |
43 | 131 |
93 def best_match(peptide,matches): | 181 def best_match(peptide,matches): |
94 if not matches: | 182 if not matches: |
95 return None | 183 return None |
96 elif len(matches) == 1: | 184 elif len(matches) == 1: |
97 return matches[0].copy() | 185 return matches[0].copy() |
98 else: | 186 elif 'taxon_rank' in matches[0]: |
99 # find the most specific match (peptide is always the first column order field) | 187 # find the most specific match (peptide is always the first column order field) |
100 for col in reversed(pept2lca_extra_column_order[1:]): | 188 for col in reversed(pept2lca_extra_column_order[1:]): |
101 col_id = col+"_id" if options.extra else col | 189 col_id = col+"_id" if options.extra else col |
102 for match in matches: | 190 for match in matches: |
103 if 'taxon_rank' in match and match['taxon_rank'] == col: | 191 if 'taxon_rank' in match and match['taxon_rank'] == col: |
104 return match.copy() | 192 return match.copy() |
105 if col_id in match and match[col_id]: | 193 if col_id in match and match[col_id]: |
106 return match.copy() | 194 return match.copy() |
195 else: | |
196 return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy() | |
107 return None | 197 return None |
198 | |
199 def get_taxon_json(resp): | |
200 found_keys = set() | |
201 for i,pdict in enumerate(resp): | |
202 found_keys |= set(pdict.keys()) | |
203 taxa_cols = [] | |
204 for col in pept2lca_extra_column_order[-1:0:-1]: | |
205 if col+'_id' in found_keys: | |
206 taxa_cols.append(col) | |
207 id_to_node = dict() | |
208 def get_node(id,name,rank,child,seq): | |
209 if id not in id_to_node: | |
210 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } | |
211 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } | |
212 id_to_node[id] = node | |
213 else: | |
214 node = id_to_node[id] | |
215 node['data']['count'] += 1 | |
216 if seq is not None and seq not in node['data']['sequences']: | |
217 node['data']['sequences'].append(seq) | |
218 if child is None: | |
219 node['data']['self_count'] += 1 | |
220 elif child['id'] not in node['kids']: | |
221 node['kids'].append(child['id']) | |
222 node['children'].append(child) | |
223 return node | |
224 root = get_node(1,'root','no rank',None,None) | |
225 for i,pdict in enumerate(resp): | |
226 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) | |
227 seq = sequence | |
228 child = None | |
229 for col in taxa_cols: | |
230 col_id = col+'_id' | |
231 if col_id in pdict and pdict.get(col_id): | |
232 col_name = col if col in found_keys else col+'_name' | |
233 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) | |
234 seq = None | |
235 if child: | |
236 get_node(1,'root','no rank',child,None) | |
237 return root | |
238 | |
239 def get_ec_json(resp): | |
240 ecMap = dict() | |
241 for pdict in resp: | |
242 if 'ec' in pdict: | |
243 for ec in pdict['ec']: | |
244 ec_number = ec['ec_number'] | |
245 if ec_number not in ecMap: | |
246 ecMap[ec_number] = [] | |
247 ecMap[ec_number].append(pdict) | |
248 def get_ids(ec): | |
249 ids = [] | |
250 i = len(ec) | |
251 while i >= 0: | |
252 ids.append(ec[:i]) | |
253 i = ec.rfind('.',0,i - 1) | |
254 return ids | |
255 id_to_node = dict() | |
256 def get_node(id,name,child,seq): | |
257 if id not in id_to_node: | |
258 data = {'count' : 0, 'self_count' : 0, 'sequences' : [] } | |
259 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } | |
260 id_to_node[id] = node | |
261 else: | |
262 node = id_to_node[id] | |
263 node['data']['count'] += 1 | |
264 if seq is not None and seq not in node['data']['sequences']: | |
265 node['data']['sequences'].append(seq) | |
266 if child is None: | |
267 node['data']['self_count'] += 1 | |
268 elif child['id'] not in node['kids']: | |
269 node['kids'].append(child['id']) | |
270 node['children'].append(child) | |
271 return node | |
272 root = get_node(0,'-.-.-.-',None,None) | |
273 for i in range(1,7): | |
274 child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None) | |
275 get_node(0,'-.-.-.-',child,None) | |
276 for i,pdict in enumerate(resp): | |
277 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) | |
278 seq = sequence | |
279 if 'ec' in pdict: | |
280 for ec in pdict['ec']: | |
281 child = None | |
282 protein_count = ec['protein_count'] | |
283 ec_number = ec['ec_number'] | |
284 for ec_id in get_ids(ec_number): | |
285 ec_name = str(ec_id) | |
286 ## if len(ec_id) == 3: | |
287 ## ec_name = '%s\n%s\n%s' %(str(ec_id), ec_name_dict[str(ec_id[0])], ec_name_dict[str(ec_id)]) | |
288 child = get_node(ec_id,ec_name,child,seq) | |
289 seq = None | |
290 if child: | |
291 get_node(0,'-.-.-.-',child,None) | |
292 return root | |
293 | |
294 def get_taxon_dict(resp, column_order, extra=False, names=False): | |
295 found_keys = set() | |
296 results = [] | |
297 for i,pdict in enumerate(resp): | |
298 results.append(pdict) | |
299 found_keys |= set(pdict.keys()) | |
300 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys) | |
301 column_names = [] | |
302 column_keys = [] | |
303 for col in column_order: | |
304 if col in found_keys: | |
305 column_names.append(col) | |
306 column_keys.append(col) | |
307 elif names: | |
308 col_id = col+'_id' | |
309 col_name = col+'_name' | |
310 if extra: | |
311 if col_id in found_keys: | |
312 column_names.append(col_id) | |
313 column_keys.append(col_id) | |
314 if names: | |
315 if col_name in found_keys: | |
316 column_names.append(col) | |
317 column_keys.append(col_name) | |
318 else: | |
319 if col+'_name' in found_keys: | |
320 column_names.append(col) | |
321 column_keys.append(col+'_name') | |
322 elif col+'_id' in found_keys: | |
323 column_names.append(col) | |
324 column_keys.append(col+'_id') | |
325 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) | |
326 taxa = dict() ## peptide : [taxonomy] | |
327 for i,pdict in enumerate(results): | |
328 peptide = pdict['peptide'] if 'peptide' in pdict else None | |
329 if peptide and peptide not in taxa: | |
330 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] | |
331 taxa[peptide] = vals | |
332 return (taxa,column_names) | |
333 | |
334 def get_ec_dict(resp, extra=False): | |
335 ec_cols = ['ec_numbers', 'ec_protein_counts'] | |
336 if extra: | |
337 ec_cols.append('ec_names') | |
338 ec_dict = dict() | |
339 for i,pdict in enumerate(resp): | |
340 peptide = pdict['peptide'] | |
341 ec_numbers = [] | |
342 protein_counts = [] | |
343 ec_names = [] | |
344 if 'ec' in pdict: | |
345 for ec in pdict['ec']: | |
346 ec_numbers.append(ec['ec_number']) | |
347 protein_counts.append(str(ec['protein_count'])) | |
348 if extra: | |
349 ec_names.append(ec['name'] if 'name' in ec else '') | |
350 vals = [','.join(ec_numbers),','.join(protein_counts)] | |
351 if extra: | |
352 vals.append(','.join(ec_names)) | |
353 ec_dict[peptide] = vals | |
354 return (ec_dict, ec_cols) | |
355 | |
356 def get_go_dict(resp, extra=False): | |
357 go_cols = ['go_terms', 'go_protein_counts'] | |
358 if extra: | |
359 go_cols.append('go_names') | |
360 go_dict = dict() | |
361 for i,pdict in enumerate(resp): | |
362 peptide = pdict['peptide'] | |
363 go_terms = [] | |
364 protein_counts = [] | |
365 go_names = [] | |
366 if 'go' in pdict: | |
367 for go in pdict['go']: | |
368 if 'go_term' in go: | |
369 go_terms.append(go['go_term']) | |
370 protein_counts.append(str(go['protein_count'])) | |
371 if extra: | |
372 go_names.append(go['name'] if 'name' in go else '') | |
373 else: | |
374 for go_type in go_types: | |
375 if go_type in go: | |
376 for _go in go[go_type]: | |
377 go_terms.append(_go['go_term']) | |
378 protein_counts.append(str(_go['protein_count'])) | |
379 if extra: | |
380 go_names.append(_go['name'] if 'name' in _go else '') | |
381 vals = [','.join(go_terms),','.join(protein_counts)] | |
382 if extra: | |
383 vals.append(','.join(go_names)) | |
384 go_dict[peptide] = vals | |
385 return (go_dict, go_cols) | |
386 | |
387 def write_ec_table(outfile, resp, column_order): | |
388 with open(outfile,'w') as fh: | |
389 for i,pdict in enumerate(resp): | |
390 if 'ec' in pdict: | |
391 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
392 for ec in pdict['ec']: | |
393 vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]] | |
394 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
395 | |
396 def write_go_table(outfile, resp, column_order): | |
397 with open(outfile,'w') as fh: | |
398 for i,pdict in enumerate(resp): | |
399 if 'go' in pdict: | |
400 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
401 for go in pdict['go']: | |
402 if 'go_term' in go: | |
403 vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]] | |
404 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
405 else: | |
406 for go_type in go_types: | |
407 if go_type in go: | |
408 for _go in go[go_type]: | |
409 vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]] | |
410 vals.append(go_type) | |
411 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
108 | 412 |
109 #Parse Command Line | 413 #Parse Command Line |
110 parser = optparse.OptionParser() | 414 parser = optparse.OptionParser() |
111 # unipept API choice | 415 # unipept API choice |
112 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot'], help='The unipept application: pept2lca, pept2taxa, or pept2prot' ) | 416 parser.add_option( '-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca','pept2taxa','pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'], |
417 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo' ) | |
113 # input files | 418 # input files |
114 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) | 419 parser.add_option( '-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column' ) |
115 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) | 420 parser.add_option( '-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences' ) |
116 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) | 421 parser.add_option( '-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences' ) |
117 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) | 422 parser.add_option( '-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences' ) |
118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) | 423 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) |
119 # Unipept Flags | 424 # Unipept Flags |
120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) | 425 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) |
121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) | 426 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) |
122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) | 427 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) |
428 parser.add_option( '-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component' ) | |
123 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) | 429 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) |
124 | 430 |
125 # output fields | 431 # output fields |
126 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) | 432 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) |
127 # Warn vs Error Flag | 433 # Warn vs Error Flag |
128 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) | 434 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) |
129 # output files | 435 # output files |
130 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') | 436 parser.add_option( '-J', '--json', dest='json', default=None, help='Output file path for json formatted results') |
437 parser.add_option( '-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results') | |
438 parser.add_option( '-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results') | |
439 parser.add_option( '-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results') | |
440 parser.add_option( '-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') | |
131 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') | 441 parser.add_option( '-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') |
132 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') | 442 parser.add_option( '-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') |
133 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) | 443 parser.add_option( '-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches' ) |
444 parser.add_option( '-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/' ) | |
134 # debug | 445 # debug |
446 parser.add_option( '-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST' ) | |
135 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) | 447 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging' ) |
136 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) | 448 parser.add_option( '-v', '--version', dest='version', action='store_true', default=False, help='pring version and exit' ) |
137 (options, args) = parser.parse_args() | 449 (options, args) = parser.parse_args() |
138 if options.version: | 450 if options.version: |
139 print >> sys.stdout,"%s" % version | 451 print >> sys.stdout,"%s" % version |
186 if options.names or options.json: | 498 if options.names or options.json: |
187 post_data.append(("extra","true")) | 499 post_data.append(("extra","true")) |
188 post_data.append(("names","true")) | 500 post_data.append(("names","true")) |
189 elif options.extra or options.json: | 501 elif options.extra or options.json: |
190 post_data.append(("extra","true")) | 502 post_data.append(("extra","true")) |
503 if options.domains: | |
504 post_data.append(("domains","true")) | |
191 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] | 505 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] |
506 if options.debug: print >> sys.stdout, "post_data: %s\n" % (str(post_data)) | |
192 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} | 507 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} |
193 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept | 508 ## headers = {'Accept': 'application/json'} |
194 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) | 509 url = '%s/%s' % (options.url.rstrip('/'),options.unipept) |
195 unipept_resp += json.loads( urllib2.urlopen( req ).read() ) | 510 if options.get: |
511 params = '&'.join(['%s=%s' % (i[0],i[1]) for i in post_data]) | |
512 url = '%s.json?%s' % (url,params) | |
513 req = urllib2.Request( url ) | |
514 else: | |
515 url = '%s.json' % (url) | |
516 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) | |
517 if options.debug: print >> sys.stdout, "url: %s\n" % (str(url)) | |
518 try: | |
519 resp = urllib2.urlopen( req ) | |
520 if options.debug: print >> sys.stdout,"%s %s\n" % (url,str(resp.getcode())) | |
521 if resp.getcode() == 200: | |
522 unipept_resp += json.loads( urllib2.urlopen( req ).read() ) | |
523 except Exception, e: | |
524 warn_err('HTTP Error %s\n' % (str(e)),exit_code=None) | |
196 unmatched_peptides = [] | 525 unmatched_peptides = [] |
197 peptideMatches = [] | 526 peptideMatches = [] |
198 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) | 527 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) |
199 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': | 528 if options.unipept in ['pept2prot', 'pept2taxa']: |
200 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide | 529 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide |
201 ## multiple entries per trypticPeptide for pep2prot or pep2taxa | 530 ## multiple entries per trypticPeptide for pep2prot or pep2taxa |
202 mapping = {} | 531 mapping = {} |
203 for match in unipept_resp: | 532 for match in unipept_resp: |
204 mapping.setdefault(match['peptide'],[]).append(match) | 533 mapping.setdefault(match['peptide'],[]).append(match) |
220 else: | 549 else: |
221 for key,match in keyToMatch.iteritems(): | 550 for key,match in keyToMatch.iteritems(): |
222 match['tryptic_peptide'] = match['peptide'] | 551 match['tryptic_peptide'] = match['peptide'] |
223 match['peptide'] = peptide | 552 match['peptide'] = peptide |
224 peptideMatches.append(match) | 553 peptideMatches.append(match) |
225 else: | 554 elif options.unipept in ['pept2lca', 'peptinfo']: |
226 ## should be one response per trypticPeptide for pep2lca | 555 ## should be one response per trypticPeptide for pep2lca |
227 respMap = {v['peptide']:v for v in unipept_resp} | 556 respMap = {v['peptide']:v for v in unipept_resp} |
228 ## map resp back to peptides | 557 ## map resp back to peptides |
229 for peptide in peptides: | 558 for peptide in peptides: |
230 matches = list() | 559 matches = list() |
231 for part in pepToParts[peptide]: | 560 for part in pepToParts[peptide]: |
232 if part in respMap: | 561 if part in respMap: |
562 matches.append(respMap[part]) | |
563 match = best_match(peptide,matches) | |
564 if not match: | |
565 unmatched_peptides.append(peptide) | |
566 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] | |
567 match = {'peptide' : longest_tryptic_peptide} | |
568 match['tryptic_peptide'] = match['peptide'] | |
569 match['peptide'] = peptide | |
570 peptideMatches.append(match) | |
571 else: | |
572 respMap = {v['peptide']:v for v in unipept_resp} | |
573 ## map resp back to peptides | |
574 for peptide in peptides: | |
575 matches = list() | |
576 for part in pepToParts[peptide]: | |
577 if part in respMap and 'total_protein_count' in respMap[part]: | |
233 matches.append(respMap[part]) | 578 matches.append(respMap[part]) |
234 match = best_match(peptide,matches) | 579 match = best_match(peptide,matches) |
235 if not match: | 580 if not match: |
236 unmatched_peptides.append(peptide) | 581 unmatched_peptides.append(peptide) |
237 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] | 582 longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1] |
248 with open(options.unmatched,'w') as outputFile: | 593 with open(options.unmatched,'w') as outputFile: |
249 for peptide in peptides: | 594 for peptide in peptides: |
250 if peptide in unmatched_peptides: | 595 if peptide in unmatched_peptides: |
251 outputFile.write("%s\n" % peptide) | 596 outputFile.write("%s\n" % peptide) |
252 if options.json: | 597 if options.json: |
253 if options.unipept == 'pept2prot': | 598 if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']: |
599 root = get_taxon_json(resp) | |
600 with open(options.json,'w') as outputFile: | |
601 outputFile.write(json.dumps(root)) | |
602 elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']: | |
254 with open(options.json,'w') as outputFile: | 603 with open(options.json,'w') as outputFile: |
255 outputFile.write(str(resp)) | 604 outputFile.write(str(resp)) |
256 else: | 605 if options.ec_json: |
257 found_keys = set() | 606 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: |
607 root = get_ec_json(resp) | |
608 with open(options.ec_json,'w') as outputFile: | |
609 outputFile.write(json.dumps(root)) | |
610 if options.tsv or options.csv: | |
611 rows = [] | |
612 column_names = None | |
613 if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']: | |
614 taxa = None | |
615 ec_dict = None | |
616 go_dict = None | |
617 if options.unipept in ['peptinfo']: | |
618 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) | |
619 if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: | |
620 (ec_dict,ec_cols) = get_ec_dict(resp, extra=options.extra) | |
621 if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']: | |
622 (go_dict,go_cols) = get_go_dict(resp, extra=options.extra) | |
258 for i,pdict in enumerate(resp): | 623 for i,pdict in enumerate(resp): |
259 found_keys |= set(pdict.keys()) | 624 peptide = pdict['peptide'] |
260 taxa_cols = [] | 625 total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0' |
261 for col in pept2lca_extra_column_order[-1:0:-1]: | 626 column_names = ['peptide', 'total_protein_count'] |
262 if col+'_id' in found_keys: | 627 vals = [peptide,total_protein_count] |
263 taxa_cols.append(col) | 628 if ec_dict: |
264 id_to_node = dict() | 629 vals += ec_dict[peptide] |
265 def get_node(id,name,rank,child,seq): | 630 column_names += ec_cols |
266 if id not in id_to_node: | 631 if go_dict: |
267 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } | 632 vals += go_dict[peptide] |
268 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } | 633 column_names += go_cols |
269 id_to_node[id] = node | 634 if taxa: |
270 else: | 635 vals += taxa[peptide][1:] |
271 node = id_to_node[id] | 636 column_names += taxon_cols[1:] |
272 node['data']['count'] += 1 | 637 rows.append(vals) |
273 if seq is not None and seq not in node['data']['sequences']: | 638 elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']: |
274 node['data']['sequences'].append(seq) | 639 (taxa,taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names) |
275 if child is None: | 640 column_names = taxon_cols |
276 node['data']['self_count'] += 1 | 641 rows = taxa.values() |
277 elif child['id'] not in node['kids']: | 642 for peptide,vals in taxa.iteritems(): |
278 node['kids'].append(child['id']) | 643 rows.append(vals) |
279 node['children'].append(child) | |
280 return node | |
281 root = get_node(1,'root','no rank',None,None) | |
282 for i,pdict in enumerate(resp): | |
283 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) | |
284 seq = sequence | |
285 child = None | |
286 for col in taxa_cols: | |
287 col_id = col+'_id' | |
288 if col_id in pdict and pdict.get(col_id): | |
289 col_name = col if col in found_keys else col+'_name' | |
290 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) | |
291 seq = None | |
292 if child: | |
293 get_node(1,'root','no rank',child,None) | |
294 with open(options.json,'w') as outputFile: | |
295 outputFile.write(json.dumps(root)) | |
296 if options.tsv or options.csv: | |
297 # 'pept2lca','pept2taxa','pept2prot' | |
298 found_keys = set() | |
299 results = [] | |
300 for i,pdict in enumerate(resp): | |
301 results.append(pdict) | |
302 found_keys |= set(pdict.keys()) | |
303 # print >> sys.stderr, "%s\n%s" % (pdict.keys(),found_keys) | |
304 column_names = [] | |
305 column_keys = [] | |
306 for col in column_order: | |
307 if col in found_keys: | |
308 column_names.append(col) | |
309 column_keys.append(col) | |
310 elif options.extra or options.names: | |
311 col_id = col+'_id' | |
312 col_name = col+'_name' | |
313 if options.extra: | |
314 if col_id in found_keys: | |
315 column_names.append(col_id) | |
316 column_keys.append(col_id) | |
317 if options.names: | |
318 if col_name in found_keys: | |
319 column_names.append(col) | |
320 column_keys.append(col_name) | |
321 else: | |
322 if col+'_name' in found_keys: | |
323 column_names.append(col) | |
324 column_keys.append(col+'_name') | |
325 elif col+'_id' in found_keys: | |
326 column_names.append(col) | |
327 column_keys.append(col+'_id') | |
328 # print >> sys.stderr, "%s\n%s" % (column_names,column_keys) | |
329 taxa = [] | |
330 for i,pdict in enumerate(results): | |
331 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] | |
332 if vals not in taxa: | |
333 taxa.append(vals) | |
334 if options.tsv: | 644 if options.tsv: |
335 with open(options.tsv,'w') as outputFile: | 645 with open(options.tsv,'w') as outputFile: |
336 outputFile.write("#%s\n"% '\t'.join(column_names)) | 646 if column_names: |
337 for vals in taxa: | 647 outputFile.write("#%s\n"% '\t'.join(column_names)) |
648 for vals in rows: | |
338 outputFile.write("%s\n"% '\t'.join(vals)) | 649 outputFile.write("%s\n"% '\t'.join(vals)) |
339 if options.csv: | 650 if options.csv: |
340 with open(options.csv,'w') as outputFile: | 651 with open(options.csv,'w') as outputFile: |
341 outputFile.write("%s\n"% ','.join(column_names)) | 652 if column_names: |
342 for vals in taxa: | 653 outputFile.write("%s\n"% ','.join(column_names)) |
654 for vals in rows: | |
343 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals])) | 655 outputFile.write("%s\n"% ','.join(['"%s"' % (v if v else '') for v in vals])) |
656 if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']: | |
657 column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order | |
658 write_ec_table(options.ec_tsv, resp, column_order) | |
659 if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']: | |
660 column_order = pept2go_extra_column_order if options.extra else pept2go_column_order | |
661 write_go_table(options.go_tsv, resp, column_order) | |
344 | 662 |
345 if __name__ == "__main__" : __main__() | 663 if __name__ == "__main__" : __main__() |