Mercurial > repos > galaxyp > unipept
comparison unipept.py @ 5:917fd3ebc223 draft
"planemo upload for repository http://unipept.ugent.be/apidocs commit dd464f03c32f657fc555081117da18ba4c091af6-dirty"
author | galaxyp |
---|---|
date | Thu, 30 Apr 2020 07:39:28 -0400 |
parents | 4953dcd7dd39 |
children | 9aaa46d45472 |
comparison
equal
deleted
inserted
replaced
4:4953dcd7dd39 | 5:917fd3ebc223 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 """ | 2 """ |
3 # | 3 # |
4 #------------------------------------------------------------------------------ | |
5 # University of Minnesota | |
6 # Copyright 2015, Regents of the University of Minnesota | |
7 #------------------------------------------------------------------------------ | |
8 # Author: | 4 # Author: |
9 # | 5 # |
10 # James E Johnson | 6 # James E Johnson |
11 # | 7 # |
12 #------------------------------------------------------------------------------ | 8 #------------------------------------------------------------------------------ |
13 """ | 9 """ |
14 | |
15 import json | 10 import json |
16 import logging | |
17 import optparse | 11 import optparse |
18 from optparse import OptionParser | |
19 import os | |
20 import sys | 12 import sys |
21 import re | 13 import re |
22 import urllib | 14 import urllib.error |
23 import urllib2 | 15 import urllib.parse |
24 | 16 import urllib.request |
25 """ | 17 |
26 pept2taxa json | |
27 pept2lca json | |
28 pept2prot | |
29 pept2ec ecjson ec | |
30 pept2go go | |
31 pept2funct go ec | |
32 peptinfo json ecjson ec go | |
33 | |
34 """ | |
35 | 18 |
36 try: | 19 try: |
37 import xml.etree.cElementTree as ET | 20 import xml.etree.cElementTree as ET |
38 except ImportError: | 21 except ImportError: |
39 import xml.etree.ElementTree as ET | 22 import xml.etree.ElementTree as ET |
40 | 23 |
41 def warn_err(msg,exit_code=1): | 24 |
25 def warn_err(msg, exit_code=1): | |
42 sys.stderr.write(msg) | 26 sys.stderr.write(msg) |
43 if exit_code: | 27 if exit_code: |
44 sys.exit(exit_code) | 28 sys.exit(exit_code) |
29 | |
45 | 30 |
46 go_types = ['biological process', 'molecular function', 'cellular component'] | 31 go_types = ['biological process', 'molecular function', 'cellular component'] |
32 ipr_types = ['Domain', 'Family', 'Homologous_superfamily', 'Repeat', 'Conserved_site', 'Active_site', 'Binding_site', 'PTM'] | |
47 ec_name_dict = { | 33 ec_name_dict = { |
48 '1' : 'Oxidoreductase', | 34 '1': 'Oxidoreductase', |
49 '1.1' : 'act on the CH-OH group of donors', | 35 '1.1': 'act on the CH-OH group of donors', |
50 '1.2' : 'act on the aldehyde or oxo group of donors', | 36 '1.2': 'act on the aldehyde or oxo group of donors', |
51 '1.3' : 'act on the CH-CH group of donors', | 37 '1.3': 'act on the CH-CH group of donors', |
52 '1.4' : 'act on the CH-NH2 group of donors', | 38 '1.4': 'act on the CH-NH2 group of donors', |
53 '1.5' : 'act on CH-NH group of donors', | 39 '1.5': 'act on CH-NH group of donors', |
54 '1.6' : 'act on NADH or NADPH', | 40 '1.6': 'act on NADH or NADPH', |
55 '1.7' : 'act on other nitrogenous compounds as donors', | 41 '1.7': 'act on other nitrogenous compounds as donors', |
56 '1.8' : 'act on a sulfur group of donors', | 42 '1.8': 'act on a sulfur group of donors', |
57 '1.9' : 'act on a heme group of donors', | 43 '1.9': 'act on a heme group of donors', |
58 '1.10' : 'act on diphenols and related substances as donors', | 44 '1.10': 'act on diphenols and related substances as donors', |
59 '1.11' : 'act on peroxide as an acceptor -- peroxidases', | 45 '1.11': 'act on peroxide as an acceptor -- peroxidases', |
60 '1.12' : 'act on hydrogen as a donor', | 46 '1.12': 'act on hydrogen as a donor', |
61 '1.13' : 'act on single donors with incorporation of molecular oxygen', | 47 '1.13': 'act on single donors with incorporation of molecular oxygen', |
62 '1.14' : 'act on paired donors with incorporation of molecular oxygen', | 48 '1.14': 'act on paired donors with incorporation of molecular oxygen', |
63 '1.15' : 'act on superoxide radicals as acceptors', | 49 '1.15': 'act on superoxide radicals as acceptors', |
64 '1.16' : 'oxidize metal ions', | 50 '1.16': 'oxidize metal ions', |
65 '1.17' : 'act on CH or CH2 groups', | 51 '1.17': 'act on CH or CH2 groups', |
66 '1.18' : 'act on iron-sulfur proteins as donors', | 52 '1.18': 'act on iron-sulfur proteins as donors', |
67 '1.19' : 'act on reduced flavodoxin as donor', | 53 '1.19': 'act on reduced flavodoxin as donor', |
68 '1.20' : 'act on phosphorus or arsenic as donors', | 54 '1.20': 'act on phosphorus or arsenic as donors', |
69 '1.21' : 'act on X-H and Y-H to form an X-Y bond', | 55 '1.21': 'act on X-H and Y-H to form an X-Y bond', |
70 '1.97' : 'other oxidoreductases', | 56 '1.97': 'other oxidoreductases', |
71 '2' : 'Transferase', | 57 '2': 'Transferase', |
72 '2.1' : 'transfer one-carbon groups, Methylase', | 58 '2.1': 'transfer one-carbon groups, Methylase', |
73 '2.2' : 'transfer aldehyde or ketone groups', | 59 '2.2': 'transfer aldehyde or ketone groups', |
74 '2.3' : 'acyltransferases', | 60 '2.3': 'acyltransferases', |
75 '2.4' : 'glycosyltransferases', | 61 '2.4': 'glycosyltransferases', |
76 '2.5' : 'transfer alkyl or aryl groups, other than methyl groups', | 62 '2.5': 'transfer alkyl or aryl groups, other than methyl groups', |
77 '2.6' : 'transfer nitrogenous groups', | 63 '2.6': 'transfer nitrogenous groups', |
78 '2.7' : 'transfer phosphorus-containing groups', | 64 '2.7': 'transfer phosphorus-containing groups', |
79 '2.8' : 'transfer sulfur-containing groups', | 65 '2.8': 'transfer sulfur-containing groups', |
80 '2.9' : 'transfer selenium-containing groups', | 66 '2.9': 'transfer selenium-containing groups', |
81 '3' : 'Hydrolase', | 67 '3': 'Hydrolase', |
82 '3.1' : 'act on ester bonds', | 68 '3.1': 'act on ester bonds', |
83 '3.2' : 'act on sugars - glycosylases', | 69 '3.2': 'act on sugars - glycosylases', |
84 '3.3' : 'act on ether bonds', | 70 '3.3': 'act on ether bonds', |
85 '3.4' : 'act on peptide bonds - Peptidase', | 71 '3.4': 'act on peptide bonds - Peptidase', |
86 '3.5' : 'act on carbon-nitrogen bonds, other than peptide bonds', | 72 '3.5': 'act on carbon-nitrogen bonds, other than peptide bonds', |
87 '3.6' : 'act on acid anhydrides', | 73 '3.6': 'act on acid anhydrides', |
88 '3.7' : 'act on carbon-carbon bonds', | 74 '3.7': 'act on carbon-carbon bonds', |
89 '3.8' : 'act on halide bonds', | 75 '3.8': 'act on halide bonds', |
90 '3.9' : 'act on phosphorus-nitrogen bonds', | 76 '3.9': 'act on phosphorus-nitrogen bonds', |
91 '3.10' : 'act on sulfur-nitrogen bonds', | 77 '3.10': 'act on sulfur-nitrogen bonds', |
92 '3.11' : 'act on carbon-phosphorus bonds', | 78 '3.11': 'act on carbon-phosphorus bonds', |
93 '3.12' : 'act on sulfur-sulfur bonds', | 79 '3.12': 'act on sulfur-sulfur bonds', |
94 '3.13' : 'act on carbon-sulfur bonds', | 80 '3.13': 'act on carbon-sulfur bonds', |
95 '4' : 'Lyase', | 81 '4': 'Lyase', |
96 '4.1' : 'carbon-carbon lyases', | 82 '4.1': 'carbon-carbon lyases', |
97 '4.2' : 'carbon-oxygen lyases', | 83 '4.2': 'carbon-oxygen lyases', |
98 '4.3' : 'carbon-nitrogen lyases', | 84 '4.3': 'carbon-nitrogen lyases', |
99 '4.4' : 'carbon-sulfur lyases', | 85 '4.4': 'carbon-sulfur lyases', |
100 '4.5' : 'carbon-halide lyases', | 86 '4.5': 'carbon-halide lyases', |
101 '4.6' : 'phosphorus-oxygen lyases', | 87 '4.6': 'phosphorus-oxygen lyases', |
102 '5' : 'Isomerase', | 88 '5': 'Isomerase', |
103 '5.1' : 'racemases and epimerases', | 89 '5.1': 'racemases and epimerases', |
104 '5.2' : 'cis-trans-isomerases', | 90 '5.2': 'cis-trans-isomerases', |
105 '5.3' : 'intramolecular oxidoreductases', | 91 '5.3': 'intramolecular oxidoreductases', |
106 '5.4' : 'intramolecular transferases -- mutases', | 92 '5.4': 'intramolecular transferases -- mutases', |
107 '5.5' : 'intramolecular lyases', | 93 '5.5': 'intramolecular lyases', |
108 '5.99' : 'other isomerases', | 94 '5.99': 'other isomerases', |
109 '6' : 'Ligase', | 95 '6': 'Ligase', |
110 '6.1' : 'form carbon-oxygen bonds', | 96 '6.1': 'form carbon-oxygen bonds', |
111 '6.2' : 'form carbon-sulfur bonds', | 97 '6.2': 'form carbon-sulfur bonds', |
112 '6.3' : 'form carbon-nitrogen bonds', | 98 '6.3': 'form carbon-nitrogen bonds', |
113 '6.4' : 'form carbon-carbon bonds', | 99 '6.4': 'form carbon-carbon bonds', |
114 '6.5' : 'form phosphoric ester bonds', | 100 '6.5': 'form phosphoric ester bonds', |
115 '6.6' : 'form nitrogen-metal bonds', | 101 '6.6': 'form nitrogen-metal bonds', |
116 } | 102 } |
117 pept2lca_column_order = ['peptide','taxon_rank','taxon_id','taxon_name'] | 103 pept2lca_column_order = ['peptide', 'taxon_rank', 'taxon_id', 'taxon_name'] |
118 pept2lca_extra_column_order = ['peptide','superkingdom','kingdom','subkingdom','superphylum','phylum','subphylum','superclass','class','subclass','infraclass','superorder','order','suborder','infraorder','parvorder','superfamily','family','subfamily','tribe','subtribe','genus','subgenus','species_group','species_subgroup','species','subspecies','varietas','forma' ] | 104 pept2lca_extra_column_order = ['peptide', 'superkingdom', 'kingdom', 'subkingdom', 'superphylum', 'phylum', 'subphylum', 'superclass', 'class', 'subclass', 'infraclass', 'superorder', 'order', 'suborder', 'infraorder', 'parvorder', 'superfamily', 'family', 'subfamily', 'tribe', 'subtribe', 'genus', 'subgenus', 'species_group', 'species_subgroup', 'species', 'subspecies', 'varietas', 'forma'] |
119 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] | 105 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] |
120 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] | 106 pept2prot_column_order = ['peptide', 'uniprot_id', 'taxon_id'] |
121 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] | 107 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name', 'ec_references', 'go_references', 'refseq_ids', 'refseq_protein_ids', 'insdc_ids', 'insdc_protein_ids'] |
122 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] | 108 pept2ec_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count']] |
123 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] | 109 pept2ec_extra_column_order = [['peptide', 'total_protein_count'], ['ec_number', 'protein_count', 'name']] |
124 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] | 110 pept2go_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count']] |
125 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']] | 111 pept2go_extra_column_order = [['peptide', 'total_protein_count'], ['go_term', 'protein_count', 'name']] |
126 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go'] | 112 pept2interpro_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count']] |
113 pept2interpro_extra_column_order = [['peptide', 'total_protein_count'], ['code', 'protein_count', 'type', 'name']] | |
114 pept2funct_column_order = ['peptide', 'total_protein_count', 'ec', 'go', 'ipr'] | |
115 | |
127 | 116 |
128 def __main__(): | 117 def __main__(): |
129 version = '2.0' | 118 version = '4.3' |
130 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' | 119 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' |
131 | 120 |
132 def read_tabular(filepath,col): | 121 def read_tabular(filepath, col): |
122 peptides = [] | |
123 with open(filepath) as fp: | |
124 for i, line in enumerate(fp): | |
125 if line.strip() == '' or line.startswith('#'): | |
126 continue | |
127 fields = line.rstrip('\n').split('\t') | |
128 peptide = fields[col] | |
129 if not re.match(pep_pat, peptide): | |
130 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide, i, col, filepath), exit_code=invalid_ec) | |
131 peptides.append(peptide) | |
132 return peptides | |
133 | |
134 def get_fasta_entries(fp): | |
135 name, seq = None, [] | |
136 for line in fp: | |
137 line = line.rstrip() | |
138 if line.startswith(">"): | |
139 if name: | |
140 yield (name, ''.join(seq)) | |
141 name, seq = line, [] | |
142 else: | |
143 seq.append(line) | |
144 if name: | |
145 yield (name, ''.join(seq)) | |
146 | |
147 def read_fasta(filepath): | |
148 peptides = [] | |
149 with open(filepath) as fp: | |
150 for id, peptide in get_fasta_entries(fp): | |
151 if not re.match(pep_pat, peptide): | |
152 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide, id, filepath), exit_code=invalid_ec) | |
153 peptides.append(peptide) | |
154 return peptides | |
155 | |
156 def read_mzid(fp): | |
157 peptides = [] | |
158 for event, elem in ET.iterparse(fp): | |
159 if event == 'end': | |
160 if re.search('PeptideSequence', elem.tag): | |
161 peptides.append(elem.text) | |
162 return peptides | |
163 | |
164 def read_pepxml(fp): | |
165 peptides = [] | |
166 for event, elem in ET.iterparse(fp): | |
167 if event == 'end': | |
168 if re.search('search_hit', elem.tag): | |
169 peptides.append(elem.get('peptide')) | |
170 return peptides | |
171 | |
172 def best_match(peptide, matches): | |
173 if not matches: | |
174 return None | |
175 elif len(matches) == 1: | |
176 return matches[0].copy() | |
177 elif 'taxon_rank' in matches[0]: | |
178 # find the most specific match (peptide is always the first column order field) | |
179 for col in reversed(pept2lca_extra_column_order[1:]): | |
180 col_id = col + "_id" if options.extra else col | |
181 for match in matches: | |
182 if 'taxon_rank' in match and match['taxon_rank'] == col: | |
183 return match.copy() | |
184 if col_id in match and match[col_id]: | |
185 return match.copy() | |
186 else: | |
187 return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy() | |
188 return None | |
189 | |
190 def get_taxon_json(resp): | |
191 found_keys = set() | |
192 for i, pdict in enumerate(resp): | |
193 found_keys |= set(pdict.keys()) | |
194 taxa_cols = [] | |
195 for col in pept2lca_extra_column_order[-1:0:-1]: | |
196 if col + '_id' in found_keys: | |
197 taxa_cols.append(col) | |
198 id_to_node = dict() | |
199 | |
200 def get_node(id, name, rank, child, seq): | |
201 if id not in id_to_node: | |
202 data = {'count': 0, 'self_count': 0, 'valid_taxon': 1, 'rank': rank, 'sequences': []} | |
203 node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data} | |
204 id_to_node[id] = node | |
205 else: | |
206 node = id_to_node[id] | |
207 node['data']['count'] += 1 | |
208 if seq is not None and seq not in node['data']['sequences']: | |
209 node['data']['sequences'].append(seq) | |
210 if child is None: | |
211 node['data']['self_count'] += 1 | |
212 elif child['id'] not in node['kids']: | |
213 node['kids'].append(child['id']) | |
214 node['children'].append(child) | |
215 return node | |
216 root = get_node(1, 'root', 'no rank', None, None) | |
217 for i, pdict in enumerate(resp): | |
218 sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None)) | |
219 seq = sequence | |
220 child = None | |
221 for col in taxa_cols: | |
222 col_id = col + '_id' | |
223 if col_id in pdict and pdict.get(col_id): | |
224 col_name = col if col in found_keys else col + '_name' | |
225 child = get_node(pdict.get(col_id, None), pdict.get(col_name, ''), col, child, seq) | |
226 seq = None | |
227 if child is not None: | |
228 get_node(1, 'root', 'no rank', child, None) | |
229 return root | |
230 | |
231 def get_ec_json(resp): | |
232 ecMap = dict() | |
233 for pdict in resp: | |
234 if 'ec' in pdict: | |
235 for ec in pdict['ec']: | |
236 ec_number = ec['ec_number'] | |
237 if ec_number not in ecMap: | |
238 ecMap[ec_number] = [] | |
239 ecMap[ec_number].append(pdict) | |
240 | |
241 def get_ids(ec): | |
242 ids = [] | |
243 i = len(ec) | |
244 while i >= 0: | |
245 ids.append(ec[:i]) | |
246 i = ec.rfind('.', 0, i - 1) | |
247 return ids | |
248 id_to_node = dict() | |
249 | |
250 def get_node(id, name, child, seq): | |
251 if id not in id_to_node: | |
252 data = {'count': 0, 'self_count': 0, 'sequences': []} | |
253 node = {'id': id, 'name': name, 'children': [], 'kids': [], 'data': data} | |
254 id_to_node[id] = node | |
255 else: | |
256 node = id_to_node[id] | |
257 node['data']['count'] += 1 | |
258 if seq is not None and seq not in node['data']['sequences']: | |
259 node['data']['sequences'].append(seq) | |
260 if child is None: | |
261 node['data']['self_count'] += 1 | |
262 elif child['id'] not in node['kids']: | |
263 node['kids'].append(child['id']) | |
264 node['children'].append(child) | |
265 return node | |
266 root = get_node(0, '-.-.-.-', None, None) | |
267 for i in range(1, 7): | |
268 child = get_node(str(i), '%s\n%s' % (str(i), ec_name_dict[str(i)]), None, None) | |
269 get_node(0, '-.-.-.-', child, None) | |
270 for i, pdict in enumerate(resp): | |
271 sequence = pdict.get('peptide', pdict.get('tryptic_peptide', None)) | |
272 seq = sequence | |
273 if 'ec' in pdict: | |
274 for ec in pdict['ec']: | |
275 child = None | |
276 ec_number = ec['ec_number'] | |
277 for ec_id in get_ids(ec_number): | |
278 ec_name = str(ec_id) | |
279 child = get_node(ec_id, ec_name, child, seq) | |
280 seq = None | |
281 if child: | |
282 get_node(0, '-.-.-.-', child, None) | |
283 return root | |
284 | |
285 def get_taxon_dict(resp, column_order, extra=False, names=False): | |
286 found_keys = set() | |
287 results = [] | |
288 for i, pdict in enumerate(resp): | |
289 results.append(pdict) | |
290 found_keys |= set(pdict.keys()) | |
291 # print >> sys.stderr, "%s\n%s" % (pdict.keys(), found_keys) | |
292 column_names = [] | |
293 column_keys = [] | |
294 for col in column_order: | |
295 if col in found_keys: | |
296 column_names.append(col) | |
297 column_keys.append(col) | |
298 elif names: | |
299 col_id = col + '_id' | |
300 col_name = col + '_name' | |
301 if extra: | |
302 if col_id in found_keys: | |
303 column_names.append(col_id) | |
304 column_keys.append(col_id) | |
305 if names: | |
306 if col_name in found_keys: | |
307 column_names.append(col) | |
308 column_keys.append(col_name) | |
309 else: | |
310 if col + '_name' in found_keys: | |
311 column_names.append(col) | |
312 column_keys.append(col + '_name') | |
313 elif col + '_id' in found_keys: | |
314 column_names.append(col) | |
315 column_keys.append(col + '_id') | |
316 # print >> sys.stderr, "%s\n%s" % (column_names, column_keys) | |
317 taxa = dict() # peptide: [taxonomy] | |
318 for i, pdict in enumerate(results): | |
319 peptide = pdict['peptide'] if 'peptide' in pdict else None | |
320 if peptide and peptide not in taxa: | |
321 vals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_keys] | |
322 taxa[peptide] = vals | |
323 return (taxa, column_names) | |
324 | |
325 def get_ec_dict(resp, extra=False): | |
326 ec_cols = ['ec_numbers', 'ec_protein_counts'] | |
327 if extra: | |
328 ec_cols.append('ec_names') | |
329 ec_dict = dict() | |
330 for i, pdict in enumerate(resp): | |
331 peptide = pdict['peptide'] | |
332 ec_numbers = [] | |
333 protein_counts = [] | |
334 ec_names = [] | |
335 if 'ec' in pdict: | |
336 for ec in pdict['ec']: | |
337 ec_numbers.append(ec['ec_number']) | |
338 protein_counts.append(str(ec['protein_count'])) | |
339 if extra: | |
340 ec_names.append(ec['name'] if 'name' in ec else '') | |
341 vals = [','.join(ec_numbers), ','.join(protein_counts)] | |
342 if extra: | |
343 vals.append(','.join(ec_names)) | |
344 ec_dict[peptide] = vals | |
345 return (ec_dict, ec_cols) | |
346 | |
347 def get_go_dict(resp, extra=False): | |
348 go_cols = ['go_terms', 'go_protein_counts'] | |
349 if extra: | |
350 go_cols.append('go_names') | |
351 go_dict = dict() | |
352 for i, pdict in enumerate(resp): | |
353 peptide = pdict['peptide'] | |
354 go_terms = [] | |
355 protein_counts = [] | |
356 go_names = [] | |
357 if 'go' in pdict: | |
358 for go in pdict['go']: | |
359 if 'go_term' in go: | |
360 go_terms.append(go['go_term']) | |
361 protein_counts.append(str(go['protein_count'])) | |
362 if extra: | |
363 go_names.append(go['name'] if 'name' in go else '') | |
364 else: | |
365 for go_type in go_types: | |
366 if go_type in go: | |
367 for _go in go[go_type]: | |
368 go_terms.append(_go['go_term']) | |
369 protein_counts.append(str(_go['protein_count'])) | |
370 if extra: | |
371 go_names.append(_go['name'] if 'name' in _go else '') | |
372 vals = [','.join(go_terms), ','.join(protein_counts)] | |
373 if extra: | |
374 vals.append(','.join(go_names)) | |
375 go_dict[peptide] = vals | |
376 return (go_dict, go_cols) | |
377 | |
378 def get_ipr_dict(resp, extra=False): | |
379 ipr_cols = ['ipr_codes', 'ipr_protein_counts'] | |
380 if extra: | |
381 ipr_cols.append('ipr_types') | |
382 ipr_cols.append('ipr_names') | |
383 ipr_dict = dict() | |
384 for i, pdict in enumerate(resp): | |
385 peptide = pdict['peptide'] | |
386 ipr_codes = [] | |
387 protein_counts = [] | |
388 ipr_names = [] | |
389 ipr_types = [] | |
390 if 'ipr' in pdict: | |
391 for ipr in pdict['ipr']: | |
392 if 'code' in ipr: | |
393 ipr_codes.append(ipr['code']) | |
394 protein_counts.append(str(ipr['protein_count'])) | |
395 if extra: | |
396 ipr_types.append(ipr['type'] if 'type' in ipr else '') | |
397 ipr_names.append(ipr['name'] if 'name' in ipr else '') | |
398 else: | |
399 for ipr_type in ipr_types: | |
400 if ipr_type in ipr: | |
401 for _ipr in ipr[ipr_type]: | |
402 ipr_codes.append(_ipr['code']) | |
403 protein_counts.append(str(_ipr['protein_count'])) | |
404 if extra: | |
405 ipr_types.append(ipr_type) | |
406 ipr_names.append(_ipr['name'] if 'name' in _ipr else '') | |
407 vals = [','.join(ipr_codes), ','.join(protein_counts)] | |
408 if extra: | |
409 vals.append(','.join(ipr_types)) | |
410 vals.append(','.join(ipr_names)) | |
411 ipr_dict[peptide] = vals | |
412 return (ipr_dict, ipr_cols) | |
413 | |
414 def write_ec_table(outfile, resp, column_order): | |
415 with open(outfile, 'w') as fh: | |
416 for i, pdict in enumerate(resp): | |
417 if 'ec' in pdict: | |
418 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
419 for ec in pdict['ec']: | |
420 vals = [str(ec[x]) if x in ec and ec[x] else '' for x in column_order[-1]] | |
421 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
422 | |
423 def write_go_table(outfile, resp, column_order): | |
424 with open(outfile, 'w') as fh: | |
425 for i, pdict in enumerate(resp): | |
426 if 'go' in pdict: | |
427 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
428 for go in pdict['go']: | |
429 if 'go_term' in go: | |
430 vals = [str(go[x]) if x in go and go[x] else '' for x in column_order[-1]] | |
431 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
432 else: | |
433 for go_type in go_types: | |
434 if go_type in go: | |
435 for _go in go[go_type]: | |
436 vals = [str(_go[x]) if x in _go and _go[x] else '' for x in column_order[-1]] | |
437 vals.append(go_type) | |
438 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
439 | |
440 def write_ipr_table(outfile, resp, column_order): | |
441 with open(outfile, 'w') as fh: | |
442 for i, pdict in enumerate(resp): | |
443 if 'ipr' in pdict: | |
444 tvals = [str(pdict[x]) if x in pdict and pdict[x] else '' for x in column_order[0]] | |
445 for ipr in pdict['ipr']: | |
446 if 'code' in ipr: | |
447 vals = [str(ipr[x]) if x in ipr and ipr[x] else '' for x in column_order[-1]] | |
448 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
449 else: | |
450 for ipr_type in ipr_types: | |
451 if ipr_type in ipr: | |
452 for _ipr in ipr[ipr_type]: | |
453 vals = [str(_ipr[x]) if x in _ipr and _ipr[x] else '' for x in column_order[-1]] | |
454 vals.append(ipr_type) | |
455 fh.write('%s\n' % '\t'.join(tvals + vals)) | |
456 | |
457 # Parse Command Line | |
458 parser = optparse.OptionParser() | |
459 # unipept API choice | |
460 parser.add_option('-a', '--api', dest='unipept', default='pept2lca', choices=['pept2lca', 'pept2taxa', 'pept2prot', 'pept2ec', 'pept2go', 'pept2interpro', 'pept2funct', 'peptinfo'], | |
461 help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo') | |
462 # input files | |
463 parser.add_option('-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column') | |
464 parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences') | |
465 parser.add_option('-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences') | |
466 parser.add_option('-m', '--mzid', dest='mzid', default=None, help='A mxIdentML file containing peptide sequences') | |
467 parser.add_option('-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences') | |
468 # Unipept Flags | |
469 parser.add_option('-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records') | |
470 parser.add_option('-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor') | |
471 parser.add_option('-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor') | |
472 parser.add_option('-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namaspace: biological process, molecular function, cellular component') | |
473 parser.add_option('-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request') | |
474 | |
475 # output fields | |
476 parser.add_option('-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs') | |
477 # Warn vs Error Flag | |
478 parser.add_option('-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide') | |
479 # output files | |
480 parser.add_option('-J', '--json', dest='json', default=None, help='Output file path for json formatted results') | |
481 parser.add_option('-j', '--ec_json', dest='ec_json', default=None, help='Output file path for json formatted results') | |
482 parser.add_option('-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results') | |
483 parser.add_option('-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results') | |
484 parser.add_option('-I', '--ipr_tsv', dest='ipr_tsv', default=None, help='Output file path for InterPro TAB-separated-values (.tsv) formatted results') | |
485 parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results') | |
486 parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results') | |
487 parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results') | |
488 parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches') | |
489 parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/') | |
490 # debug | |
491 parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST') | |
492 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging') | |
493 parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit') | |
494 (options, args) = parser.parse_args() | |
495 if options.version: | |
496 print('%s' % version) | |
497 sys.exit(0) | |
498 invalid_ec = 2 if options.strict else None | |
133 peptides = [] | 499 peptides = [] |
134 with open(filepath) as fp: | 500 # Get peptide sequences |
135 for i,line in enumerate(fp): | 501 if options.mzid: |
136 if line.strip() == '' or line.startswith('#'): | 502 peptides += read_mzid(options.mzid) |
137 continue | 503 if options.pepxml: |
138 fields = line.rstrip('\n').split('\t') | 504 peptides += read_pepxml(options.pepxml) |
139 peptide = fields[col] | 505 if options.tabular: |
140 if not re.match(pep_pat,peptide): | 506 peptides += read_tabular(options.tabular, options.column) |
141 warn_err('"%s" is not a peptide (line %d column %d of tabular file: %s)\n' % (peptide,i,col,filepath),exit_code=invalid_ec) | 507 if options.fasta: |
142 peptides.append(peptide) | 508 peptides += read_fasta(options.fasta) |
143 return peptides | 509 if args and len(args) > 0: |
144 | 510 for i, peptide in enumerate(args): |
145 def get_fasta_entries(fp): | 511 if not re.match(pep_pat, peptide): |
146 name, seq = None, [] | 512 warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec) |
147 for line in fp: | 513 peptides.append(peptide) |
148 line = line.rstrip() | 514 if len(peptides) < 1: |
149 if line.startswith(">"): | 515 warn_err("No peptides input!", exit_code=1) |
150 if name: yield (name, ''.join(seq)) | 516 column_order = pept2lca_column_order |
151 name, seq = line, [] | 517 if options.unipept == 'pept2prot': |
152 else: | 518 column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order |
153 seq.append(line) | |
154 if name: yield (name, ''.join(seq)) | |
155 | |
156 def read_fasta(filepath): | |
157 peptides = [] | |
158 with open(filepath) as fp: | |
159 for id, peptide in get_fasta_entries(fp): | |
160 if not re.match(pep_pat,peptide): | |
161 warn_err('"%s" is not a peptide (id %s of fasta file: %s)\n' % (peptide,id,filepath),exit_code=invalid_ec) | |
162 peptides.append(peptide) | |
163 return peptides | |
164 | |
165 def read_mzid(fp): | |
166 peptides = [] | |
167 for event, elem in ET.iterparse(fp): | |
168 if event == 'end': | |
169 if re.search('PeptideSequence',elem.tag): | |
170 peptides.append(elem.text) | |
171 return peptides | |
172 | |
173 def read_pepxml(fp): | |
174 peptides = [] | |
175 for event, elem in ET.iterparse(fp): | |
176 if event == 'end': | |
177 if re.search('search_hit',elem.tag): | |
178 peptides.append(elem.get('peptide')) | |
179 return peptides | |
180 | |
181 def best_match(peptide,matches): | |
182 if not matches: | |
183 return None | |
184 elif len(matches) == 1: | |
185 return matches[0].copy() | |
186 elif 'taxon_rank' in matches[0]: | |
187 # find the most specific match (peptide is always the first column order field) | |
188 for col in reversed(pept2lca_extra_column_order[1:]): | |
189 col_id = col+"_id" if options.extra else col | |
190 for match in matches: | |
191 if 'taxon_rank' in match and match['taxon_rank'] == col: | |
192 return match.copy() | |
193 if col_id in match and match[col_id]: | |
194 return match.copy() | |
195 else: | 519 else: |
196 return sorted(matches, key=lambda x: len(x['peptide']))[-1].copy() | 520 if options.extra or options.names: |
197 return None | 521 column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order |
198 | 522 else: |
def get_taxon_json(resp):
    """Build a unipept treeview-style taxonomy tree from response entries.

    Walks each entry's lineage columns (most specific rank first, taken
    from the module-level pept2lca_extra_column_order) and links the taxa
    into a tree rooted at taxon 1 ('root').  Each node records how many
    entries passed through it ('count'), how many ended at it
    ('self_count'), and the peptide sequences attributed to it.

    :param resp: list of unipept response dicts (pept2lca/pept2taxa/peptinfo)
    :return: the root node of the tree (nested dicts, JSON-serializable)
    """
    # Collect every key present in any response entry so we know which
    # lineage columns actually occur in this result set.
    found_keys = set()
    for i,pdict in enumerate(resp):
        found_keys |= set(pdict.keys())
    # Lineage columns to walk, most specific rank first.
    taxa_cols = []
    for col in pept2lca_extra_column_order[-1:0:-1]:
        if col+'_id' in found_keys:
            taxa_cols.append(col)
    id_to_node = dict()
    def get_node(id,name,rank,child,seq):
        # Fetch or create the node for this taxon id, bump its counts, and
        # attach child/sequence bookkeeping.
        if id not in id_to_node:
            data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] }
            node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
            id_to_node[id] = node
        else:
            node = id_to_node[id]
        node['data']['count'] += 1
        if seq is not None and seq not in node['data']['sequences']:
            node['data']['sequences'].append(seq)
        if child is None:
            # Leaf visit: this taxon ends the lineage for one entry.
            node['data']['self_count'] += 1
        elif child['id'] not in node['kids']:
            # 'kids' tracks already-linked child ids to avoid duplicates.
            node['kids'].append(child['id'])
            node['children'].append(child)
        return node
    root = get_node(1,'root','no rank',None,None)
    for i,pdict in enumerate(resp):
        sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
        seq = sequence
        child = None
        # Walk from the most specific known rank upward, chaining each taxon
        # to the child assembled so far; the sequence is credited only once,
        # to the most specific node.
        for col in taxa_cols:
            col_id = col+'_id'
            if col_id in pdict and pdict.get(col_id):
                col_name = col if col in found_keys else col+'_name'
                child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
                seq = None
        if child:
            # Hang the assembled lineage under the root.
            get_node(1,'root','no rank',child,None)
    return root
238 | 562 data = urllib.parse.urlencode(post_data).encode() |
def get_ec_json(resp):
    """Build a unipept treeview-style EC-number tree from response entries.

    EC numbers are hierarchical (e.g. 1.2.3.4 under 1.2.3 under 1.2 under 1);
    each entry's EC annotations are linked into a tree rooted at the
    synthetic node '-.-.-.-'.  The six top-level EC classes are pre-seeded
    with their names from the module-level ec_name_dict.

    :param resp: list of unipept response dicts carrying an 'ec' list
    :return: the root node of the tree (nested dicts, JSON-serializable)
    """
    # Group response entries by EC number (kept for bookkeeping; the tree
    # below is built directly from the entries).
    ecMap = dict()
    for pdict in resp:
        if 'ec' in pdict:
            for ec in pdict['ec']:
                ec_number = ec['ec_number']
                if ec_number not in ecMap:
                    ecMap[ec_number] = []
                ecMap[ec_number].append(pdict)
    def get_ids(ec):
        # Return the EC prefixes from the full number down to the top class,
        # e.g. '1.2.3.4' -> ['1.2.3.4', '1.2.3', '1.2', '1'].
        ids = []
        i = len(ec)
        while i >= 0:
            ids.append(ec[:i])
            i = ec.rfind('.',0,i - 1)
        return ids
    id_to_node = dict()
    def get_node(id,name,child,seq):
        # Fetch or create the node for this EC id, bump its counts, and
        # attach child/sequence bookkeeping.
        if id not in id_to_node:
            data = {'count' : 0, 'self_count' : 0, 'sequences' : [] }
            node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
            id_to_node[id] = node
        else:
            node = id_to_node[id]
        node['data']['count'] += 1
        if seq is not None and seq not in node['data']['sequences']:
            node['data']['sequences'].append(seq)
        if child is None:
            node['data']['self_count'] += 1
        elif child['id'] not in node['kids']:
            node['kids'].append(child['id'])
            node['children'].append(child)
        return node
    root = get_node(0,'-.-.-.-',None,None)
    # Seed the six EC top-level classes under the root.
    for i in range(1,7):
        child = get_node(str(i),'%s\n%s' %(str(i), ec_name_dict[str(i)] ),None,None)
        get_node(0,'-.-.-.-',child,None)
    for i,pdict in enumerate(resp):
        sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
        seq = sequence
        if 'ec' in pdict:
            for ec in pdict['ec']:
                child = None
                protein_count = ec['protein_count']
                ec_number = ec['ec_number']
                # Chain the EC prefixes from most to least specific; the
                # sequence is credited only once, to the most specific node.
                for ec_id in get_ids(ec_number):
                    ec_name = str(ec_id)
                    child = get_node(ec_id,ec_name,child,seq)
                    seq = None
                if child:
                    get_node(0,'-.-.-.-',child,None)
    return root
293 | |
def get_taxon_dict(resp, column_order, extra=False, names=False):
    """Tabulate unipept taxonomy responses, one row per peptide.

    Determines which of the requested columns actually occur in the
    responses (directly, or via their '_id'/'_name' variants depending on
    the *extra*/*names* flags), then builds one row of string values per
    distinct peptide (first occurrence wins).

    :param resp: list of unipept response dicts keyed by 'peptide'
    :param column_order: ordered list of column base names to report
    :param extra: report '<col>_id' columns when set
    :param names: report name columns for ranks missing a direct key
    :return: ({peptide: [values...]}, column_names)
    """
    results = list(resp)
    observed = set()
    for entry in results:
        observed |= set(entry.keys())
    column_names = []
    column_keys = []
    for col in column_order:
        id_key = col + '_id'
        name_key = col + '_name'
        if col in observed:
            column_names.append(col)
            column_keys.append(col)
        elif names:
            if extra:
                # report the raw id column, and additionally the name
                # column (labelled with the base name) when available
                if id_key in observed:
                    column_names.append(id_key)
                    column_keys.append(id_key)
                if name_key in observed:
                    column_names.append(col)
                    column_keys.append(name_key)
            else:
                # prefer the name variant, fall back to the id variant
                if name_key in observed:
                    column_names.append(col)
                    column_keys.append(name_key)
                elif id_key in observed:
                    column_names.append(col)
                    column_keys.append(id_key)
    taxa = dict()  # peptide -> row of string values
    for entry in results:
        pep = entry.get('peptide')
        if pep and pep not in taxa:
            taxa[pep] = [str(entry[k]) if entry.get(k) else '' for k in column_keys]
    return (taxa, column_names)
333 | |
def get_ec_dict(resp, extra=False):
    """Flatten the 'ec' annotations of each unipept response entry.

    :param resp: list of unipept response dicts keyed by 'peptide'
    :param extra: also collect EC names when True
    :return: ({peptide: [ec_numbers_csv, protein_counts_csv[, names_csv]]},
              list of column names)
    """
    columns = ['ec_numbers', 'ec_protein_counts']
    if extra:
        columns.append('ec_names')
    table = dict()
    for entry in resp:
        numbers, counts, labels = [], [], []
        for annotation in entry.get('ec', []):
            numbers.append(annotation['ec_number'])
            counts.append(str(annotation['protein_count']))
            if extra:
                labels.append(annotation.get('name', ''))
        row = [','.join(numbers), ','.join(counts)]
        if extra:
            row.append(','.join(labels))
        table[entry['peptide']] = row
    return (table, columns)
355 | |
def get_go_dict(resp, extra=False):
    """Flatten the 'go' annotations of each unipept response entry.

    Handles both the flat form ([{'go_term': ...}, ...]) and the form
    grouped by GO namespace (keys from the module-level go_types list).

    :param resp: list of unipept response dicts keyed by 'peptide'
    :param extra: also collect GO term names when True
    :return: ({peptide: [go_terms_csv, protein_counts_csv[, names_csv]]},
              list of column names)
    """
    columns = ['go_terms', 'go_protein_counts']
    if extra:
        columns.append('go_names')
    table = dict()
    for entry in resp:
        terms, counts, labels = [], [], []
        for annotation in entry.get('go', []):
            if 'go_term' in annotation:
                terms.append(annotation['go_term'])
                counts.append(str(annotation['protein_count']))
                if extra:
                    labels.append(annotation.get('name', ''))
            else:
                # grouped by namespace: biological process, molecular
                # function, cellular component
                for namespace in go_types:
                    for item in annotation.get(namespace, []):
                        terms.append(item['go_term'])
                        counts.append(str(item['protein_count']))
                        if extra:
                            labels.append(item.get('name', ''))
        row = [','.join(terms), ','.join(counts)]
        if extra:
            row.append(','.join(labels))
        table[entry['peptide']] = row
    return (table, columns)
386 | |
def write_ec_table(outfile, resp, column_order):
    """Write one TSV row per EC annotation of each response entry.

    :param outfile: output file path
    :param resp: list of unipept response dicts
    :param column_order: pair of key lists — peptide-level keys first,
        EC-annotation-level keys last
    """
    with open(outfile, 'w') as fh:
        for entry in resp:
            if 'ec' not in entry:
                continue
            head = [str(entry[k]) if entry.get(k) else '' for k in column_order[0]]
            for annotation in entry['ec']:
                tail = [str(annotation[k]) if annotation.get(k) else '' for k in column_order[-1]]
                fh.write('%s\n' % '\t'.join(head + tail))
395 | |
def write_go_table(outfile, resp, column_order):
    """Write one TSV row per GO annotation of each response entry.

    Flat annotations yield peptide-level values plus GO-level values;
    namespace-grouped annotations (keys from the module-level go_types
    list) get the namespace appended as a final column.

    :param outfile: output file path
    :param resp: list of unipept response dicts
    :param column_order: pair of key lists — peptide-level keys first,
        GO-annotation-level keys last
    """
    with open(outfile, 'w') as fh:
        for entry in resp:
            if 'go' not in entry:
                continue
            head = [str(entry[k]) if entry.get(k) else '' for k in column_order[0]]
            for annotation in entry['go']:
                if 'go_term' in annotation:
                    tail = [str(annotation[k]) if annotation.get(k) else '' for k in column_order[-1]]
                    fh.write('%s\n' % '\t'.join(head + tail))
                else:
                    for namespace in go_types:
                        if namespace in annotation:
                            for item in annotation[namespace]:
                                tail = [str(item[k]) if item.get(k) else '' for k in column_order[-1]]
                                tail.append(namespace)
                                fh.write('%s\n' % '\t'.join(head + tail))
# Parse Command Line.
# NOTE: fixes the Python-2 `print >> sys.stdout` statement (a SyntaxError
# under the Python 3 this file targets — see the urllib.request imports at
# the top of the file) and several typos in user-facing help strings.
parser = optparse.OptionParser()
# unipept API choice
parser.add_option('-a', '--api', dest='unipept', default='pept2lca',
                  choices=['pept2lca', 'pept2taxa', 'pept2prot', 'pept2ec', 'pept2go', 'pept2funct', 'peptinfo'],
                  help='The unipept application: pept2lca, pept2taxa, pept2prot, pept2ec, pept2go, pept2funct, or peptinfo')
# input files
parser.add_option('-t', '--tabular', dest='tabular', default=None, help='A tabular file that contains a peptide column')
parser.add_option('-c', '--column', dest='column', type='int', default=0, help='The column (zero-based) in the tabular file that contains peptide sequences')
parser.add_option('-f', '--fasta', dest='fasta', default=None, help='A fasta file containing peptide sequences')
parser.add_option('-m', '--mzid', dest='mzid', default=None, help='A mzIdentML file containing peptide sequences')
parser.add_option('-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences')
# unipept flags
parser.add_option('-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records')
parser.add_option('-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor')
parser.add_option('-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor')
parser.add_option('-D', '--domains', dest='domains', action='store_true', default=False, help='group response by GO namespace: biological process, molecular function, cellular component')
parser.add_option('-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request')
# output fields
parser.add_option('-A', '--allfields', dest='allfields', action='store_true', default=False, help='include fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs')
# warn vs error flag
parser.add_option('-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide')
# output files
parser.add_option('-J', '--json', dest='json', default=None, help='Output file path for json formatted results')
parser.add_option('-j', '--ec_json', dest='ec_json', default=None, help='Output file path for EC json formatted results')
parser.add_option('-E', '--ec_tsv', dest='ec_tsv', default=None, help='Output file path for EC TAB-separated-values (.tsv) formatted results')
parser.add_option('-G', '--go_tsv', dest='go_tsv', default=None, help='Output file path for GO TAB-separated-values (.tsv) formatted results')
parser.add_option('-L', '--lineage_tsv', dest='lineage_tsv', default=None, help='Output file path for Lineage TAB-separated-values (.tsv) formatted results')
parser.add_option('-T', '--tsv', dest='tsv', default=None, help='Output file path for TAB-separated-values (.tsv) formatted results')
parser.add_option('-C', '--csv', dest='csv', default=None, help='Output file path for Comma-separated-values (.csv) formatted results')
parser.add_option('-U', '--unmatched', dest='unmatched', default=None, help='Output file path for peptide with no matches')
parser.add_option('-u', '--url', dest='url', default='http://api.unipept.ugent.be/api/v1/', help='unipept url http://api.unipept.ugent.be/api/v1/')
# debug
parser.add_option('-g', '--get', dest='get', action='store_true', default=False, help='Use GET instead of POST')
parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turning on debugging')
parser.add_option('-v', '--version', dest='version', action='store_true', default=False, help='print version and exit')
(options, args) = parser.parse_args()
if options.version:
    # `version` is defined earlier in this file
    print(version)
    sys.exit(0)
# On invalid peptides: exit code 2 under --strict, warn-only otherwise.
invalid_ec = 2 if options.strict else None
peptides = []
# Gather peptide sequences from any of the supported inputs.
if options.mzid:
    peptides += read_mzid(options.mzid)
if options.pepxml:
    peptides += read_pepxml(options.pepxml)
if options.tabular:
    peptides += read_tabular(options.tabular, options.column)
if options.fasta:
    peptides += read_fasta(options.fasta)
if args and len(args) > 0:
    for i, peptide in enumerate(args):
        if not re.match(pep_pat, peptide):
            warn_err('"%s" is not a peptide (arg %d)\n' % (peptide, i), exit_code=invalid_ec)
        peptides.append(peptide)
if len(peptides) < 1:
    warn_err("No peptides input!", exit_code=1)
# Select the output columns for the chosen unipept application.
column_order = pept2lca_column_order
if options.unipept == 'pept2prot':
    column_order = pept2prot_extra_column_order if options.extra else pept2prot_column_order
else:
    if options.extra or options.names:
        column_order = pept2lca_all_column_order if options.allfields else pept2lca_extra_column_order
    else:
        column_order = pept2lca_column_order
# Map each input peptide to its tryptic fragments (cut after K/R when not
# followed by P).
pepToParts = {p: re.split('\n', re.sub(r'(?<=[RK])(?=[^P])', '\n', p)) for p in peptides}
partToPeps = {}
for peptide, parts in pepToParts.items():  # py3: items(), not iteritems()
    if options.debug:
        print("peptide: %s\ttryptic: %s\n" % (peptide, parts), file=sys.stderr)
    for part in parts:
        if len(part) > 50:
            warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide, len(part), part), exit_code=None)
        # unipept only accepts tryptic peptides of length 5..50
        if 5 <= len(part) <= 50:
            partToPeps.setdefault(part, []).append(peptide)
# list() so the batching code below can slice it (py3 dict views can't be sliced)
trypticPeptides = list(partToPeps.keys())
# Query the unipept API in batches of at most max_request tryptic peptides.
# Fixes vs the old revision: urllib2 -> urllib.request (py3), POST data
# encoded to bytes, py3 except syntax, debug output to stderr, and the
# response body is read from the connection we already opened instead of
# calling urlopen() a second time (the old code issued every request twice).
unipept_resp = []
idx = list(range(0, len(trypticPeptides), options.max_request))
idx.append(len(trypticPeptides))
for i in range(len(idx) - 1):
    post_data = []
    if options.equate_il:
        post_data.append(('equate_il', 'true'))
    if options.names or options.json:
        post_data.append(('extra', 'true'))
        post_data.append(('names', 'true'))
    elif options.extra or options.json:
        post_data.append(('extra', 'true'))
    if options.domains:
        post_data.append(('domains', 'true'))
    post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i + 1]]]
    if options.debug:
        print('post_data: %s\n' % (str(post_data)), file=sys.stderr)
    headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
    url = '%s/%s' % (options.url.rstrip('/'), options.unipept)
    if options.get:
        params = '&'.join(["%s=%s" % (kv[0], kv[1]) for kv in post_data])
        url = '%s.json?%s' % (url, params)
        req = urllib.request.Request(url)
    else:
        url = '%s.json' % (url)
        req = urllib.request.Request(url, headers=headers,
                                     data=urllib.parse.urlencode(post_data).encode(),
                                     method='POST')
    if options.debug:
        print("url: %s\n" % (str(url)), file=sys.stderr)
    try:
        resp = urllib.request.urlopen(req)
        rdata = resp.read()
        if options.debug:
            print("%s %s\n" % (url, str(resp.getcode())), file=sys.stderr)
        if resp.getcode() == 200:
            unipept_resp += json.loads(rdata)
    except Exception as e:
        # best-effort: report the failed batch and continue with the rest
        warn_err('HTTP Error %s\n' % (str(e)), exit_code=None)
# Map the per-tryptic-fragment responses back onto the input peptides.
# (py3 fixes: iteritems -> items, debug prints to stderr.)
unmatched_peptides = []
peptideMatches = []
if options.debug:
    print("unipept response: %s\n" % str(unipept_resp), file=sys.stderr)
if options.unipept in ['pept2prot', 'pept2taxa']:
    # Multiple entries per tryptic peptide; keep only one per uniprot_id
    # (pept2prot) or taxon_id (pept2taxa) per input peptide.
    dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id'
    mapping = {}
    for match in unipept_resp:
        mapping.setdefault(match['peptide'], []).append(match)
    for peptide in peptides:
        # Keep only the matches common to every tryptic part of the peptide.
        keyToMatch = None
        for part in pepToParts[peptide]:
            if part in mapping:
                temp = {match[dupkey]: match for match in mapping[part]}
                if keyToMatch:
                    for k in set(keyToMatch.keys()) - set(temp.keys()):
                        del keyToMatch[k]
                else:
                    keyToMatch = temp
        if not keyToMatch:
            unmatched_peptides.append(peptide)
        else:
            for key, match in keyToMatch.items():
                match['tryptic_peptide'] = match['peptide']
                match['peptide'] = peptide
                peptideMatches.append(match)
elif options.unipept in ['pept2lca', 'peptinfo']:
    # One response per tryptic peptide: pick the best match, falling back
    # to a stub for the longest tryptic fragment when nothing matched.
    respMap = {v['peptide']: v for v in unipept_resp}
    for peptide in peptides:
        matches = [respMap[part] for part in pepToParts[peptide] if part in respMap]
        match = best_match(peptide, matches)
        if not match:
            unmatched_peptides.append(peptide)
            longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
            match = {'peptide': longest_tryptic_peptide}
        match['tryptic_peptide'] = match['peptide']
        match['peptide'] = peptide
        peptideMatches.append(match)
else:
    # Functional endpoints: only responses that actually carry annotation
    # counts are considered matches.
    respMap = {v['peptide']: v for v in unipept_resp}
    for peptide in peptides:
        matches = [respMap[part] for part in pepToParts[peptide]
                   if part in respMap and 'total_protein_count' in respMap[part]]
        match = best_match(peptide, matches)
        if not match:
            unmatched_peptides.append(peptide)
            longest_tryptic_peptide = sorted(pepToParts[peptide], key=lambda x: len(x))[-1]
            match = {'peptide': longest_tryptic_peptide}
        match['tryptic_peptide'] = match['peptide']
        match['peptide'] = peptide
        peptideMatches.append(match)
resp = peptideMatches
if options.debug:
    print("\nmapped response: %s\n" % str(resp), file=sys.stderr)
# Output results.
# Fixes vs the old revision: py3 print; and for the pept2lca/pept2taxa/
# pept2prot table the old code set rows = taxa.values() and THEN appended
# every row again, writing each row twice (and dict_values has no append
# in py3) — rows are now built exactly once.
if not (options.unmatched or options.json or options.tsv or options.csv):
    print(str(resp))
if options.unmatched:
    with open(options.unmatched, 'w') as outputFile:
        for peptide in peptides:
            if peptide in unmatched_peptides:
                outputFile.write("%s\n" % peptide)
if options.json:
    if options.unipept in ['pept2lca', 'pept2taxa', 'peptinfo']:
        root = get_taxon_json(resp)
        with open(options.json, 'w') as outputFile:
            outputFile.write(json.dumps(root))
    elif options.unipept in ['pept2prot', 'pept2ec', 'pept2go', 'pept2funct']:
        with open(options.json, 'w') as outputFile:
            outputFile.write(str(resp))
if options.ec_json:
    if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
        root = get_ec_json(resp)
        with open(options.ec_json, 'w') as outputFile:
            outputFile.write(json.dumps(root))
if options.tsv or options.csv:
    rows = []
    column_names = None
    if options.unipept in ['pept2ec', 'pept2go', 'pept2funct', 'peptinfo']:
        taxa = None
        ec_dict = None
        go_dict = None
        if options.unipept in ['peptinfo']:
            (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
        if options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
            (ec_dict, ec_cols) = get_ec_dict(resp, extra=options.extra)
        if options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
            (go_dict, go_cols) = get_go_dict(resp, extra=options.extra)
        for pdict in resp:
            peptide = pdict['peptide']
            total_protein_count = str(pdict['total_protein_count']) if 'total_protein_count' in pdict else '0'
            column_names = ['peptide', 'total_protein_count']
            vals = [peptide, total_protein_count]
            if ec_dict:
                vals += ec_dict[peptide]
                column_names += ec_cols
            if go_dict:
                vals += go_dict[peptide]
                column_names += go_cols
            if taxa:
                vals += taxa[peptide][1:]
                column_names += taxon_cols[1:]
            rows.append(vals)
    elif options.unipept in ['pept2lca', 'pept2taxa', 'pept2prot']:
        (taxa, taxon_cols) = get_taxon_dict(resp, column_order, extra=options.extra, names=options.names)
        column_names = taxon_cols
        # one row per peptide, exactly once
        rows = list(taxa.values())
    if options.tsv:
        with open(options.tsv, 'w') as outputFile:
            if column_names:
                outputFile.write("#%s\n" % '\t'.join(column_names))
            for vals in rows:
                outputFile.write("%s\n" % '\t'.join(vals))
    if options.csv:
        with open(options.csv, 'w') as outputFile:
            if column_names:
                outputFile.write("%s\n" % ','.join(column_names))
            for vals in rows:
                outputFile.write("%s\n" % ','.join(['"%s"' % (v if v else '') for v in vals]))
if options.ec_tsv and options.unipept in ['pept2ec', 'pept2funct', 'peptinfo']:
    column_order = pept2ec_extra_column_order if options.extra else pept2ec_column_order
    write_ec_table(options.ec_tsv, resp, column_order)
if options.go_tsv and options.unipept in ['pept2go', 'pept2funct', 'peptinfo']:
    column_order = pept2go_extra_column_order if options.extra else pept2go_column_order
    write_go_table(options.go_tsv, resp, column_order)


if __name__ == "__main__":
    __main__()