Mercurial > repos > galaxyp > unipept
comparison unipept.py @ 3:34758ab8aaa4 draft
Uploaded
author | galaxyp |
---|---|
date | Mon, 20 Feb 2017 10:32:03 -0500 |
parents | 0c1ee95282fa |
children | 4953dcd7dd39 |
comparison
equal
deleted
inserted
replaced
2:503ab8a39006 | 3:34758ab8aaa4 |
---|---|
36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] | 36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] |
37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] | 37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] |
38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] | 38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] |
39 | 39 |
40 def __main__(): | 40 def __main__(): |
41 version = '1.1' | 41 version = '2.0' |
42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' | 42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' |
43 | 43 |
44 def read_tabular(filepath,col): | 44 def read_tabular(filepath,col): |
45 peptides = [] | 45 peptides = [] |
46 with open(filepath) as fp: | 46 with open(filepath) as fp: |
118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) | 118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) |
119 # Unipept Flags | 119 # Unipept Flags |
120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) | 120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) |
121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) | 121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) |
122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) | 122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) |
123 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' ) | |
124 | |
123 # output fields | 125 # output fields |
124 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) | 126 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) |
125 # Warn vs Error Flag | 127 # Warn vs Error Flag |
126 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) | 128 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) |
127 # output files | 129 # output files |
172 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None) | 174 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None) |
173 if 5 <= len(part) <= 50: | 175 if 5 <= len(part) <= 50: |
174 partToPeps.setdefault(part,[]).append(peptide) | 176 partToPeps.setdefault(part,[]).append(peptide) |
175 trypticPeptides = partToPeps.keys() | 177 trypticPeptides = partToPeps.keys() |
176 ## unipept | 178 ## unipept |
177 post_data = [] | 179 unipept_resp = [] |
178 if options.equate_il: | 180 idx = range(0,len(trypticPeptides),options.max_request) |
179 post_data.append(("equate_il","true")) | 181 idx.append(len(trypticPeptides)) |
180 if options.names: | 182 for i in range(len(idx)-1): |
181 post_data.append(("extra","true")) | 183 post_data = [] |
182 post_data.append(("names","true")) | 184 if options.equate_il: |
183 elif options.extra: | 185 post_data.append(("equate_il","true")) |
184 post_data.append(("extra","true")) | 186 if options.names or options.json: |
185 post_data += [('input[]', x) for x in trypticPeptides] | 187 post_data.append(("extra","true")) |
186 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} | 188 post_data.append(("names","true")) |
187 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept | 189 elif options.extra or options.json: |
188 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) | 190 post_data.append(("extra","true")) |
189 unipept_resp = json.loads( urllib2.urlopen( req ).read() ) | 191 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]] |
192 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} | |
193 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept | |
194 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) | |
195 unipept_resp += json.loads( urllib2.urlopen( req ).read() ) | |
190 unmatched_peptides = [] | 196 unmatched_peptides = [] |
191 peptideMatches = [] | 197 peptideMatches = [] |
192 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) | 198 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) |
193 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': | 199 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': |
194 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide | 200 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide |
242 with open(options.unmatched,'w') as outputFile: | 248 with open(options.unmatched,'w') as outputFile: |
243 for peptide in peptides: | 249 for peptide in peptides: |
244 if peptide in unmatched_peptides: | 250 if peptide in unmatched_peptides: |
245 outputFile.write("%s\n" % peptide) | 251 outputFile.write("%s\n" % peptide) |
246 if options.json: | 252 if options.json: |
247 with open(options.json,'w') as outputFile: | 253 if options.unipept == 'pept2prot': |
248 outputFile.write(str(resp)) | 254 with open(options.json,'w') as outputFile: |
255 outputFile.write(str(resp)) | |
256 else: | |
257 found_keys = set() | |
258 for i,pdict in enumerate(resp): | |
259 found_keys |= set(pdict.keys()) | |
260 taxa_cols = [] | |
261 for col in pept2lca_extra_column_order[-1:0:-1]: | |
262 if col+'_id' in found_keys: | |
263 taxa_cols.append(col) | |
264 id_to_node = dict() | |
265 def get_node(id,name,rank,child,seq): | |
266 if id not in id_to_node: | |
267 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] } | |
268 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data } | |
269 id_to_node[id] = node | |
270 else: | |
271 node = id_to_node[id] | |
272 node['data']['count'] += 1 | |
273 if seq is not None and seq not in node['data']['sequences']: | |
274 node['data']['sequences'].append(seq) | |
275 if child is None: | |
276 node['data']['self_count'] += 1 | |
277 elif child['id'] not in node['kids']: | |
278 node['kids'].append(child['id']) | |
279 node['children'].append(child) | |
280 return node | |
281 root = get_node(1,'root','no rank',None,None) | |
282 for i,pdict in enumerate(resp): | |
283 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None)) | |
284 seq = sequence | |
285 child = None | |
286 for col in taxa_cols: | |
287 col_id = col+'_id' | |
288 if col_id in pdict and pdict.get(col_id): | |
289 col_name = col if col in found_keys else col+'_name' | |
290 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq) | |
291 seq = None | |
292 if child: | |
293 get_node(1,'root','no rank',child,None) | |
294 with open(options.json,'w') as outputFile: | |
295 outputFile.write(json.dumps(root)) | |
249 if options.tsv or options.csv: | 296 if options.tsv or options.csv: |
250 # 'pept2lca','pept2taxa','pept2prot' | 297 # 'pept2lca','pept2taxa','pept2prot' |
251 found_keys = set() | 298 found_keys = set() |
252 results = [] | 299 results = [] |
253 for i,pdict in enumerate(resp): | 300 for i,pdict in enumerate(resp): |