comparison unipept.py @ 3:34758ab8aaa4 draft

Uploaded
author galaxyp
date Mon, 20 Feb 2017 10:32:03 -0500
parents 0c1ee95282fa
children 4953dcd7dd39
comparison
equal deleted inserted replaced
2:503ab8a39006 3:34758ab8aaa4
36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:] 36 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id'] 37 pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids'] 38 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
39 39
40 def __main__(): 40 def __main__():
41 version = '1.1' 41 version = '2.0'
42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$' 42 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
43 43
44 def read_tabular(filepath,col): 44 def read_tabular(filepath,col):
45 peptides = [] 45 peptides = []
46 with open(filepath) as fp: 46 with open(filepath) as fp:
118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' ) 118 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
119 # Unipept Flags 119 # Unipept Flags
120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' ) 120 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' ) 121 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' ) 122 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
123 parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
124
123 # output fields 125 # output fields
124 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' ) 126 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
125 # Warn vs Error Flag 127 # Warn vs Error Flag
126 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' ) 128 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
127 # output files 129 # output files
172 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None) 174 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
173 if 5 <= len(part) <= 50: 175 if 5 <= len(part) <= 50:
174 partToPeps.setdefault(part,[]).append(peptide) 176 partToPeps.setdefault(part,[]).append(peptide)
175 trypticPeptides = partToPeps.keys() 177 trypticPeptides = partToPeps.keys()
176 ## unipept 178 ## unipept
177 post_data = [] 179 unipept_resp = []
178 if options.equate_il: 180 idx = range(0,len(trypticPeptides),options.max_request)
179 post_data.append(("equate_il","true")) 181 idx.append(len(trypticPeptides))
180 if options.names: 182 for i in range(len(idx)-1):
181 post_data.append(("extra","true")) 183 post_data = []
182 post_data.append(("names","true")) 184 if options.equate_il:
183 elif options.extra: 185 post_data.append(("equate_il","true"))
184 post_data.append(("extra","true")) 186 if options.names or options.json:
185 post_data += [('input[]', x) for x in trypticPeptides] 187 post_data.append(("extra","true"))
186 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'} 188 post_data.append(("names","true"))
187 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept 189 elif options.extra or options.json:
188 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) ) 190 post_data.append(("extra","true"))
189 unipept_resp = json.loads( urllib2.urlopen( req ).read() ) 191 post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
192 headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}
193 url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
194 req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
195 unipept_resp += json.loads( urllib2.urlopen( req ).read() )
190 unmatched_peptides = [] 196 unmatched_peptides = []
191 peptideMatches = [] 197 peptideMatches = []
192 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp) 198 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
193 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa': 199 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
194 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide 200 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
242 with open(options.unmatched,'w') as outputFile: 248 with open(options.unmatched,'w') as outputFile:
243 for peptide in peptides: 249 for peptide in peptides:
244 if peptide in unmatched_peptides: 250 if peptide in unmatched_peptides:
245 outputFile.write("%s\n" % peptide) 251 outputFile.write("%s\n" % peptide)
246 if options.json: 252 if options.json:
247 with open(options.json,'w') as outputFile: 253 if options.unipept == 'pept2prot':
248 outputFile.write(str(resp)) 254 with open(options.json,'w') as outputFile:
255 outputFile.write(str(resp))
256 else:
257 found_keys = set()
258 for i,pdict in enumerate(resp):
259 found_keys |= set(pdict.keys())
260 taxa_cols = []
261 for col in pept2lca_extra_column_order[-1:0:-1]:
262 if col+'_id' in found_keys:
263 taxa_cols.append(col)
264 id_to_node = dict()
265 def get_node(id,name,rank,child,seq):
266 if id not in id_to_node:
267 data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1, 'rank' : rank, 'sequences' : [] }
268 node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
269 id_to_node[id] = node
270 else:
271 node = id_to_node[id]
272 node['data']['count'] += 1
273 if seq is not None and seq not in node['data']['sequences']:
274 node['data']['sequences'].append(seq)
275 if child is None:
276 node['data']['self_count'] += 1
277 elif child['id'] not in node['kids']:
278 node['kids'].append(child['id'])
279 node['children'].append(child)
280 return node
281 root = get_node(1,'root','no rank',None,None)
282 for i,pdict in enumerate(resp):
283 sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
284 seq = sequence
285 child = None
286 for col in taxa_cols:
287 col_id = col+'_id'
288 if col_id in pdict and pdict.get(col_id):
289 col_name = col if col in found_keys else col+'_name'
290 child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
291 seq = None
292 if child:
293 get_node(1,'root','no rank',child,None)
294 with open(options.json,'w') as outputFile:
295 outputFile.write(json.dumps(root))
249 if options.tsv or options.csv: 296 if options.tsv or options.csv:
250 # 'pept2lca','pept2taxa','pept2prot' 297 # 'pept2lca','pept2taxa','pept2prot'
251 found_keys = set() 298 found_keys = set()
252 results = [] 299 results = []
253 for i,pdict in enumerate(resp): 300 for i,pdict in enumerate(resp):