unipept: unipept.py comparison

comparison unipept.py @ 3:34758ab8aaa4 draft

Uploaded

author	galaxyp
date	Mon, 20 Feb 2017 10:32:03 -0500
parents	0c1ee95282fa
children	4953dcd7dd39

comparison

equal deleted inserted replaced

-:503ab8a39006
+:34758ab8aaa4
 pept2lca_all_column_order = pept2lca_column_order + pept2lca_extra_column_order[1:]
 pept2prot_column_order = ['peptide','uniprot_id','taxon_id']
 pept2prot_extra_column_order = pept2prot_column_order + ['taxon_name','ec_references','go_references','refseq_ids','refseq_protein_ids','insdc_ids','insdc_protein_ids']
 def __main__():
-version = '1.1'
+version = '2.0'
 pep_pat = '^([ABCDEFGHIKLMNPQRSTVWXYZ]+)$'
 def read_tabular(filepath,col):
 peptides = []
 with open(filepath) as fp:
 parser.add_option( '-p', '--pepxml', dest='pepxml', default=None, help='A pepxml file containing peptide sequences' )
 # Unipept Flags
 parser.add_option( '-e', '--equate_il', dest='equate_il', action='store_true', default=False, help='isoleucine (I) and leucine (L) are equated when matching tryptic peptides to UniProt records' )
 parser.add_option( '-x', '--extra', dest='extra', action='store_true', default=False, help='return the complete lineage of the taxonomic lowest common ancestor' )
 parser.add_option( '-n', '--names', dest='names', action='store_true', default=False, help='return the names of all ranks in the lineage of the taxonomic lowest common ancestor' )
+parser.add_option( '-M', '--max_request', dest='max_request', type='int', default=200, help='The maximum number of entries per unipept request' )
 # output fields
 parser.add_option( '-A', '--allfields', dest='allfields', action='store_true', default=False, help='inlcude fields: taxon_rank,taxon_id,taxon_name csv and tsv outputs' )
 # Warn vs Error Flag
 parser.add_option( '-S', '--strict', dest='strict', action='store_true', default=False, help='Print exit on invalid peptide' )
 # output files
 warn_err("peptide: %s tryptic fragment len %d > 50 for %s\n" % (peptide,len(part),part),exit_code=None)
 if 5 <= len(part) <= 50:
 partToPeps.setdefault(part,[]).append(peptide)
 trypticPeptides = partToPeps.keys()
 ## unipept
-post_data = []
+unipept_resp = []
-if options.equate_il:
+idx = range(0,len(trypticPeptides),options.max_request)
-post_data.append(("equate_il","true"))
+idx.append(len(trypticPeptides))
-if options.names:
+for i in range(len(idx)-1):
-post_data.append(("extra","true"))
+post_data = []
-post_data.append(("names","true"))
+if options.equate_il:
-elif options.extra:
+post_data.append(("equate_il","true"))
-post_data.append(("extra","true"))
+if options.names or options.json:
-post_data += [('input[]', x) for x in trypticPeptides]
+post_data.append(("extra","true"))
-headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
+post_data.append(("names","true"))
-url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
+elif options.extra or options.json:
-req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
+post_data.append(("extra","true"))
-unipept_resp = json.loads( urllib2.urlopen( req ).read() )
+post_data += [('input[]', x) for x in trypticPeptides[idx[i]:idx[i+1]]]
+headers = {'Content-Type': 'application/x-www-form-urlencoded',  'Accept': 'application/json'}
+url = 'http://api.unipept.ugent.be/api/v1/%s' % options.unipept
+req = urllib2.Request( url, headers = headers, data = urllib.urlencode(post_data) )
+unipept_resp += json.loads( urllib2.urlopen( req ).read() )
 unmatched_peptides = []
 peptideMatches = []
 if options.debug: print >> sys.stdout,"unipept response: %s\n" % str(unipept_resp)
 if options.unipept == 'pept2prot' or options.unipept == 'pept2taxa':
 dupkey = 'uniprot_id' if options.unipept == 'pept2prot' else 'taxon_id' ## should only keep one of these per input peptide
 with open(options.unmatched,'w') as outputFile:
 for peptide in peptides:
 if peptide in unmatched_peptides:
 outputFile.write("%s\n" % peptide)
 if options.json:
-with open(options.json,'w') as outputFile:
+if options.unipept == 'pept2prot':
-outputFile.write(str(resp))
+with open(options.json,'w') as outputFile:
+outputFile.write(str(resp))
+else:
+found_keys = set()
+for i,pdict in enumerate(resp):
+found_keys |= set(pdict.keys())
+taxa_cols = []
+for col in pept2lca_extra_column_order[-1:0:-1]:
+if col+'_id' in found_keys:
+taxa_cols.append(col)
+id_to_node = dict()
+def get_node(id,name,rank,child,seq):
+if id not in id_to_node:
+data = {'count' : 0, 'self_count' : 0, 'valid_taxon' : 1,  'rank' : rank, 'sequences' : [] }
+node = {'id' : id, 'name' : name, 'children' : [], 'kids': [],'data' : data }
+id_to_node[id] = node
+else:
+node = id_to_node[id]
+node['data']['count'] += 1
+if seq is not None and seq not in node['data']['sequences']:
+node['data']['sequences'].append(seq)
+if child is None:
+node['data']['self_count'] += 1
+elif child['id'] not in node['kids']:
+node['kids'].append(child['id'])
+node['children'].append(child)
+return node
+root = get_node(1,'root','no rank',None,None)
+for i,pdict in enumerate(resp):
+sequence = pdict.get('peptide',pdict.get('tryptic_peptide',None))
+seq = sequence
+child = None
+for col in taxa_cols:
+col_id = col+'_id'
+if col_id in pdict and pdict.get(col_id):
+col_name = col if col in found_keys else col+'_name'
+child = get_node(pdict.get(col_id,None),pdict.get(col_name,''),col,child,seq)
+seq = None
+if child:
+get_node(1,'root','no rank',child,None)
+with open(options.json,'w') as outputFile:
+outputFile.write(json.dumps(root))
 if options.tsv or options.csv:
 # 'pept2lca','pept2taxa','pept2prot'
 found_keys = set()
 results = []
 for i,pdict in enumerate(resp):

Mercurial > repos > galaxyp > unipept

comparison unipept.py @ 3:34758ab8aaa4 draft