comparison dgidb_annotator.py @ 3:c5bb987015c5 draft default tip

Uploaded
author devteam
date Fri, 07 Mar 2014 16:34:26 -0500
parents 792f3cb0eff4
children
comparison
equal deleted inserted replaced
2:792f3cb0eff4 3:c5bb987015c5
1 ''' 1 '''
2 Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database. 2 Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database.
3 ''' 3 '''
4 4
5 import optparse, json, urllib2, sys, re 5 import optparse, json, urllib2, sys
6 6
7 def __main__(): 7 def __main__():
8 # -- Parse command line. -- 8 # -- Parse command line. --
9 parser = optparse.OptionParser() 9 parser = optparse.OptionParser()
10 parser.add_option('-g', '--gene-name-col', dest='gene_name_col', help='column of gene names') 10 parser.add_option('-g', '--gene-name-col', dest='gene_name_col', help='column of gene names')
17 if len(args) > 0: 17 if len(args) > 0:
18 input_file = open(args[0], 'r') 18 input_file = open(args[0], 'r')
19 else: 19 else:
20 input_file = sys.stdin 20 input_file = sys.stdin
21 21
22 # -- Make connection and get results. -- 22 # -- Set up gene list queries. --
23 23
24 # Get gene list. 24 # Get gene list.
25 gene_list = [] 25 gene_list = []
26 lines = [] 26 lines = []
27 for line in input_file: 27 for line in input_file:
29 # Some annotations may be of the form 29 # Some annotations may be of the form
30 # <gene_name>(<splicing_info>) or <gene_name>;<gene_name>(splicing_info) 30 # <gene_name>(<splicing_info>) or <gene_name>;<gene_name>(splicing_info)
31 gene_list.append(entry.split(';')[0].split('(')[0]) 31 gene_list.append(entry.split(';')[0].split('(')[0])
32 lines.append(line.strip()) 32 lines.append(line.strip())
33 33
34 # Set up gene lists to be ~8K because this is near the max HTTP request length.
35 gene_list = ','.join(set(gene_list))
36 queries = []
37 MAX_QUERY_SIZE = 8000
38 if len(gene_list) > MAX_QUERY_SIZE:
39 # Break queries.
40 queries = [ gene_list[i:i + MAX_QUERY_SIZE] for i in range(0, len(gene_list), MAX_QUERY_SIZE) ]
41
42 # Adjust queries to include whole genes.
43 for i, query in enumerate( queries[1:] ):
44 part_gene, comma, remainder = query.partition(',')
45 queries[i] += part_gene
46 queries[i+1] = remainder
47 else:
48 queries = [ gene_list ]
49
50 # -- Query and process results. --
51
34 # Query for results. 52 # Query for results.
35 query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % ','.join(set(gene_list)) 53 results = []
36 if options.expert_curated: 54 for genes in queries:
37 query_str += '&source_trust_levels=Expert%20curated' 55 query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % genes
38 results = urllib2.urlopen(query_str).read() 56 if options.expert_curated:
39 results_dict = json.loads(results) 57 query_str += '&source_trust_levels=Expert%20curated'
40 58 raw_results = urllib2.urlopen(query_str).read()
59 results_dict = json.loads(raw_results)
60 results.extend(results_dict['matchedTerms'])
61
41 # Process results. 62 # Process results.
42 matched_results = results_dict['matchedTerms'] 63 for result in results:
43 for result in matched_results:
44 # Process result. 64 # Process result.
45 processed_results = [] 65 processed_results = []
46 result_fields = [ result['geneName'], result['geneLongName'], ','.join( result['geneCategories'] ) ] 66 result_fields = [ result['geneName'], result['geneLongName'], ','.join( result['geneCategories'] ) ]
47 for interaction in result['interactions']: 67 for interaction in result['interactions']:
48 result_fields = result_fields[0:3] 68 result_fields = result_fields[0:3]