0
|
1 '''
|
|
2 Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database.
|
|
3 '''
|
|
4
|
3
|
5 import optparse, json, urllib2, sys
|
0
|
6
|
|
def _gene_name(line, gene_name_col):
    '''
    Returns the gene name found in column gene_name_col (0-based) of a
    tab-separated line, or '' if the line has no such column.

    Some annotations may be of the form
        <gene_name>(<splicing_info>) or <gene_name>;<gene_name>(splicing_info)
    in which case only the first gene name, without splicing info, is returned.
    '''
    fields = line.rstrip('\r\n').split('\t')
    if gene_name_col >= len(fields):
        return ''
    entry = fields[gene_name_col].strip()
    return entry.split(';')[0].split('(')[0].strip()


def _build_queries(genes, max_size):
    '''
    Packs gene names into comma-separated query strings, starting a new string
    whenever adding the next gene would push the current one past max_size
    characters. Gene names are never split across two queries; a single name
    longer than max_size gets a query of its own.
    '''
    queries = []
    current = []
    current_len = 0
    for gene in genes:
        # One extra character for the separating comma, except for the first gene.
        added = len(gene) + (1 if current else 0)
        if current and current_len + added > max_size:
            queries.append(','.join(current))
            current = []
            current_len = 0
            added = len(gene)
        current.append(gene)
        current_len += added
    if current:
        queries.append(','.join(current))
    return queries


def _fetch_matched_terms(genes, expert_curated):
    '''
    Queries the DGI database for a comma-separated string of gene names and
    returns the 'matchedTerms' list of the decoded JSON response.
    '''
    query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % genes
    if expert_curated:
        query_str += '&source_trust_levels=Expert%20curated'
    response = urllib2.urlopen(query_str)
    try:
        return json.loads(response.read())['matchedTerms']
    finally:
        response.close()


def _format_interactions(result):
    '''
    Returns one tab-separated string per interaction of a matched term:
    gene name, gene long name, gene categories, interaction type, drug name, source.
    '''
    gene_fields = [ result['geneName'], result['geneLongName'], ','.join( result['geneCategories'] ) ]
    return [
        '\t'.join( gene_fields + [ interaction['interactionType'], interaction['drugName'], interaction['source'] ] )
        for interaction in result['interactions']
    ]


def __main__():
    '''
    Command-line entry point.

    Reads a tab-separated file (first positional argument, or stdin), looks up
    the gene named in the column given by -g/--gene-name-col (1-based) in the
    DGI database and writes each input line once per interaction found, with
    the interaction fields appended. With -a/--print-all, lines without any
    interaction are written unchanged; with -e/--expert-curated, only expert
    curated results are requested.
    '''
    # -- Parse command line. --
    parser = optparse.OptionParser()
    parser.add_option('-g', '--gene-name-col', dest='gene_name_col', type='int', help='column of gene names')
    parser.add_option('-a', '--print-all', dest='print_all', action='store_true', help='print all lines, even though without a result')
    parser.add_option('-e', '--expert-curated', dest='expert_curated', action='store_true', help='use only expert curated results')
    (options, args) = parser.parse_args()
    if options.gene_name_col is None or options.gene_name_col < 1:
        # Fail with a usage message instead of a TypeError or a silently negative index.
        parser.error('a 1-based gene name column must be given with -g/--gene-name-col')
    gene_name_col = options.gene_name_col - 1

    # -- Read input, remembering the gene name of every line. --
    if args:
        input_file = open(args[0], 'r')
    else:
        input_file = sys.stdin
    try:
        rows = []
        for line in input_file:
            # Only the line terminator is removed so that empty leading or
            # trailing columns keep their position in the output.
            rows.append( ( line.rstrip('\r\n'), _gene_name(line, gene_name_col) ) )
    finally:
        if input_file is not sys.stdin:
            input_file.close()

    # -- Set up gene list queries. --

    # Set up gene lists to be ~8K because this is near the max HTTP request length.
    MAX_QUERY_SIZE = 8000
    unique_genes = sorted( set( gene for _, gene in rows if gene ) )
    queries = _build_queries(unique_genes, MAX_QUERY_SIZE)

    # -- Query and process results. --

    # Maps each search term to its list of formatted interaction strings. A
    # fresh dict is used so that keys of the raw JSON response never leak in.
    annotations = {}
    for genes in queries:
        for result in _fetch_matched_terms(genes, options.expert_curated):
            annotations[ result['searchTerm'] ] = _format_interactions(result)

    # -- Annotate input file and produce output. --
    for line, gene in rows:
        gene_annotations = annotations.get(gene)
        if gene_annotations:
            for annotation in gene_annotations:
                sys.stdout.write(line + '\t' + annotation + '\n')
        elif options.print_all:
            # Also reached for genes that matched but have no interactions.
            sys.stdout.write(line + '\n')
|
|
86
|
1
|
# Run the annotator only when this file is executed as a script.
if __name__ == "__main__":
    __main__()
|