Mercurial > repos > devteam > dgidb_annotator
comparison dgidb_annotator.py @ 3:c5bb987015c5 draft default tip
Uploaded
author | devteam |
---|---|
date | Fri, 07 Mar 2014 16:34:26 -0500 |
parents | 792f3cb0eff4 |
children |
comparison
equal
deleted
inserted
replaced
2:792f3cb0eff4 | 3:c5bb987015c5 |
---|---|
1 ''' | 1 ''' |
2 Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database. | 2 Annotates a tabular file with information from the Drug-Gene Interaction (DGI) database. |
3 ''' | 3 ''' |
4 | 4 |
5 import optparse, json, urllib2, sys, re | 5 import optparse, json, urllib2, sys |
6 | 6 |
7 def __main__(): | 7 def __main__(): |
8 # -- Parse command line. -- | 8 # -- Parse command line. -- |
9 parser = optparse.OptionParser() | 9 parser = optparse.OptionParser() |
10 parser.add_option('-g', '--gene-name-col', dest='gene_name_col', help='column of gene names') | 10 parser.add_option('-g', '--gene-name-col', dest='gene_name_col', help='column of gene names') |
17 if len(args) > 0: | 17 if len(args) > 0: |
18 input_file = open(args[0], 'r') | 18 input_file = open(args[0], 'r') |
19 else: | 19 else: |
20 input_file = sys.stdin | 20 input_file = sys.stdin |
21 | 21 |
22 # -- Make connection and get results. -- | 22 # -- Set up gene list queries. -- |
23 | 23 |
24 # Get gene list. | 24 # Get gene list. |
25 gene_list = [] | 25 gene_list = [] |
26 lines = [] | 26 lines = [] |
27 for line in input_file: | 27 for line in input_file: |
29 # Some annotations may be of the form | 29 # Some annotations may be of the form |
30 # <gene_name>(<splicing_info>) or <gene_name>;<gene_name>(<splicing_info>) | 30 # <gene_name>(<splicing_info>) or <gene_name>;<gene_name>(<splicing_info>) |
31 gene_list.append(entry.split(';')[0].split('(')[0]) | 31 gene_list.append(entry.split(';')[0].split('(')[0]) |
32 lines.append(line.strip()) | 32 lines.append(line.strip()) |
33 | 33 |
34 # Set up gene lists to be ~8K because this is near the max HTTP request length. | |
35 gene_list = ','.join(set(gene_list)) | |
36 queries = [] | |
37 MAX_QUERY_SIZE = 8000 | |
38 if len(gene_list) > MAX_QUERY_SIZE: | |
39 # Break queries. | |
40 queries = [ gene_list[i:i + MAX_QUERY_SIZE] for i in range(0, len(gene_list), MAX_QUERY_SIZE) ] | |
41 | |
42 # Adjust queries to include whole genes. | |
43 for i, query in enumerate( queries[1:] ): | |
44 part_gene, comma, remainder = query.partition(',') | |
45 queries[i] += part_gene | |
46 queries[i+1] = remainder | |
47 else: | |
48 queries = [ gene_list ] | |
49 | |
50 # -- Query and process results. -- | |
51 | |
34 # Query for results. | 52 # Query for results. |
35 query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % ','.join(set(gene_list)) | 53 results = [] |
36 if options.expert_curated: | 54 for genes in queries: |
37 query_str += '&source_trust_levels=Expert%20curated' | 55 query_str = 'http://dgidb.genome.wustl.edu/api/v1/interactions.json?genes=%s' % genes |
38 results = urllib2.urlopen(query_str).read() | 56 if options.expert_curated: |
39 results_dict = json.loads(results) | 57 query_str += '&source_trust_levels=Expert%20curated' |
40 | 58 raw_results = urllib2.urlopen(query_str).read() |
59 results_dict = json.loads(raw_results) | |
60 results.extend(results_dict['matchedTerms']) | |
61 | |
41 # Process results. | 62 # Process results. |
42 matched_results = results_dict['matchedTerms'] | 63 for result in results: |
43 for result in matched_results: | |
44 # Process result. | 64 # Process result. |
45 processed_results = [] | 65 processed_results = [] |
46 result_fields = [ result['geneName'], result['geneLongName'], ','.join( result['geneCategories'] ) ] | 66 result_fields = [ result['geneName'], result['geneLongName'], ','.join( result['geneCategories'] ) ] |
47 for interaction in result['interactions']: | 67 for interaction in result['interactions']: |
48 result_fields = result_fields[0:3] | 68 result_fields = result_fields[0:3] |