comparison cravat_annotate/cravat_annotate.py @ 10:152227fa7851 draft

Uploaded
author in_silico
date Tue, 12 Jun 2018 11:04:25 -0400
parents
children
comparison
equal deleted inserted replaced
9:3da92d305671 10:152227fa7851
1 """
2 A galaxy wrapper for the /rest/service/query API endpoint on Cravat.
3
4
5 Notes on Mapping:
6 -----------------
7 The CravatQuery class uses static method 'from_array' to interpret an array of values
8 into a query string for the /rest/service/query API service on the cravat server.
9 This involves using a mapping dictionary that associates each querying attribute
10 (chromosome, position, etc.) with its index position in the array. The CravatQuery
11 class contains a default value ('default_mapping'); however, this could also be
12 offered as a user-configurable option.
13
14
15 Remaining Items (including possible expansion features):
16 -----------------
17 TODO: Possibly provide user-configurability of CravatQuery array index mapping
18 TODO: Possibly provide user-configurability of delimiter value
19 TODO: Check if chromosomes are 0 or 1 based indexing
20 TODO: Port 'write headers' option and include in user prompts in galaxy xml
21 TODO: Try-catch structure on the query call to cravat so if one bad query doesn't get back a response,
22 the rest of the run can still execute. Report this to user.
23 """
24
25
26 import requests
27 import json
28 import sys
29 import re
30 ###
31 import ipdb
32
33
class CravatQueryException(Exception):
    """
    : Error raised when a CravatQuery cannot be built or a cravat request fails.
    : Args:
    :   message - Human readable description, forwarded to Exception
    :   errors  - Optional. Custom error codes or details, stored as-is on 'errors'
    """

    def __init__(self, message, errors=None):
        # Keep any caller-supplied error codes available on the instance
        self.errors = errors
        super(CravatQueryException, self).__init__(message)
40
41
class CravatQuery(object):
    """
    : A class for handling Cravat query strings.
    : Args (all required):
    :   chr - Chromosome
    :   pos - Position
    :   strand - Strand
    :   ref - Reference Base
    :   alt - Alternate Base
    """

    # The endpoint that CravatQuerys are submitted to
    endpoint = 'http://cravat.us/CRAVAT/rest/service/query'

    # The value delimiter used in the Cravat input file to delimit values
    delimiter = "\t"

    # Default indices for interpreting a cravat file's row of data into a CravatQuery
    default_mapping = {
        'chromosome': 1,
        'position': 2,
        'strand': 3,
        'reference': 4,
        'alternate': 5
    }

    # Default values. Used as backup for CravatQuery to resolve query with incomplete information
    default_values = {
        'strand': '+'
    }

    # The necessary attributes needed to submit a query.
    query_keys = [
        'chromosome', 'position', 'strand', 'reference', 'alternate'
    ]

    # Expected response keys from server. Ordered in list so that galaxy output has uniform column ordering run-to-run.
    # If cravat server returns additional keys, they are appended to and included in output.
    response_keys = [
        "Chromosome", "Position", "Strand", "Reference base(s)", "Alternate base(s)",
        "HUGO symbol", "S.O. transcript", "Sequence ontology protein change", "Sequence ontology",
        "S.O. all transcripts", "gnomAD AF", "gnomAD AF (African)", "gnomAD AF (Amrican)",
        "gnomAD AF (Ashkenazi Jewish)", "gnomAD AF (East Asian)", "gnomAD AF (Finnish)",
        "gnomAD AF (Non-Finnish European)", "gnomAD AF (Other)", "gnomAD AF (South Asian)",
        "1000 Genomes AF", "ESP6500 AF (average)", "ESP6500 AF (European American)",
        "ESP6500 AF (African American)", "COSMIC transcript", "COSMIC protein change",
        "COSMIC variant count [exact nucleotide change]", "cosmic_site_nt", "CGL driver class",
        "TARGET", "dbSNP", "cgc_role", "cgc_inheritance", "cgc_tumor_type_somatic",
        "cgc_tumor_type_germline", "ClinVar", "ClinVar disease identifier", "ClinVar XRef",
        "GWAS Phenotype (GRASP)", "GWAS PMID (GRASP)", "Protein 3D variant"
    ]

    def __init__(self, _chr, pos, strand, ref, alt):
        # '_chr' used to avoid naming confliction with python built-in 'chr'
        self.chromosome = CravatQuery.format_chromosome(_chr)
        self.position = pos
        self.strand = strand
        self.reference = ref
        self.alternate = alt
        # Ordered values; joined by __str__ to form the query string
        self.values = [self.chromosome, self.position, self.strand, self.reference, self.alternate]

    def __str__(self):
        """
        : Represent the CravatQuery as a valid query string for call to Cravat server.
        : The five query values are joined with '_' in the order:
        : chromosome, position, strand, reference, alternate.
        """
        return "_".join(str(value) for value in self.values)

    def as_query_string(self):
        """
        : Return the query string form of this CravatQuery (same as str(self)).
        """
        return str(self)

    @staticmethod
    def from_dictionary(d):
        """
        : Instantiate a CravatQuery from a dictionary representation.
        : Args:
        :   d <dictionary>: A dictionary representing a CravatQuery, containing every
        :                   key listed in CravatQuery.query_keys
        : Raises:
        :   CravatQueryException - if any key of CravatQuery.query_keys is missing from d
        """
        for key in CravatQuery.query_keys:
            if key not in d:
                raise CravatQueryException("CravatQuery.from_dictionary requires keys: [{}], however key: '{}' was not provided "
                                           .format(CravatQuery.query_keys, key))
        return CravatQuery(d["chromosome"], d["position"], d["strand"], d["reference"], d["alternate"])

    @staticmethod
    def from_array(array, mapping=None):
        """
        : Instantiate a CravatQuery from an array of values. Useful when translating read lines from a file.
        : Args:
        :   array <list>   - The values to instantiate the CravatQuery from
        :   mapping <dict> - Optional. A dictionary associating cravat parameters to indices in the array.
        :                    Valid keys are: 'chromosome', 'position', 'strand', 'reference', 'alternate'.
        :                    When omitted, CravatQuery.default_mapping is used.
        : Raises:
        :   CravatQueryException - if a key has neither a mapping index nor a default value
        :   IndexError           - if a mapped index lies outside of 'array'
        """
        # Set the mapping value. Either received from user, or the class default
        if mapping is None:
            mapping = CravatQuery.default_mapping

        # Build a dict of cravat querying keys to values.
        d = {}
        for key in CravatQuery.query_keys:
            if key in mapping:
                # Index position comes from the mapping, value from the array
                d[key] = array[mapping[key]]
            elif key in CravatQuery.default_values:
                # No index provided in mapping; fall back on the default value
                d[key] = CravatQuery.default_values[key]
            else:
                # Unable to get value for querying key, meaning can't construct the minimum requirements for query
                raise CravatQueryException("CravatQuery.from_array requires a mapping index for key: '{}', however value was not provided".format(key))
        return CravatQuery.from_dictionary(d)

    @staticmethod
    def format_chromosome(_chr):
        """
        : Format a chromosome for use as query parameter. '_chr' name used to avoid python built-in name confliction.
        : Args:
        :   _chr - Either an integer [1,23], or 'x'/'X', or 'y'/'Y', or a string of the form
        :          'chr<z>' where '<z>' is one of the previously described values
        : Returns:
        :   The lower-cased string 'chr<z>'
        : Raises:
        :   CravatQueryException - if the number is outside [1,23] or the input is not recognised
        """
        # str() lets true integers through; they have no .lower()
        text = str(_chr).strip().lower()
        # Remove the literal 'chr' prefix only. str.strip('chr') would strip any
        # of the characters 'c', 'h', 'r' from both ends instead.
        if text.startswith('chr'):
            text = text[len('chr'):]
        # Handle chromosomes x and y
        if text in ('x', 'y'):
            return 'chr' + text
        # Handle integer chromosomes 1 to 23
        try:
            number = int(text)
        except ValueError:
            raise CravatQueryException("Unable to resolve input: '{}' into a valid chromosome representation".format(text))
        if 1 <= number <= 23:
            return 'chr' + str(number)
        raise CravatQueryException("Chromsomme of '{}' was out of range [1,23]".format(number))

    @staticmethod
    def jump_header(in_file, out_file, headerlines=0):
        """
        : Jumps over a header space of line number 'headerlines'. Sets up in_file so that
        : the next execution of in_file.readline() will return the first non-header line.
        : 'out_file' is accepted but not used by this method.
        """
        in_file.seek(0)
        for _ in range(headerlines):
            in_file.readline()
198
199
def _format_response_value(val):
    """
    : Render one value of the cravat JSON response as text for the output row.
    : Values that float() accepts are written with four decimals; a JSON null
    : becomes an empty field; anything else is written via str().
    """
    try:
        return format(float(val), ".4f")
    except (TypeError, ValueError):
        return "" if val is None else str(val)


def main(in_path, out_path, pre_callback=None, user_mapping=None):
    """
    : Read the file line by line and use data to query cravat server.
    : Each successful response is written to the output file as one row of
    : tab-separated values, in the order the values appear in the parsed response.
    : Blank input lines are skipped. A line that cannot be turned into a query, a
    : failed request, or an unusable response is printed and skipped, so the
    : remaining lines are still processed.
    : Args:
    :   - in_path <str>: Path to input file
    :   - out_path <str>: Path to output file
    :   - pre_callback <function>: Optional. A function to handle any pre-processing,
    :       such as jumping a header space. Executed once before the main loop.
    :       Receives in_file and out_file as arguments.
    :   - user_mapping <dict>: Optional. Passed to CravatQuery.from_array as the
    :       mapping of query keys to column indices.
    """

    with open(in_path, 'r') as in_file, \
            open(out_path, 'w') as out_file:

        # Perform any pre-processing steps, such as jumping a header space
        if pre_callback:
            pre_callback(in_file, out_file)

        # main loop
        for line in in_file:
            line = line.strip()
            if not line:
                continue

            try:
                # Create query from line of input data
                query = CravatQuery.from_array(line.split(CravatQuery.delimiter), user_mapping)
                # Make request. as_query_string must be *called*; passing the bound
                # method would send its repr instead of the query.
                call = requests.get(CravatQuery.endpoint,
                                    params={'mutation': query.as_query_string()})
                if call.status_code != 200 or call.text == "":
                    raise CravatQueryException("Bad Server Response. Respone code: '{}', Response Text: '{}'".format(call.status_code, call.text))
                # json.loads raises ValueError on a malformed body
                json_response = json.loads(call.text)
            except (CravatQueryException, IndexError, ValueError,
                    requests.exceptions.RequestException) as e:
                # Report the failure and keep going with the next line
                print(e)
                continue

            # Write response data with numeric values in a uniform format
            out_file.write("\t".join(_format_response_value(val)
                                     for val in json_response.values()))
            out_file.write("\n")
246
247
248
249
if __name__ == "__main__":

    # Input and output file paths, obtained from command line
    in_path = sys.argv[1]
    out_path = sys.argv[2]

    # Possibly allow user mapping configuration through here. Not fully implemented:
    # the raw argument is read only when a third argument is actually present
    # (sys.argv[3] needs len(sys.argv) > 3), and it is not passed on to main()
    # because CravatQuery.from_array indexes the mapping by key, which a raw
    # command line string does not support.
    user_mapping = sys.argv[3] if len(sys.argv) > 3 else None

    # Run the main operation
    main(in_path, out_path)