comparison cravat_annotate/cravat_annotate.py @ 14:a28f1f52eb93 draft

Uploaded
author in_silico
date Tue, 12 Jun 2018 12:04:45 -0400
parents
children
comparison
equal deleted inserted replaced
13:20ebb3a5c1a4 14:a28f1f52eb93
1 """
2 A galaxy wrapper for the /rest/service/query API endpoint on Cravat.
3
4
5 Notes on Mapping:
6 -----------------
7 The CravatQuery class uses static method 'from_array' to interpret an array of values
8 into a query string for the /rest/service/query API service on the cravat server.
9 This involves using a mapping dictionary that associates the array's index positions
10 with query attributes, such as the chromosome, position, etc. The CravatQuery
11 class contains a default value ('default_mapping'); however, this could also be
12 offered as a user-configurable option.
13 """
14
15
16 import requests
17 import json
18 import sys
19 import re
20
21
class CravatQueryException(Exception):
    """
    : Error raised when a Cravat query cannot be constructed or answered.
    : Args:
    :   message - Human readable description of the failure
    :   errors  - Optional. Custom error codes / details kept on the instance
    """

    def __init__(self, message, errors=None):
        # Keep the optional custom error codes available to handlers
        self.errors = errors
        super(CravatQueryException, self).__init__(message)
28
29
class CravatQuery(object):
    """
    : A class for handling Cravat query strings.
    : Args (all required):
    :   _chr    - Chromosome (normalised through 'format_chromosome')
    :   pos     - Position
    :   strand  - Strand
    :   ref     - Reference Base
    :   alt     - Alternate Base
    """

    # The endpoint that CravatQuerys are submitted to
    endpoint = 'http://cravat.us/CRAVAT/rest/service/query'

    # The value delimiter used in the Cravat input file to delimit values
    delimiter = "\t"

    # Default indices for interpreting a cravat file's row of data into a CravatQuery
    default_mapping = {
        'chromosome': 1,
        'position': 2,
        'strand': 3,
        'reference': 4,
        'alternate': 5
    }

    # Default values. Used as backup for CravatQuery to resolve query with incomplete information
    default_values = {
        'strand': '+'
    }

    # The necessary attributes needed to submit a query.
    query_keys = [
        'chromosome', 'position', 'strand', 'reference', 'alternate'
    ]

    # Expected response keys from server. Ordered in list so that galaxy output has uniform column ordering run-to-run.
    # If cravat server returns additional keys, they are appended to and included in output.
    # NOTE(review): "gnomAD AF (Amrican)" looks like a typo of "American"; left untouched because
    # these strings must match the keys the server actually returns - confirm against a live response.
    response_keys = [
        "Chromosome", "Position", "Strand", "Reference base(s)", "Alternate base(s)",
        "HUGO symbol", "S.O. transcript", "Sequence ontology protein change", "Sequence ontology",
        "S.O. all transcripts", "gnomAD AF", "gnomAD AF (African)", "gnomAD AF (Amrican)",
        "gnomAD AF (Ashkenazi Jewish)", "gnomAD AF (East Asian)", "gnomAD AF (Finnish)",
        "gnomAD AF (Non-Finnish European)", "gnomAD AF (Other)", "gnomAD AF (South Asian)",
        "1000 Genomes AF", "ESP6500 AF (average)", "ESP6500 AF (European American)",
        "ESP6500 AF (African American)", "COSMIC transcript", "COSMIC protein change",
        "COSMIC variant count [exact nucleotide change]", "cosmic_site_nt", "CGL driver class",
        "TARGET", "dbSNP", "cgc_role", "cgc_inheritance", "cgc_tumor_type_somatic",
        "cgc_tumor_type_germline", "ClinVar", "ClinVar disease identifier", "ClinVar XRef",
        "GWAS Phenotype (GRASP)", "GWAS PMID (GRASP)", "Protein 3D variant"
    ]

    def __init__(self, _chr, pos, strand, ref, alt):
        # '_chr' used to avoid naming conflict with python built-in 'chr'
        self.chromosome = CravatQuery.format_chromosome(_chr)
        self.position = pos
        self.strand = strand
        self.reference = ref
        self.alternate = alt
        # Ordered exactly as the values appear in the query string
        self.values = [self.chromosome, self.position, self.strand, self.reference, self.alternate]

    def __str__(self):
        """ : Represent the CravatQuery as a valid query string for call to Cravat server """
        return "_".join(str(value) for value in self.values)

    def as_query_string(self):
        """ : Return the underscore-joined query string (same as str(self)) """
        return str(self)

    @staticmethod
    def from_dictionary(d):
        """
        : Instantiate a CravatQuery from a dictionary representation.
        : Args:
        :   d <dictionary>: A dictionary representing a CravatQuery, containing every
        :                   key listed in CravatQuery.query_keys
        : Raises:
        :   CravatQueryException if any of the required keys is missing
        """
        for key in CravatQuery.query_keys:
            if key not in d:
                raise CravatQueryException("CravatQuery.from_dictionary requires keys: [{}], however key: '{}' was not provided "
                                           .format(CravatQuery.query_keys, key))
        return CravatQuery(d["chromosome"], d["position"], d["strand"], d["reference"], d["alternate"])

    @staticmethod
    def from_array(array, mapping=None):
        """
        : Instantiate a CravatQuery from an array of values. Useful when translating read lines from a file.
        : Args:
        :   array <list>    - The values to instantiate the CravatQuery from
        :   mapping <dict>  - Optional. A dictionary associating cravat parameters to indices in the array.
        :                     Valid keys are: 'chromosome', 'position', 'strand', 'reference', 'alternate'.
        :                     Defaults to CravatQuery.default_mapping.
        : Raises:
        :   CravatQueryException if a key has neither a mapping index nor a default value
        :   IndexError if a mapped index is outside of 'array'
        """
        # Set the mapping value. Either received from user, or the class default
        if mapping is None:
            mapping = CravatQuery.default_mapping

        # Build a dict of cravat querying keys to values.
        d = {}
        for key in CravatQuery.query_keys:
            # Try to get index position from mapping by the key, and value from array by the index
            if key in mapping:
                d[key] = array[mapping[key]]
            # If index not provided in mapping, check if there is a default value
            elif key in CravatQuery.default_values:
                d[key] = CravatQuery.default_values[key]
            # Unable to get value for querying key, meaning can't construct the minimum requirements for query
            else:
                raise CravatQueryException("CravatQuery.from_array requires a mapping index for key: '{}', however value was not provided".format(key))
        return CravatQuery.from_dictionary(d)

    @staticmethod
    def format_chromosome(_chr):
        """
        : Format a chromosome for use as query parameter. '_chr' name used to avoid python built-in name conflict.
        : Args:
        :   _chr - Either an integer [1,23], or 'x'/'X', or 'y'/'Y', or a string of the form
        :          'chr<z>' where '<z>' is one of the previously described values
        : Returns:
        :   A string of the form 'chr<z>' with '<z>' lower-cased
        : Raises:
        :   CravatQueryException if the input is out of range or not a chromosome at all
        """
        # str() lets integer input through; the original called .lower() directly on the argument
        label = str(_chr).strip().lower()
        # Remove only a leading 'chr' prefix. str.strip('chr') would strip any of the
        # characters 'c', 'h', 'r' from BOTH ends and so accept malformed labels.
        if label.startswith('chr'):
            label = label[3:]

        # Handle chromosomes x and y
        if label == 'x' or label == 'y':
            return 'chr' + label

        # Handle integer chromosomes 1 to 23
        try:
            number = int(label)
        except ValueError:
            raise CravatQueryException("Unable to resolve input: '{}' into a valid chromosome representation".format(_chr))
        if not (1 <= number <= 23):
            raise CravatQueryException("Chromsomme of '{}' was out of range [1,23]".format(number))
        return 'chr' + str(number)

    @staticmethod
    def jump_header(in_file, out_file, headerlines=0):
        """
        : Jumps over a header space of line number 'headerlines'. Sets up in_file so that
        : the next execution of in_file.readline() will return the first non-header line.
        : 'out_file' is accepted but not used here.
        """
        in_file.seek(0)
        for _ in range(headerlines):
            in_file.readline()
184
185
def main(in_path, out_path, pre_callback=None, user_mapping=None):
    """
    : Read the input file line by line, query the cravat server once per line and
    : write one tab-delimited row of response values per successful query.
    : Args:
    :   in_path <str>: Path to input file
    :   out_path <str>: Path to output file
    :   pre_callback <function>: Optional. Executed once before the main loop with
    :       (in_file, out_file), e.g. CravatQuery.jump_header to skip a header space.
    :   user_mapping <dict>: Optional. Forwarded to CravatQuery.from_array as 'mapping'.
    : Notes:
    :   Bad server responses are printed and the line is skipped. Errors raised while
    :   building the query or while performing the HTTP request are not caught here.
    """

    with open(in_path, 'r') as in_file, \
            open(out_path, 'w') as out_file:

        # Perform any pre-processing steps, such as jumping a header space
        if pre_callback:
            pre_callback(in_file, out_file)

        known_keys = set(CravatQuery.response_keys)

        # main loop
        for line in in_file:

            # Only drop the line terminator: stripping the whole line would remove a
            # leading empty column and shift every index used by the mapping.
            line = line.rstrip('\r\n')
            if not line.strip():
                # Blank lines carry no query data
                continue

            # Create query from line of input data
            fields = [field.strip() for field in line.split(CravatQuery.delimiter)]
            query = CravatQuery.from_array(fields, user_mapping)

            # Make request, and write response data.
            # as_query_string must be CALLED; passing the bound method sends its repr.
            call = requests.get(CravatQuery.endpoint, params={'mutation': query.as_query_string()})
            try:
                if call.status_code != 200 or call.text == "":
                    raise CravatQueryException("Bad Server Response. Respone code: '{}', Response Text: '{}'".format(call.status_code, call.text))
                try:
                    json_response = json.loads(call.text)
                except ValueError:
                    # A 200 response whose body is not JSON is reported like any other bad response
                    raise CravatQueryException("Bad Server Response. Respone code: '{}', Response Text: '{}'".format(call.status_code, call.text))

                # Expected keys first, in CravatQuery.response_keys order, then any
                # additional keys (sorted) so column order is stable run-to-run.
                ordered_keys = [key for key in CravatQuery.response_keys if key in json_response]
                ordered_keys.extend(sorted(key for key in json_response if key not in known_keys))

                row = []
                for key in ordered_keys:
                    val = json_response[key]
                    # Set numeric values to uniform format; everything else is written as text
                    try:
                        val = format(float(val), ".4f")
                    except (TypeError, ValueError, OverflowError):
                        val = str(val)
                    row.append(val)
                out_file.write("\t".join(row) + "\n")
            except CravatQueryException as e:
                print(e)
231
232
233
234
if __name__ == "__main__":

    # Input and output file paths, obtained from command line
    in_path = sys.argv[1]
    out_path = sys.argv[2]

    # Possibly allow user mapping configuration through here. Not fully implemented:
    # the raw string is read but deliberately NOT forwarded, because main() expects a
    # dictionary mapping. sys.argv[3] only exists when more than three entries are
    # present (script name, input path, output path, mapping).
    user_mapping = sys.argv[3] if len(sys.argv) > 3 else None

    # Run the main operation
    main(in_path, out_path)