view cravat_annotate/cravat_annotate.py @ 10:152227fa7851 draft

Uploaded
author in_silico
date Tue, 12 Jun 2018 11:04:25 -0400
parents
children
line wrap: on
line source

"""
A galaxy wrapper for the /rest/service/query API endpoint on Cravat.


Notes on Mapping:
-----------------
The CravatQuery class uses the static method 'from_array' to interpret an array of values
as a query string for the /rest/service/query API service on the cravat server.
This involves using a mapping dictionary to associate the array's index positions
with querying attributes, such as the chromosome, position, etc. The CravatQuery
class contains a default value ('default_mapping'); however, this could also be
offered as a user-configurable option.


Remaining Items (including possible expansion features):
-----------------
TODO: Possibly provide user-configurability of CravatQuery array index mapping
TODO: Possibly provide user-configurability of delimiter value
TODO: Check if chromosomes are 0 or 1 based indexing
TODO: Port 'write headers' option and include in user prompts in galaxy xml
TODO: Try-catch structure on the query call to cravat so if one bad query doesn't get back a response,
		the rest of the run can still execute. Report this to user.
"""


import requests
import json
import sys
import re
###
import ipdb


class CravatQueryException(Exception):
	"""
	: Error type raised for problems building or submitting a Cravat query.
	: Args:
	:	message - Human-readable description, handed on to the base Exception
	:	errors - Optional. Custom error codes/details, kept on the 'errors' attribute
	"""

	def __init__(self, message, errors=None):
		# Keep the custom error codes on the instance
		self.errors = errors
		# Let the base Exception store the message so str(exc) returns it
		Exception.__init__(self, message)


class CravatQuery(object):
	"""
	: A class for handling Cravat query strings.
	: Args (all required):
	:	_chr - Chromosome
	:	pos - Position
	:	strand - Strand
	:	ref - Reference Base
	:	alt - Alternate Base
	"""

	# The endpoint that CravatQuerys are submitted to
	endpoint = 'http://cravat.us/CRAVAT/rest/service/query'

	# The value delimiter used in the Cravat input file to delimit values
	delimiter = "\t"

	# Default indices for interpreting a cravat file's row of data into a CravatQuery
	default_mapping = {
		'chromosome': 1,
		'position': 2,
		'strand': 3,
		'reference': 4,
		'alternate': 5
	}

	# Default values. Used as backup for CravatQuery to resolve query with incomplete information
	default_values = {
		'strand': '+'
	}

	# The necessary attributes needed to submit a query.
	query_keys = [
		'chromosome', 'position', 'strand', 'reference', 'alternate'
	]

	# Expected response keys from server. Ordered in list so that galaxy output has uniform column ordering run-to-run.
	# If cravat server returns additional keys, they are appended to and included in output.
	# NOTE(review): "gnomAD AF (Amrican)" looks like a misspelling of "American" -- confirm against
	# the keys the server actually returns before changing it.
	response_keys = [
		"Chromosome", "Position", "Strand", "Reference base(s)", "Alternate base(s)",
		"HUGO symbol", "S.O. transcript", "Sequence ontology protein change", "Sequence ontology",
		"S.O. all transcripts", "gnomAD AF", "gnomAD AF (African)", "gnomAD AF (Amrican)",
		"gnomAD AF (Ashkenazi Jewish)", "gnomAD AF (East Asian)", "gnomAD AF (Finnish)",
		"gnomAD AF (Non-Finnish European)", "gnomAD AF (Other)", "gnomAD AF (South Asian)",
		"1000 Genomes AF", "ESP6500 AF (average)", "ESP6500 AF (European American)",
		"ESP6500 AF (African American)", "COSMIC transcript", "COSMIC protein change",
		"COSMIC variant count [exact nucleotide change]", "cosmic_site_nt", "CGL driver class",
		"TARGET", "dbSNP", "cgc_role", "cgc_inheritance", "cgc_tumor_type_somatic",
		"cgc_tumor_type_germline", "ClinVar", "ClinVar disease identifier", "ClinVar XRef",
		"GWAS Phenotype (GRASP)", "GWAS PMID (GRASP)", "Protein 3D variant"
	]


	def __init__(self, _chr, pos, strand, ref, alt):
		# '_chr' used to avoid naming confliction with python built-in 'chr'
		self.chromosome = CravatQuery.format_chromosome(_chr)
		self.position = pos
		self.strand = strand
		self.reference = ref
		self.alternate = alt
		# Ordered exactly as the values appear in the query string (see __str__)
		self.values = [self.chromosome, self.position, self.strand, self.reference, self.alternate]


	def __str__(self):
		"""
		: Represent the CravatQuery as a valid query string for call to Cravat server.
		: The five query values are joined with underscores, e.g. 'chr1_100_+_A_T'.
		"""
		return "_".join(str(value) for value in self.values)


	def as_query_string(self):
		"""
		: Return the query string representation; identical to str(self).
		"""
		return str(self)


	@staticmethod
	def from_dictionary(d):
		"""
		: Instantiate a CravatQuery from a dictionary representation.
		: Args:
		:	d <dictionary>: A dictionary representing a CravatQuery, containing keys:
		:			'chromosome', 'position', 'strand', 'reference', 'alternate'
		: Raises:
		:	CravatQueryException - if any of the required keys is absent from 'd'
		"""
		for key in CravatQuery.query_keys:
			if key not in d:
				raise CravatQueryException("CravatQuery.from_dictionary requires keys: [{}], however key: '{}' was not provided "
											.format(CravatQuery.query_keys, key))
		return CravatQuery(d["chromosome"], d["position"], d["strand"], d["reference"], d["alternate"])


	@staticmethod
	def from_array(array, mapping=None):
		"""
		: Instantiate a CravatQuery from an array of values. Useful when translating read lines from a file.
		: Args:
		:	array <list> - The values to instantiate the CravatQuery from
		:	mapping <dict> - Optional. A dictionary associating cravat parameters to indices in the array.
								Valid keys are: 'chromosome', 'position', 'strand', 'reference', 'alternate'.
								Defaults to CravatQuery.default_mapping.
		: Raises:
		:	CravatQueryException - if a required key has neither a mapping index nor a default value,
		:			or if the array is too short for a mapped index
		"""

		# Set the mapping value. Either received from the caller, or the class default
		if mapping is None:
			mapping = CravatQuery.default_mapping

		# Build a dict of cravat querying keys to values.
		d = {}
		for key in CravatQuery.query_keys:
			# Try to get index position from mapping by the key, and value from array by the index
			if key in mapping:
				index = mapping[key]
				try:
					d[key] = array[index]
				except IndexError:
					# Report short rows as a query error instead of leaking a bare IndexError
					raise CravatQueryException("CravatQuery.from_array expected a value at index {} for key: '{}', however the array only has {} value(s)"
												.format(index, key, len(array)))
			# If index not provided in mapping, check if there is a default value
			elif key in CravatQuery.default_values:
				d[key] = CravatQuery.default_values[key]
			# Unable to get value for querying key, meaning can't construct the minimum requirements for query
			else:
				raise CravatQueryException("CravatQuery.from_array requires a mapping index for key: '{}', however value was not provided".format(key))
		return CravatQuery.from_dictionary(d)



	@staticmethod
	def format_chromosome(_chr):
		"""
		: Format a chromosome for use as query parameter. '_chr' name used to avoid python built-in name confliction.
		: Args:
		:	_chr - Either an integer [1,23], or 'x'/'X', or 'y'/'Y', or a string of the form
		:			'chr<z>' where '<z>' is one of the previously described values
		: Returns:
		:	The chromosome as a lower-case string of the form 'chr<z>', e.g. 'chr7' or 'chrx'
		: Raises:
		:	CravatQueryException - if the value is numeric but outside [1,23], or cannot be
		:			interpreted as a chromosome at all
		"""
		# str() so that integer input, which is documented as valid, does not fail on .lower()
		label = str(_chr).strip().lower()
		# Remove the 'chr' prefix only. str.strip('chr') would instead remove any run of the
		# characters 'c', 'h' and 'r' from BOTH ends, e.g. silently turning 'rx' into 'x'.
		if label.startswith('chr'):
			label = label[3:]
		# Handle chromosomes x and y
		# NOTE(review): the result is lower-case ('chrx'/'chry'); confirm the server accepts this casing
		if label == 'x' or label == 'y':
			return 'chr' + label
		# Handle integer chromosomes 1 to 23
		try:
			number = int(label)
		except ValueError:
			raise CravatQueryException("Unable to resolve input: '{}' into a valid chromosome representation".format(_chr))
		# Raised outside of the try block so the range error is not swallowed
		if not 1 <= number <= 23:
			raise CravatQueryException("Chromosome of '{}' was out of range [1,23]".format(number))
		return 'chr' + str(number)


	@staticmethod
	def jump_header(in_file, out_file, headerlines=0):
		"""
		: Jumps over a header space of line number 'headerlines'. Sets up in_file so that
		: the next execution of in_file.readline() will return the first non-header line.
		: Args:
		:	in_file - Seekable file object to position
		:	out_file - Unused here; accepted so the method matches the callback call signature
		:	headerlines <int> - Number of lines to skip, counted from the start of the file
		"""
		# Always count header lines from the beginning of the file
		in_file.seek(0)
		for _ in range(headerlines):
			in_file.readline()


def _format_response_value(val):
	"""
	: Render a single response value as text for the tab-delimited output.
	: Values that can be read as numbers are written with 4 decimal places; anything else
	: is written via str(), so non-string JSON values (e.g. null) cannot break the write.
	"""
	try:
		return format(float(val), ".4f")
	except (TypeError, ValueError):
		return str(val)


def main(in_path, out_path, pre_callback=None, user_mapping=None):
	"""
	: Read the file line by line and use data to query cravat server.
	: One tab-delimited output line is written per successfully answered query. Rows that
	: cannot be turned into a query, or whose request fails, are reported via print and skipped.
	: Args:
	:	- in_path <str>: Path to input file
	:	- out_path <str>: Path to output file
	:	- pre_callback <function>: Optional. A function to handle the header space. Executed
			before main loop. Receives in_file and out_file as arguments
	:	- user_mapping <dict>: Optional. Passed to CravatQuery.from_array as 'mapping'
	"""

	with open(in_path, 'r') as in_file, \
	open(out_path, 'w') as out_file:

		# Perform any pre-processing steps, such as jumping a header space.
		# (Previously an undefined name 'fmt' was passed here, raising NameError.)
		if pre_callback:
			pre_callback(in_file, out_file)

		# main loop
		for line in in_file:

			line = line.strip()
			# Blank lines carry no query data
			if not line:
				continue

			try:
				# Create query from line of input data
				query = CravatQuery.from_array(line.split(CravatQuery.delimiter), user_mapping)
				# Make request. as_query_string must be CALLED; passing the bound method would
				# send its repr instead of the query.
				call = requests.get(CravatQuery.endpoint, params={'mutation': query.as_query_string()})
				if call.status_code != 200 or call.text == "":
					raise CravatQueryException("Bad Server Response. Respone code: '{}', Response Text: '{}'".format(call.status_code, call.text))
				json_response = json.loads(call.text)
				# Build the whole row before writing so a failure cannot leave a partial line.
				# NOTE(review): columns follow the order of the server response; CravatQuery.response_keys
				# is not applied here -- confirm whether a fixed column order is required.
				row = "\t".join(_format_response_value(val) for val in json_response.values())
				out_file.write(row)
				out_file.write("\n")
			# One bad row, failed request or non-JSON body must not abort the rest of the run
			except (CravatQueryException, requests.RequestException, ValueError) as e:
				print(e)
				
		


if __name__ == "__main__":

	# Both paths are required; fail with a usage message rather than an IndexError traceback
	if len(sys.argv) < 3:
		sys.exit("Usage: {} <in_path> <out_path> [user_mapping]".format(sys.argv[0]))

	# Input and output file paths, obtained from command line
	in_path = sys.argv[1]
	out_path = sys.argv[2]

	# Possibly allow user mapping configuration through here. Not fully implemented:
	# the raw argument is captured but not handed to main(), because it is an unparsed
	# string rather than a mapping of keys to indices.
	# The optional third argument exists only when len(sys.argv) > 3; the previous
	# '> 2' check raised IndexError on the normal two-argument invocation.
	user_mapping = None
	if len(sys.argv) > 3:
		user_mapping = sys.argv[3]

	# Run the main operation
	main(in_path, out_path)