# HG changeset patch
# User gregory-minevich
# Date 1332255183 14400
# Node ID a43cb9a57a9a6dbcbccb1fc90314117bf40b855a

Uploaded

diff -r 000000000000 -r a43cb9a57a9a ._EMS_VariantDensityMapping.py
Binary file ._EMS_VariantDensityMapping.py has changed
diff -r 000000000000 -r a43cb9a57a9a ._EMS_VariantDensityMapping.xml
Binary file ._EMS_VariantDensityMapping.xml has changed
diff -r 000000000000 -r a43cb9a57a9a EMS_VariantDensityMapping.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/EMS_VariantDensityMapping.py	Tue Mar 20 10:53:03 2012 -0400
@@ -0,0 +1,122 @@
+#!/usr/bin/python
+
+import sys
+import optparse
+import csv
+from rpy import *
+
+def main():
+	parser = optparse.OptionParser()
+	parser.add_option('-s', '--snp_vcf', dest = 'snp_vcf', action = 'store', type = 'string', default = None, help = "VCF of SNPs")
+	parser.add_option('-c', '--hist_color', dest = 'hist_color', action = 'store', type = 'string', default = "darkgray", help = "Color for 1Mb histograms") 
+	parser.add_option('-y', '--ylim', dest = 'ylim', action = 'store', type = 'int', default= 100, help = "Upper limit of Y axis")
+	parser.add_option('-z', '--standardize', dest = 'standardize', default= 'false', help = "Standardize X-axis")
+	parser.add_option('-e', '--ems', dest = 'ems', default= 'false', help = "Whether EMS variants should be filtered for")
+	parser.add_option('-o', '--output', dest = 'plot_output', action = 'store', type = 'string', default = 'EMS_Variant_Density_Plot.pdf', help = "Output file name of plot")
+	(options, args) = parser.parse_args()
+
+
+	i, ii, iii, iv, v, x = parse_snp_vcf(snp_vcf = options.snp_vcf, ems=options.ems)
+	create_histograms(plot_output = options.plot_output, hist_color=options.hist_color, ylim=options.ylim, ems=options.ems, standardize=options.standardize, i = i, ii = ii, iii = iii, iv = iv, v = v, x = x)
+
+def create_histograms(plot_output = None, hist_color=None, ylim=None, ems=None, standardize=None , i = None, ii = None, iii = None, iv = None, v = None, x = None):
+	breaks = { 'I' : 16 , 'II' : 16,  'III' : 14, 'IV' : 18, 'V' : 21, 'X' : 18 }
+
+	try:
+        	r.pdf(plot_output, 8, 8)
+		if len(i) > 0:
+		        plot_data(position_list = i, chr = "I", breaks = breaks["I"], hist_color=hist_color, ylim=ylim, ems=ems, standardize=standardize)
+        	if len(ii) > 0:
+			plot_data(position_list = ii, chr = "II", breaks = breaks["II"], hist_color=hist_color, ylim=ylim, ems=ems, standardize=standardize)
+		if len(iii) > 0:
+		        plot_data(position_list = iii, chr = "III", breaks = breaks["III"], hist_color=hist_color, ylim=ylim, ems=ems, standardize=standardize)
+        	if len(iv) > 0:
+			plot_data(position_list = iv, chr = "IV", breaks = breaks["IV"], hist_color=hist_color, ylim=ylim, ems=ems, standardize=standardize)
+		if len(v) > 0:
+		        plot_data(position_list = v, chr = "V", breaks = breaks["V"], hist_color=hist_color, ylim=ylim, ems=ems, standardize=standardize)
+		if len(x) > 0:
+	        	plot_data(position_list = x, chr = "X", breaks = breaks["X"], hist_color=hist_color, ylim=ylim, ems=ems, standardize=standardize)
+	        r.dev_off()
+    	except Exception as inst:
+        	print inst
+        	print "There was an error creating the plot pdf... Please try again"
+
+def parse_snp_vcf(snp_vcf = None, ems=None):
+	i_file = open(snp_vcf, 'rU')
+	reader = csv.reader(i_file, delimiter = '\t', quoting = csv.QUOTE_NONE)
+
+    	skip_headers(reader = reader, i_file = i_file)
+
+	i_position_list = []
+	ii_position_list = []
+	iii_position_list = []
+	iv_position_list = []
+	v_position_list = []
+	x_position_list = []
+
+	for row in reader:
+		chromosome = row[0]
+		position = row[1]
+		ref_allele = row[3]
+		alt_allele = row[4]
+
+		if  (ems=='true'):
+			if (ref_allele =="G" or ref_allele =="C") and (alt_allele =="A" or alt_allele =="T"):
+				if chromosome == "I":
+					i_position_list.append(position)
+				elif chromosome == "II":
+					ii_position_list.append(position)
+				elif chromosome == "III":
+					iii_position_list.append(position)
+				elif chromosome == "IV":
+					iv_position_list.append(position)
+				elif chromosome == "V":
+					v_position_list.append(position)
+				elif chromosome == "X":
+					x_position_list.append(position)
+		elif (ems=='false'):
+			if chromosome == "I":
+				i_position_list.append(position)
+			elif chromosome == "II":
+				ii_position_list.append(position)
+			elif chromosome == "III":
+				iii_position_list.append(position)
+			elif chromosome == "IV":
+				iv_position_list.append(position)
+			elif chromosome == "V":
+				v_position_list.append(position)
+			elif chromosome == "X":
+				x_position_list.append(position)
+
+	return i_position_list, ii_position_list, iii_position_list, iv_position_list, v_position_list, x_position_list
+
+def skip_headers(reader = None, i_file = None):
+	# count headers
+	comment = 0
+	while reader.next()[0].startswith('#'):
+		comment = comment + 1
+	
+	# skip headers
+	i_file.seek(0)
+	for i in range(0, comment):
+		reader.next()
+
+def plot_data(position_list = None, chr = None, breaks = None, hist_color=None, ylim = None, ems=None, standardize=None):
+	positions = ",".join(map(str, map(lambda x: float(x) / 1000000, position_list)))
+	positions = "c(" + positions + ")"
+	
+	if (standardize=='true'):
+		r("hist(" + positions + ", xlim=c(0,21), ylim=c(0, %d "%ylim +"),col='"+ hist_color + "', breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=1), main = 'LG " + chr + "', ylab = 'Frequency Of SNPs', xlab = 'Location (Mb)')")
+		r("hist(" + positions + ", xlim=c(0,21), add=TRUE,  ylim=c(0, %d "%ylim +"), col=rgb(1, 0, 0, 1), breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=.5), main = 'Chr " + chr + "', ylab = 'Number Of SNPs', xlab = 'Location (Mb)')")
+		r("axis(1, at=seq(0, 21, by=1), labels=FALSE, tcl=-0.5)")
+		r("axis(1, at=seq(0, 21, by=0.5), labels=FALSE, tcl=-0.25)")
+	elif (standardize=='false'):
+		r("hist(" + positions + ", xlim=c(0,as.integer( ' " + str(breaks) + " ')), ylim=c(0, %d "%ylim +"),col='"+ hist_color + "', breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=1), main = 'LG " + chr + "', ylab = 'Frequency Of SNPs', xlab = 'Location (Mb)')")
+		r("hist(" + positions + ", xlim=c(0,as.integer( ' " + str(breaks) + " ')), add=TRUE,  ylim=c(0, %d "%ylim +"), col=rgb(1, 0, 0, 1), breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=.5), main = 'Chr " + chr + "', ylab = 'Number Of SNPs', xlab = 'Location (Mb)')")
+		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by=1), labels=FALSE, tcl=-0.5)")
+		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by=0.5), labels=FALSE, tcl=-0.25)")
+
+
+
+if __name__ == "__main__":
+	main()
diff -r 000000000000 -r a43cb9a57a9a EMS_VariantDensityMapping.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/EMS_VariantDensityMapping.xml	Tue Mar 20 10:53:03 2012 -0400
@@ -0,0 +1,73 @@
+<tool id="ems_variant_density_mapping" name="EMS Variant Density Mapping">
+    <description>Map a mutation by linkage to regions of high mutation density using WGS data</description>
+    <command interpreter="python">EMS_VariantDensityMapping.py  --snp_vcf $snp_vcf  --ylim $ylim --hist_color $hist_color --standardize $standardize --ems $ems --output $output </command>
+    <inputs>
+	<param name="snp_vcf" type="data" format="vcf" label="VCF of SNPs" help="Takes a VCF file of WGS SNPs present in a C.elegans mutant strain that has been backcrossed to its (pre-mutagenesis) starting strain"/>
+	<param name="ylim" size = "15" type="integer" value="200" label="Y-axis upper limit"/> 
+	<param name="hist_color" size = "15" type="text" value="darkgray" label="Color for 1Mb bins" help="See below for list of supported colors"/> 
+	<param name="standardize" type="boolean" truevalue="true" falsevalue="false" checked="false"  label="Standardize X-axis" help="Histogram plots from separate chromosomes will have uniform X-axis spacing for comparison"/>
+	<param name="ems" type="boolean" truevalue="true" falsevalue="false" checked="false"  label="Filter for most common EMS-induced variants (G/C—>A/T)"/> 
+    </inputs>
+    <outputs>
+        <data name="output" type="text" format="pdf" />
+    </outputs>
+    <requirements>
+        <requirement type="python-module">sys</requirement>
+        <requirement type="python-module">optparse</requirement>
+        <requirement type="python-module">csv</requirement>
+        <requirement type="python-module">re</requirement>
+	<requirement type="python-module">decimal</requirement>
+        <requirement type="python-module">rpy</requirement>
+    </requirements>
+    <tests>
+	<param name="snp_vcf" value="" />
+	<output name="output" file="" />
+    </tests>
+    <help>
+**What it does:** 
+
+Following the approach detailed in Zuryn et al., Genetics 2010, this tool plots histograms of SNP density in a mutant C.elegans strain that has been backcrossed to its (pre-mutagenesis) starting strain. Sample output where LG III shows linkage to the causal mutation is shown below. (Common variants from another strain have been subtracted and remaining variants have been filtered for most common EMS-induced mutations i.e. G/C --> A/T):
+
+.. image:: http://biochemistry.hs.columbia.edu/labs/hobert/CloudMap/EMS_Variant_Density.pdf
+
+
+
+
+
+The experimental approach is detailed in Figure 1a from Zuryn et al., Genetics 2010:
+
+.. image:: http://biochemistry.hs.columbia.edu/labs/hobert/CloudMap/Zuryn_2010_Genetics_Fig1a.pdf
+
+
+Subtracting common (non-phenotype causing) variants present in multiple WGS strains (using GATK Tools Select Variants) will result in less noise and a tighter mapping region. Additional backcrosses will also result in a smaller mapping region.
+
+------
+
+**Settings:**
+
+.. class:: infomark
+
+Supported colors for data points and loess regression line:
+
+http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
+
+http://research.stowers-institute.org/efg/R/Color/Chart/ColorChart.pdf
+
+
+
+
+.. class:: warningmark
+
+This tool requires that R has been installed on your system (http://www.r-project.org/).
+
+------
+
+**Citation:**
+
+This tool is part of the CloudMap package from the Hobert Lab. If you use this tool, please cite `Gregory Minevich, Danny Park, Richard J. Poole and Oliver Hobert CloudMap: A Cloud-based Pipeline for Analysis of Mutant Genome Sequences. (2012 In Preparation)`__
+
+    .. __: http://biochemistry.hs.columbia.edu/labs/hobert/literature.html
+
+
+    </help>
+</tool>