changeset 2:01d98465f180 draft

Deleted selected files
author gregory-minevich
date Mon, 08 Oct 2012 16:16:08 -0400
parents 2ce68f42fd3a
children a2ee9bed43c0
files VDM_Mapping.py VDM_Mapping.xml
diffstat 2 files changed, 0 insertions(+), 571 deletions(-) [+]
--- a/VDM_Mapping.py	Mon Oct 08 13:53:22 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,437 +0,0 @@
-#!/usr/bin/python
-
-import re
-import sys
-import optparse
-import csv
-import pprint
-from decimal import *
-from rpy import *
-
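-# Parse command-line options, parse the sample VCF, write the tabular
-# per-position output, and draw per-chromosome scatter and frequency plots to PDF.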
-def main():
-	csv.field_size_limit(1000000000)
-
-	parser = optparse.OptionParser()
-	parser.add_option('-v', '--sample_vcf', dest = 'sample_vcf', action = 'store', type = 'string', default = None, help = "Sample VCF from GATK Unified Genotyper")
-	parser.add_option('-l', '--loess_span', dest = 'loess_span', action = 'store', type = 'float', default = .01, help = "Loess span")
-	parser.add_option('-d', '--d_yaxis', dest = 'd_yaxis', action = 'store', type = 'float', default = .7, help = "y-axis upper limit for dot plot")  
-	parser.add_option('-y', '--h_yaxis', dest = 'h_yaxis', action = 'store', type = 'int', default = 5, help = "y-axis upper limit for histogram plot")   
-	parser.add_option('-c', '--points_color', dest = 'points_color', action = 'store', type = 'string', default = "gray27", help = "Color for data points") 
-	parser.add_option('-k', '--loess_color', dest = 'loess_color', action = 'store', type = 'string', default = "red", help = "Color for loess regression line")        
-	parser.add_option('-z', '--standardize', dest = 'standardize', default= 'true', help = "Standardize X-axis")
-	parser.add_option('-b', '--break_file', dest = 'break_file', action = 'store', type = 'string', default = 'C.elegans', help = "File defining the breaks per chromosome")
-	parser.add_option('-x', '--bin_size', dest = 'bin_size', action = 'store', type = 'int', default = 1000000, help = "Size of histogram bins, default is 1mb")
-	parser.add_option('-n', '--normalize_bins', dest = 'normalize_bins', default= 'true', help = "Normalize histograms")
-
-
-	parser.add_option('-o', '--output', dest = 'output', action = 'store', type = 'string', default = None, help = "Output file name")
-	parser.add_option('-s', '--location_plot_output', dest = 'location_plot_output', action = 'store', type = 'string', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location")
-
-	(options, args) = parser.parse_args()
-
-	vcf_info = parse_vcf(sample_vcf = options.sample_vcf)
-
-	output_vcf_info(output = options.output, vcf_info = vcf_info)
-	
-	#output plot with all ratios
-	rounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000)
-	
-	normalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = rounded_bin_size, normalize_bins = options.normalize_bins)
-	normalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(vcf_info = vcf_info, xbase = (rounded_bin_size / 2), normalize_bins = options.normalize_bins)
-
-	break_dict = parse_breaks(break_file = options.break_file)
-
-	output_scatter_plots_by_location(location_plot_output = options.location_plot_output, vcf_info = vcf_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size)
-
-
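-# Count the leading '#' header lines, then rewind the file and advance the
-# reader past them so the next row returned is the first data row.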
-def skip_headers(reader = None, i_file = None):
-	# count headers
-	comment = 0
-	while reader.next()[0].startswith('#'):
-		comment = comment + 1
-	
-	# skip headers
-	i_file.seek(0)
-	for i in range(0, comment):
-		reader.next()
-
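-# Return a dict mapping chromosome name to its number of x-axis breaks.
-# Built-in tables are provided for C. elegans and Arabidopsis; otherwise a
-# tab-delimited file of chromosome and break count is read, with any
-# "chr"/"CHROMOSOME_" prefix stripped from the chromosome name.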
-def parse_breaks(break_file = None):
-	if break_file == 'C.elegans':
-		break_dict = { 'I' : 16 , 'II' : 16,  'III' : 14, 'IV' : 18, 'V' : 21, 'X' : 18 }
-		return break_dict
-	elif break_file == 'Arabidopsis':
-		break_dict = { '1' : 31 , '2' : 20,  '3' : 24, '4' : 19, '5' : 27 }
-		return break_dict
-	else:
-		i_file = open(break_file, 'rU')
-		break_dict = {}
-		reader = csv.reader(i_file, delimiter = '\t')
-		for row in reader:
-			chromosome = row[0].upper()
-			chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
-			chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
-			break_count = row[1]
-			break_dict[chromosome] = int(break_count)
-		return break_dict
-
-
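-# cmp-style comparator for "chromosome:position" keys: order by chromosome
-# name first, then by integer position within a chromosome.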
-def location_comparer(location_1, location_2):
-	chr_loc_1 = location_1.split(':')[0]
-	pos_loc_1 = int(location_1.split(':')[1])
-
-	chr_loc_2 = location_2.split(':')[0]
-	pos_loc_2 = int(location_2.split(':')[1])
-
-	if chr_loc_1 == chr_loc_2:
-		if pos_loc_1 < pos_loc_2:
-			return -1
-		elif pos_loc_1 == pos_loc_2:
-			return 0
-		elif pos_loc_1 > pos_loc_2:
-			return 1
-	elif chr_loc_1 < chr_loc_2:
-		return -1
-	elif chr_loc_1 > chr_loc_2:
-		return 1
-
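-# Write the parsed VCF information as a tab-delimited table (chromosome,
-# position, alt count, ref count, read depth, ratio), sorted by location.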
-def output_vcf_info(output = None, vcf_info = None):
-	o_file = open(output, 'wb')
-	writer = csv.writer(o_file, delimiter = '\t')
-
-	writer.writerow(["#Chr", "Pos", "Alt Count", "Ref Count", "Read Depth", "Ratio"])
-
-	location_sorted_vcf_info_keys = sorted(vcf_info.keys(), cmp=location_comparer)
-
-	for location in location_sorted_vcf_info_keys:
-		alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
-		
-		location_info = location.split(':')
-		chromosome = location_info[0]
-		position = location_info[1]
-
-		writer.writerow([chromosome, position, alt_allele_count, ref_allele_count, read_depth, ratio])
-
-	o_file.close()
-
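-# Group ratios by chromosome (walking locations in sorted order) and, for each
-# chromosome, draw its scatter/loess plot and frequency histograms into a
-# single PDF via rpy.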
-def output_scatter_plots_by_location(location_plot_output = None, vcf_info = None, loess_span="", d_yaxis="", h_yaxis="", points_color="", loess_color="", standardize=None, normalized_hist_per_mb = None, normalized_hist_per_5kb = None, breaks = None, rounded_bin_size = 1000000):
-	positions = {}
-	current_chr = ""
-	prev_chr = ""
-
-	x_label = "Location (Mb)"
-	filtered_label = ''
-
-	location_sorted_vcf_info_keys = sorted(vcf_info.keys(), cmp=location_comparer)
-	
-	break_unit = Decimal(rounded_bin_size) / Decimal(1000000)
-	max_breaks = max(breaks.values())
-
-	try:
-		r.pdf(location_plot_output, 8, 8)
-	
-		for location in location_sorted_vcf_info_keys:
-			current_chr = location.split(':')[0]
-			position = location.split(':')[1]
-
-			alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
-		
-			if prev_chr != current_chr:
-				if prev_chr != "":
-					hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = prev_chr)
-					hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = prev_chr)
-					
-					plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = prev_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[prev_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit)
-				
-				prev_chr = current_chr
-				positions = {}
-		
-			positions[position] = ratio
-
-		hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = current_chr)
-		hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = current_chr)
-							
-		plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = current_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[current_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit)
-
-		r.dev_off()
-		
-	except Exception as inst:
-		print inst
-		print "There was an error creating the location plot pdf... Please try again"
-
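-# Pull the histogram bins for one chromosome out of the genome-wide dict and
-# fill any missing bins (up to the last occupied bin) with 0.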
-def get_hist_dict_by_chr(normalized_hist_per_xbase = None, chr = ''):
-	hist_dict = {}	
-
-	for location in normalized_hist_per_xbase:
-		chromosome = location.split(':')[0]		
-		if chromosome == chr:
-			position = int(location.split(':')[1])
-			hist_dict[position] = normalized_hist_per_xbase[location]
-	
-	max_location = max(hist_dict.keys(), key=int)
-	for i in range(1, max_location):
-		if i not in hist_dict:
-			hist_dict[i] = 0	
-	
-	return hist_dict	
-
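-# Serialize the per-chromosome ratios and histogram bins into R vectors, then
-# draw the scatter plot with a loess regression line followed by the overlaid
-# full-bin (gray) and half-bin (red) frequency barplots.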
-def plot_data(chr_dict =  None, hist_dict_mb = None, hist_dict_5kb = None, chr = "", x_label = "", divide_position = False, draw_secondary_grid_lines = False, loess_span=None, d_yaxis=None, h_yaxis=None, points_color="", loess_color="", breaks = None, standardize= None, max_breaks = 1, break_unit = 1):
-	ratios = "c("
-	positions = "c("
-	
-	for position in chr_dict:
-		ratio = chr_dict[position]
-		if divide_position:
-			position = float(position) / 1000000.0
-		positions = positions + str(position) + ", "
-		ratios = ratios + str(ratio) + ", "
-
-	if len(ratios) == 2:
-		ratios = ratios + ")"
-	else:
-		ratios = ratios[0:len(ratios) - 2] + ")"
-
-	if len(positions) == 2:
-		positions = positions + ")"
-	else:
-		positions = positions[0:len(positions) - 2] + ")"
-
-	r("x <- " + positions)
-	r("y <- " + ratios)
-
-	hist_mb_values = "c("
-	for position in sorted(hist_dict_mb):
-		hist_mb_values = hist_mb_values + str(hist_dict_mb[position]) + ", "
-	
-	if len(hist_mb_values) == 2:
-		hist_mb_values = hist_mb_values + ")"
-	else:
-		hist_mb_values = hist_mb_values[0:len(hist_mb_values) - 2] + ")"
-
-	hist_5kb_values = "c("
-	for position in sorted(hist_dict_5kb):
-		hist_5kb_values = hist_5kb_values + str(hist_dict_5kb[position]) + ", "	
-
-	if len(hist_5kb_values) == 2:
-		hist_5kb_values = hist_5kb_values + ")"
-	else:
-		hist_5kb_values = hist_5kb_values[0:len(hist_5kb_values) - 2] + ")"
-
-	r("xz <- " + hist_mb_values)
-	r("yz <- " + hist_5kb_values)
-
-
-	max_break_str = str(max_breaks)
-	break_unit_str = str(Decimal(break_unit)) 	
-	half_break_unit_str = str(Decimal(break_unit) / Decimal(2))
-	break_penta_unit_str = str(Decimal(break_unit) * Decimal(5))
-
-	if (standardize=='true'):  
-		r("plot(x, y, cex=0.60, xlim=c(0," + max_break_str + "), main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')")
-		r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')")
-		r("axis(1, at=seq(0, " + max_break_str + ", by=" + break_unit_str + "), labels=FALSE, tcl=-0.5)")
-		r("axis(1, at=seq(0, " + max_break_str + ", by=" + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
-		r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)")
-	elif (standardize=='false'):
-		r("plot(x, y, cex=0.60, main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')")
-		r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')")    
-		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
-		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")	
-		r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)")
-
-	if draw_secondary_grid_lines:
-		r("abline(h = seq(floor(min(y)), 1, by=0.1), v = seq(floor(min(x)), length(x), by= 1), col='gray')")
-	else:
-		r("grid(lty = 1, col = 'gray')")
-
-	if (standardize=='true'):
-		r("barplot(xz, xlim=c(0, " + max_break_str + "), ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = " + break_unit_str + ", xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')")
-		r("barplot(yz, space = 0, add=TRUE, width = " + half_break_unit_str + ", col=rgb(1, 0, 0, 1))")	
-		r("axis(1, hadj = 1, at=seq(0, " + max_break_str + ", by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
-		r("axis(1, at=seq(0, " + max_break_str + ", by= " + break_penta_unit_str + "), labels=TRUE, tcl=-0.5)")
-		r("axis(1, at=seq(0, " + max_break_str + ", by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
-	elif (standardize=='false'):
-		r("barplot(xz, ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = 1, xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')")
-		r("barplot(yz, space = 0, add=TRUE, width = 0.5, col=rgb(1, 0, 0, 1))")	
-		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
-		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_penta_unit_str + "), labels=TRUE, tcl=-0.5)")
-		r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
-
-
-		
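-# For each bin of xbase bases, count the SNP positions whose ratio equals 1
-# ("pure parental" positions). When normalization is requested, scale each bin
-# to (one_ratio_count^2 / (total_count - one_ratio_count)) * chromosome mean of
-# ratio == 1 counts, with special cases for empty and fully saturated bins.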
-def calculate_normalized_histogram_bins_per_xbase(vcf_info = None, xbase = 1000000, normalize_bins = None):
-	normalized_histogram_bins_per_xbase = {}
-
-	ref_snp_count_per_xbase = get_ref_snp_count_per_xbase(vcf_info = vcf_info, xbase = xbase)
-	mean_one_ratio_snp_count_per_chromosome = get_mean_one_ratio_snp_count_per_chromosome(vcf_info = vcf_info, xbase = xbase)
-	one_ratio_snp_count_per_xbase = get_one_ratio_snp_count_per_xbase(vcf_info = vcf_info, xbase = xbase)
-
-	for location in ref_snp_count_per_xbase:
-		chromosome = location.split(':')[0]
-		mean_one_ratio_snp_count = mean_one_ratio_snp_count_per_chromosome[chromosome]
-		ref_snp_count = ref_snp_count_per_xbase[location]
-
-		one_ratio_snp_count = 0
-		if location in one_ratio_snp_count_per_xbase:
-			one_ratio_snp_count = one_ratio_snp_count_per_xbase[location]	
-
-		if normalize_bins == 'true':							
-			if one_ratio_snp_count == 0:
-				normalized_histogram_bins_per_xbase[location] = 0
-			elif one_ratio_snp_count == ref_snp_count:
-				normalized_histogram_bins_per_xbase[location] = (Decimal(one_ratio_snp_count**2) * Decimal(mean_one_ratio_snp_count))					
-			else:
-				normalized_histogram_bins_per_xbase[location] = (Decimal(one_ratio_snp_count**2) / (Decimal(ref_snp_count)-Decimal(one_ratio_snp_count))) * Decimal(mean_one_ratio_snp_count)					
-		else:
-			normalized_histogram_bins_per_xbase[location] = one_ratio_snp_count
-
-	return normalized_histogram_bins_per_xbase
-
-
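-# Count every variant position falling in each bin of xbase bases, keyed by
-# "chromosome:bin index" (1-based bins).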
-def get_ref_snp_count_per_xbase(vcf_info = None, xbase = 1000000):
-	ref_snps_per_xbase = {}
-
-	for location in vcf_info:
-		location_info = location.split(':')
-
-		chromosome = location_info[0].upper()
-		chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
-		chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
-
-		position = location_info[1]
-		xbase_position = (int(position) / xbase) + 1
-
-		location = chromosome + ":" + str(xbase_position)
-		if location in ref_snps_per_xbase:
-			ref_snps_per_xbase[location] = ref_snps_per_xbase[location] + 1
-		else:
-			ref_snps_per_xbase[location] = 1
-
-	return ref_snps_per_xbase
-
-
-
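-# Count ratio == 1 positions per occupied bin (bins containing only other
-# ratios count as 0), then average the bin counts into a single mean per
-# chromosome.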
-def get_mean_one_ratio_snp_count_per_chromosome(vcf_info, xbase = 1000000):
-	sample_snp_count_per_xbase = {}
-	
-	for location in vcf_info:
-		alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
-			
-		location_info = location.split(':')
-		chromosome = location_info[0]
-		position = location_info[1]
-		xbase_position = (int(position) / xbase) + 1
-		xbase_location = chromosome + ":" + str(xbase_position)
-		
-		if ratio == 1:
-			if xbase_location in sample_snp_count_per_xbase:
-				sample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1
-			else:
-				sample_snp_count_per_xbase[xbase_location] = 1
-		
-		elif ratio != 1 and xbase_location not in sample_snp_count_per_xbase:
-			sample_snp_count_per_xbase[xbase_location] = 0
-
-	mean_one_ratio_snp_count_per_chromosome = {}
-	for location in sample_snp_count_per_xbase:
-		chromosome = location.split(':')[0]
-		sample_count = sample_snp_count_per_xbase[location]
-		if chromosome in mean_one_ratio_snp_count_per_chromosome:
-			mean_one_ratio_snp_count_per_chromosome[chromosome].append(sample_count)
-		else:
-			mean_one_ratio_snp_count_per_chromosome[chromosome] = [sample_count]
-
-	for chromosome in mean_one_ratio_snp_count_per_chromosome:
-		summa = sum(mean_one_ratio_snp_count_per_chromosome[chromosome])
-		count = len(mean_one_ratio_snp_count_per_chromosome[chromosome])
-
-		mean_one_ratio_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count)
-
-	return mean_one_ratio_snp_count_per_chromosome
-
-
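-# Count the positions with ratio == 1 in each occupied bin of xbase bases,
-# keyed by "chromosome:bin index".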
-def get_one_ratio_snp_count_per_xbase(vcf_info = None, xbase = 1000000):
-	one_ratio_snp_count_per_xbase = {}
-
-	for location in vcf_info:
-		alt_allele_count, ref_allele_count, read_depth, ratio = vcf_info[location]
-			
-		location_info = location.split(':')
-		chromosome = location_info[0]
-		position = location_info[1]
-		xbase_position = (int(position) / xbase) + 1
-		xbase_location = chromosome + ":" + str(xbase_position)
-		
-		if ratio == 1:
-			if xbase_location in one_ratio_snp_count_per_xbase:
-				one_ratio_snp_count_per_xbase[xbase_location] = one_ratio_snp_count_per_xbase[xbase_location] + 1
-			else:
-				one_ratio_snp_count_per_xbase[xbase_location] = 1
-
-		elif ratio != 1 and xbase_location not in one_ratio_snp_count_per_xbase:
-			one_ratio_snp_count_per_xbase[xbase_location] = 0
-
-	return one_ratio_snp_count_per_xbase
-
-
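-# Read the GATK Unified Genotyper VCF: strip chr/CHROMOSOME_ prefixes, skip
-# mitochondrial (MTDNA) positions, pull read depth (DP) and allele depths (AD)
-# from the sample column, and store (alt count, ref count, depth, alt/depth
-# ratio) keyed by "chromosome:position". Positions with zero depth are skipped.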
-def parse_vcf(sample_vcf = None):
-	i_file = open(sample_vcf, 'rU')
-	reader = csv.reader(i_file, delimiter = '\t', quoting = csv.QUOTE_NONE)	
-
-	skip_headers(reader = reader, i_file = i_file)
-	vcf_info = {}
-
-	for row in reader:
-		chromosome = row[0].upper()
-		chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
-		chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
-
-		if chromosome != 'MTDNA':
-			position = row[1]
-			#ref_allele = row[2]
-			#read_depth = row[3]
-			#read_bases = row[4]
-
-			vcf_format_info = row[8].split(":")
-			vcf_allele_freq_data = row[9] 
-			
-			read_depth_data_index = vcf_format_info.index("DP")
-			read_depth = vcf_allele_freq_data.split(":")[read_depth_data_index]
-
-			ref_and_alt_counts_data_index = vcf_format_info.index("AD")
-			ref_and_alt_counts = vcf_allele_freq_data.split(":")[ref_and_alt_counts_data_index]	
-			ref_allele_count = ref_and_alt_counts.split(",")[0]
-			alt_allele_count = ref_and_alt_counts.split(",")[1]
-
-			location = chromosome + ":" + position
-	
-			if Decimal(read_depth) != 0:
-				getcontext().prec = 6	
-				ratio = Decimal(alt_allele_count) / Decimal(read_depth)
-	
-				vcf_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio)
-		
-				#debug line
-				#print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id
-
-	i_file.close()
-
-	return vcf_info
-
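-# Count reference ('.' or ',') and alternate allele calls in a samtools pileup
-# read-bases string after removing end-of-read and read-start markers.
-# (Not called elsewhere in this script.)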
-def parse_read_bases(read_bases = None, alt_allele = None):
-	read_bases = re.sub('\$', '', read_bases)
-	read_bases = re.sub('\^[^\s]', '', read_bases)
-
-	ref_allele_matches = re.findall("\.|\,", read_bases)
-	ref_allele_count = len(ref_allele_matches)
-
-	alt_allele_matches = re.findall(alt_allele, read_bases, flags = re.IGNORECASE)
-	alt_allele_count = len(alt_allele_matches)
-
-	#debug line
-	#print read_bases, alt_allele, alt_allele_count, ref_allele_count
-
-	return ref_allele_count, alt_allele_count
-
-if __name__ == "__main__":
-	main()
--- a/VDM_Mapping.xml	Mon Oct 08 13:53:22 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,134 +0,0 @@
-<tool id="cloudmap_variant_discovery_mapping" name="CloudMap: Variant Discovery Mapping with WGS data">
-    <description>Map a mutation by in silico bulk segregant linkage analysis, using variants that are already present in the mutant strain of interest (rather than variants introduced by a cross to a polymorphic strain).</description>
-    <command interpreter="python">
-	#if $source.source_select=="elegans" #VDM_Mapping.py --sample_vcf $sample_vcf --loess_span $loess_span --d_yaxis $d_yaxis --h_yaxis $h_yaxis --points_color "$points_color" --loess_color "$loess_color" --output $output --location_plot_output $location_plot_output --standardize $standardize --normalize_bins $normalize_bins --break_file $source.Celegans 
-	#else if $source.source_select=="arabidopsis" #VDM_Mapping.py --sample_vcf $sample_vcf --loess_span $loess_span --d_yaxis $d_yaxis --h_yaxis $h_yaxis --points_color "$points_color" --loess_color "$loess_color" --output $output --location_plot_output $location_plot_output --standardize $standardize --normalize_bins $normalize_bins --break_file $source.Arabidop
-	#else if  $source.source_select=="other" #VDM_Mapping.py --sample_vcf $sample_vcf --loess_span $loess_span --d_yaxis $d_yaxis --h_yaxis $h_yaxis --points_color "$points_color" --loess_color "$loess_color" --output $output --location_plot_output $location_plot_output --standardize $standardize --normalize_bins $normalize_bins --break_file $source.Other 
-	#end if 
-    </command>
-
-    <inputs>
-	<conditional name="source">
-		<param name="source_select" type="select" label="Please select the species">
-		        <option value="elegans">C. elegans</option>
-        		<option value="arabidopsis">Arabidopsis</option>
-        		<option value="other">Other</option>
-      		</param>
-      		<when value="elegans">
-        		<param name="Celegans" type="hidden" value="C.elegans" label="The C.elegans configuration file by default" help="C.elegans help" />
-		</when>
-      		<when value="arabidopsis">
-        		<param name="Arabidop" type="hidden" value="Arabidopsis" label="The Arabidopsis configuration file by default" help="Arabidopsis help" />
-		</when>
-      		<when value="other">
-        		<param name="Other" type="data" format="tabular" label="Please select your 'Other species' configuration file from your history" help="Tabular configuration file for Other species support" />
-		</when>
-        </conditional>  
-		<param name="sample_vcf" size = "125" type="data" format="vcf" label="WGS Mutant VCF File" help="WGS Mutant VCF file from pooled F2 mutants that have been outcrossed to any strain. The VCF should contain only heterozygous or homozygous base positions as determined by the GATK Unified Genotyper, filtered for quality score > Q200" />
-		<param name="loess_span" size = "15" type="float" value=".4" label="Loess span" help="Parameter that controls the degree of data smoothing."/>    
-		<param name="d_yaxis" size = "15" type="float" value="1" label="Y-axis upper limit for scatter plot" />
-		<param name="h_yaxis" size = "15" type="integer" value="20" label="Y-axis upper limit for frequency plot" />
-		<param name="points_color" size = "15" type="text" value="gray27" label="Color for data points" help="See below for list of supported colors"/> 
-		<param name="loess_color" size = "15" type="text" value="red" label="Color for loess regression line" help="See below for list of supported colors"/>
-		<param name="standardize" type="boolean" truevalue="true" falsevalue="false" checked="true"  label="Standardize X-axis" help="Scatter plots and frequency plots from separate chromosomes will have uniform X-axis spacing for comparison"/>
-		<param name="normalize_bins" type="boolean" truevalue="true" falsevalue="false" checked="true"  label="Normalize frequency plots" help="Frequency plots of pure parental allele counts will be normalized according to the equation in Supp Fig.4 of the CloudMap paper"/>
-    </inputs>
-    <outputs>
-        <data name="output" format="tabular" />
-        <data name="location_plot_output" format="pdf" />
-    </outputs>
-    <requirements>
-        <requirement type="python-module">sys</requirement>
-        <requirement type="python-module">optparse</requirement>
-        <requirement type="python-module">csv</requirement>
-        <requirement type="python-module">re</requirement>
-	<requirement type="python-module">decimal</requirement>
-        <requirement type="python-module">rpy</requirement>
-    </requirements>
-    <tests>
-	<param name="sample_vcf" value="" />
-	<output name="output" file="" />
-	<output name="location_plot_output" file="" />
-    </tests>
-    <help>
-**What it does:** 
-
-This tool is part of the CloudMap pipeline for analysis of mutant genome sequences. For further details, please see `Gregory Minevich, Danny S. Park, Daniel Blankenberg, Richard J. Poole and Oliver Hobert.  CloudMap: A Cloud-based Pipeline for Analysis of Mutant Genome Sequences. (Genetics 2012 In Press)`__
-
-    .. __: http://biochemistry.hs.columbia.edu/labs/hobert/literature.html
-
-CloudMap workflows, shared histories and reference datasets are available at the `CloudMap Galaxy page`__ 
-
-    .. __: http://usegalaxy.org/cloudmap 
-
-
-Although Hawaiian Variant Mapping is the preferred method for mapping causal mutations in whole genome sequenced strains (see CloudMap Hawaiian Variant Mapping with WGS tool), there remain certain scenarios where alternate mapping approaches are useful. For instance, introducing tens of thousands of Hawaiian variants into a mutant strain may not be desirable for individuals concerned with the possibility that some of these Hawaiian variants may act as modifiers of a given phenotype. Behavioral mutants may be especially vulnerable in this regard. Furthermore, in the case of suppressor screens or other screens that have been performed in a mutant background, it is tedious to recover both the suppressor variant and the starting mutation when picking the F2 progeny required for the Hawaiian Variant Mapping technique. In these scenarios, it is useful to not have to rely on a polymorphic mapping strain like the Hawaiian strain.
-
-A recent study in plants (Abe et al. 2012) uses EMS-induced variants and bulk segregant analysis to map a phenotype-causing mutation. We have developed a similar method, which we call “Variant Discovery Mapping”. Our method makes use of background variants in addition to EMS-induced variants (including indels as well as SNPs), and also uses the bulk segregant approach.
-
-**The conceptual strategy of variant discovery mapping is to perform in silico bulk segregant linkage analysis using variants that are already present in the mutant strain of interest, rather than examining those introduced by a cross to a polymorphic strain.** Any individual mutant strain will contain a certain number of homozygous variants compared to the reference genome. These homozygous variants are of two types: 1) those directly induced during mutagenesis (one or more of which are responsible for the mutant phenotype) (Fig.11A red diamonds) and 2) those already present in the background of the parental strain, either because of genetic drift or because of the parental strain containing, for example, a transgene that was integrated into the genome by irradiation (Fig.11A pale blue diamonds).
-
-
-.. image:: http://biochemistry.hs.columbia.edu/labs/hobert/CloudMap/Fig.11A_VDM.png
-
-
-Following an outcross to a non-parental strain and selection of a pool of F2-mutant recombinants, these homozygous variants will segregate according to their degree of linkage to the phenotype-inducing locus. The degree of linkage will be directly reflected in the allele frequency among the pool of recombinants, and this can be represented as scatter plots of the ratio of variant reads/total reads present in the pool of sequenced recombinants (Fig.11A). We then plot a loess regression line through all the points on a given chromosome to give greater accuracy to the mapping region (Fig.11B). The loess lines on scatter plots for linked chromosomes approach 1, indicating retention of the original homozygous variants in the linked region. We also draw corresponding frequency plots that display regions of linked chromosomes where pure parental allele variant positions are concentrated (positions where the ratio of variant reads/total reads is equal to 1) (Fig.11B). 1 Mb bins for these pure parental (ratio = 1) SNP positions are colored gray by default and 0.5 Mb bins are colored red.
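-
-For example (illustrative numbers): at a variant position where the GATK sample fields report allele depths AD=3,17 and read depth DP=20, the plotted ratio is::
-
-	17 alternate reads / 20 total reads = 0.85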
-
-
-.. image:: http://biochemistry.hs.columbia.edu/labs/hobert/CloudMap/Fig.11B-C_VDM_2.png
-
-
-------
-
-**Input:**
-
-This tool accepts as input a single VCF file containing reference (e.g. Bristol) and alternate (e.g. EMS, background, or crossing strain variant) allele calls in the pooled mutant sample. This input VCF is generated at an earlier analysis step by running the GATK Unified Genotyper on a BAM alignment file of the pooled mutant sample and selecting only heterozygous or homozygous base positions (filtered for quality score > Q200). The reader is referred to the user guide and online video for direction on this procedure.
-
-The CloudMap Variant Discovery Mapping with WGS Data tool supports data from any organism. C. elegans and Arabidopsis are natively supported. For all other organisms, users must provide a simple tab-delimited configuration file containing chromosome numbers and respective lengths (example configuration files for most major organisms provided at http://usegalaxy.org/cloudmap). 
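-
-A minimal sketch of such a configuration file, one tab-delimited line per chromosome using the break values built in for C. elegans (roughly the chromosome length in Mb, rounded up)::
-
-	I	16
-	II	16
-	III	14
-	IV	18
-	V	21
-	X	18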
-
-
-**Output:**
-
-In addition to the PDF of scatter and frequency plots, the tool provides a tabular output file that contains the counts of reference and alternate reads in the pooled mutant sample as well as the ratio of alternate reads/total reads at each variant position.
-
-
-------
-
-**Settings:**
-
-.. class:: infomark
-
-Information on loess regression and the loess span parameter:
-http://en.wikipedia.org/wiki/Local_regression
-
-.. class:: infomark
-
-Based on our testing, we've settled on .4 as a loess span default. Larger values result in smoothing of the line to reflect trends at a more macro level. Smaller values result in loess lines that more closely reflect local data fluctuations. 
-
-.. class:: infomark
-
-Supported colors for data points and loess regression line:
-
-http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf
-
-http://research.stowers-institute.org/efg/R/Color/Chart/ColorChart.pdf
-
-
-
-.. class:: warningmark
-
-This tool requires that the statistical programming environment R has been installed on the system hosting Galaxy (http://www.r-project.org/). If you are running this tool on Galaxy via the Cloud, this does not apply to you.
-
-
-------
-
-**Citation:**
-
-This tool is part of the CloudMap package from the Hobert Lab. If you use this tool, please cite `Gregory Minevich, Danny S Park, Daniel Blankenberg, Richard J. Poole, and Oliver Hobert.  CloudMap: A Cloud-based Pipeline for Analysis of Mutant Genome Sequences. (Genetics 2012 In Press)`__
-
-    .. __: http://biochemistry.hs.columbia.edu/labs/hobert/literature.html
-
-Correspondence to gm2123@columbia.edu (G.M.) or or38@columbia.edu (O.H.)
-
-    </help>
-</tool>