# HG changeset patch # User gregory-minevich # Date 1349717854 14400 # Node ID 8fc36bd53f945c9ffafc54a5f3321194cbb7edc0 # Parent 98d409af683c2ae5e3f16abdec9fdd8c4d5b5503 Deleted selected files diff -r 98d409af683c -r 8fc36bd53f94 SNP_Mapping.py --- a/SNP_Mapping.py Thu Jun 28 14:21:31 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,498 +0,0 @@ -#!/usr/bin/python - -import re -import sys -import optparse -import csv -import re -import pprint -from decimal import * -from rpy import * - -def main(): - csv.field_size_limit(1000000000) - - parser = optparse.OptionParser() - parser.add_option('-p', '--sample_pileup', dest = 'sample_pileup', action = 'store', type = 'string', default = None, help = "Sample pileup from mpileup") - parser.add_option('-v', '--haw_vcf', dest = 'haw_vcf', action = 'store', type = 'string', default = None, help = "vcf file of Hawaiian SNPs") - parser.add_option('-l', '--loess_span', dest = 'loess_span', action = 'store', type = 'float', default = .01, help = "Loess span") - parser.add_option('-d', '--d_yaxis', dest = 'd_yaxis', action = 'store', type = 'float', default = .7, help = "y-axis upper limit for dot plot") - parser.add_option('-y', '--h_yaxis', dest = 'h_yaxis', action = 'store', type = 'int', default = 5, help = "y-axis upper limit for histogram plot") - parser.add_option('-c', '--points_color', dest = 'points_color', action = 'store', type = 'string', default = "gray27", help = "Color for data points") - parser.add_option('-k', '--loess_color', dest = 'loess_color', action = 'store', type = 'string', default = "red", help = "Color for loess regression line") - parser.add_option('-z', '--standardize', dest = 'standardize', default= 'true', help = "Standardize X-axis") - parser.add_option('-b', '--break_file', dest = 'break_file', action = 'store', type = 'string', default = 'C.elegans', help = "File defining the breaks per chromosome") - parser.add_option('-x', '--bin_size', dest = 'bin_size', action = 'store', type = 'int', default = 1000000, help = "Size of histogram bins, default is 1mb") - #parser.add_option('-n', '--normalize_bins', dest = 'normalize_bins', action = 'store_true', help = "Normalize histograms") - parser.add_option('-n', '--normalize_bins', dest = 'normalize_bins', default= 'true', help = "Normalize histograms") - - - parser.add_option('-o', '--output', dest = 'output', action = 'store', type = 'string', default = None, help = "Output file name") - parser.add_option('-s', '--location_plot_output', dest = 'location_plot_output', action = 'store', type = 'string', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location") - - #For plotting with map units on the X-axis instead of physical distance - #parser.add_option('-u', '--mpu_plot_output', dest = 'mpu_plot_output', action = 'store', type = 'string', default = None, help = "Output file name of SNP plots by map unit location") - (options, args) = parser.parse_args() - - haw_snps = build_haw_snp_dictionary(haw_vcf = options.haw_vcf) - pileup_info = parse_pileup(sample_pileup = options.sample_pileup, haw_snps = haw_snps) - - output_pileup_info(output = options.output, pileup_info = pileup_info) - - #output plot with all ratios - rounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000) - - normalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = rounded_bin_size, normalize_bins = options.normalize_bins) - normalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = (rounded_bin_size / 2), normalize_bins = options.normalize_bins) - - break_dict = parse_breaks(break_file = options.break_file) - - output_scatter_plots_by_location(location_plot_output = options.location_plot_output, pileup_info = pileup_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size) - - #For plotting with map units on the X-axis instead of physical distance) - #output_scatter_plots_by_mapping_units(mpu_plot_output = options.mpu_plot_output, pileup_info = pileup_info) - -def skip_headers(reader = None, i_file = None): - # count headers - comment = 0 - while reader.next()[0].startswith('#'): - comment = comment + 1 - - # skip headers - i_file.seek(0) - for i in range(0, comment): - reader.next() - -def parse_breaks(break_file = None): - if break_file == 'C.elegans': - break_dict = { 'I' : 16 , 'II' : 16, 'III' : 14, 'IV' : 18, 'V' : 21, 'X' : 18 } - return break_dict - elif break_file == 'Arabadopsis': - break_dict = { '1' : 16 , '2' : 16, '3' : 21, '4' : 18, '5' : 21 } - return break_dict - else: - i_file = open(break_file, 'rU') - break_dict = {} - reader = csv.reader(i_file, delimiter = '\t') - for row in reader: - chromosome = row[0].upper() - chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE) - chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE) - break_count = row[1] - break_dict[chromosome] = int(break_count) - return break_dict - - -def location_comparer(location_1, location_2): - chr_loc_1 = location_1.split(':')[0] - pos_loc_1 = int(location_1.split(':')[1]) - - chr_loc_2 = location_2.split(':')[0] - pos_loc_2 = int(location_2.split(':')[1]) - - if chr_loc_1 == chr_loc_2: - if pos_loc_1 < pos_loc_2: - return -1 - elif pos_loc_1 == pos_loc_1: - return 0 - elif pos_loc_1 > pos_loc_2: - return 1 - elif chr_loc_1 < chr_loc_2: - return -1 - elif chr_loc_1 > chr_loc_2: - return 1 - -def output_pileup_info(output = None, pileup_info = None): - o_file = open(output, 'wb') - writer = csv.writer(o_file, delimiter = '\t') - - writer.writerow(["#Chr\t", "Pos\t", "ID\t", "Alt Count\t", "Ref Count\t", "Read Depth\t", "Ratio\t", "Mapping Unit"]) - - location_sorted_pileup_info_keys = sorted(pileup_info.keys(), cmp=location_comparer) - - for location in location_sorted_pileup_info_keys: - alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location] - - location_info = location.split(':') - chromosome = location_info[0] - position = location_info[1] - - writer.writerow([chromosome, position, haw_snp_id, alt_allele_count, ref_allele_count, read_depth, ratio, mapping_unit]) - - o_file.close() - -def output_scatter_plots_by_location(location_plot_output = None, pileup_info = None, loess_span="", d_yaxis="", h_yaxis="", points_color="", loess_color="", standardize=None, normalized_hist_per_mb = None, normalized_hist_per_5kb = None, breaks = None, rounded_bin_size = 1000000): - positions = {} - current_chr = "" - prev_chr = "" - - x_label = "Location (Mb)" - filtered_label = '' - - location_sorted_pileup_info_keys = sorted(pileup_info.keys(), cmp=location_comparer) - - break_unit = Decimal(rounded_bin_size) / Decimal(1000000) - max_breaks = max(breaks.values()) - - try: - r.pdf(location_plot_output, 8, 8) - - for location in location_sorted_pileup_info_keys: - current_chr = location.split(':')[0] - position = location.split(':')[1] - - alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location] - - if prev_chr != current_chr: - if prev_chr != "": - hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = prev_chr) - hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = prev_chr) - - plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = prev_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[prev_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit) - - prev_chr = current_chr - positions = {} - - positions[position] = ratio - - hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = current_chr) - hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = current_chr) - - plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = current_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[current_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit) - - r.dev_off() - - except Exception as inst: - print inst - print "There was an error creating the location plot pdf... Please try again" - -def get_hist_dict_by_chr(normalized_hist_per_xbase = None, chr = ''): - hist_dict = {} - - for location in normalized_hist_per_xbase: - chromosome = location.split(':')[0] - if chromosome == chr: - position = int(location.split(':')[1]) - hist_dict[position] = normalized_hist_per_xbase[location] - - return hist_dict - -''' -def output_scatter_plots_by_mapping_units(mpu_plot_output = None, pileup_info = None): - i = {} - ii = {} - iii = {} - iv = {} - v = {} - x = {} - - for location in pileup_info: - chromosome = location.split(':')[0] - position = location.split(':')[1] - - alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location] - - if chromosome == "I": - i[mapping_unit] = ratio - elif chromosome == "II": - ii[mapping_unit] = ratio - elif chromosome == "III": - iii[mapping_unit] = ratio - elif chromosome == "IV": - iv[mapping_unit] = ratio - elif chromosome == "V": - v[mapping_unit] = ratio - elif chromosome == "X": - x[mapping_unit] = ratio - - x_label = "Map Units" - - try: - r.pdf(mpu_plot_output, 8, 8) - plot_data(chr_dict = i, chr = "I", x_label = "Map Units") - plot_data(chr_dict = ii, chr = "II", x_label = "Map Units") - plot_data(chr_dict = iii, chr = "III", x_label = "Map Units") - plot_data(chr_dict = iv, chr = "IV", x_label = "Map Units") - plot_data(chr_dict = v, chr = "V", x_label = "Map Units") - plot_data(chr_dict = x, chr = "X", x_label = "Map Units") - r.dev_off() - except Exception as inst: - print inst - print "There was an error creating the map unit plot pdf... Please try again" -''' - -def plot_data(chr_dict = None, hist_dict_mb = None, hist_dict_5kb = None, chr = "", x_label = "", divide_position = False, draw_secondary_grid_lines = False, loess_span=None, d_yaxis=None, h_yaxis=None, points_color="", loess_color="", breaks = None, standardize= None, max_breaks = 1, break_unit = 1): - ratios = "c(" - positions = "c(" - - for position in chr_dict: - ratio = chr_dict[position] - if divide_position: - position = float(position) / 1000000.0 - positions = positions + str(position) + ", " - ratios = ratios + str(ratio) + ", " - - if len(ratios) == 2: - ratios = ratios + ")" - else: - ratios = ratios[0:len(ratios) - 2] + ")" - - if len(positions) == 2: - positions = positions + ")" - else: - positions = positions[0:len(positions) - 2] + ")" - - r("x <- " + positions) - r("y <- " + ratios) - - hist_mb_values = "c(" - for position in sorted(hist_dict_mb): - hist_mb_values = hist_mb_values + str(hist_dict_mb[position]) + ", " - - if len(hist_mb_values) == 2: - hist_mb_values = hist_mb_values + ")" - else: - hist_mb_values = hist_mb_values[0:len(hist_mb_values) - 2] + ")" - - hist_5kb_values = "c(" - for position in sorted(hist_dict_5kb): - hist_5kb_values = hist_5kb_values + str(hist_dict_5kb[position]) + ", " - - if len(hist_5kb_values) == 2: - hist_5kb_values = hist_5kb_values + ")" - else: - hist_5kb_values = hist_5kb_values[0:len(hist_5kb_values) - 2] + ")" - - r("xz <- " + hist_mb_values) - r("yz <- " + hist_5kb_values) - - max_break_str = str(max_breaks) - break_unit_str = str(Decimal(break_unit)) - half_break_unit_str = str(Decimal(break_unit) / Decimal(2)) - break_penta_unit_str = str(Decimal(break_unit) * Decimal(5)) - - if (standardize=='true'): - r("plot(x, y, ,cex=0.60, xlim=c(0," + max_break_str + "), main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')") - r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')") - r("axis(1, at=seq(0, " + max_break_str + ", by=" + break_unit_str + "), labels=FALSE, tcl=-0.5)") - r("axis(1, at=seq(0, " + max_break_str + ", by=" + half_break_unit_str + "), labels=FALSE, tcl=-0.25)") - r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)") - elif (standardize=='false'): - r("plot(x, y, cex=0.60, main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')") - r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')") - r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)") - r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)") - r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)") - - if draw_secondary_grid_lines: - r("abline(h = seq(floor(min(y)), 1, by=0.1), v = seq(floor(min(x)), length(x), by= 1), col='gray')") - else: - r("grid(lty = 1, col = 'gray')") - - if (standardize=='true'): - r("barplot(xz, xlim=c(0, " + max_break_str + "), ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = " + break_unit_str + ", xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')") - r("barplot(yz, space = 0, add=TRUE, width = " + half_break_unit_str + ", col=rgb(1, 0, 0, 1))") - r("axis(1, hadj = 1, at=seq(0, " + max_break_str + ", by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)") - r("axis(1, at=seq(0, " + max_break_str + ", by= " + break_penta_unit_str + "), labels=TRUE, tcl=-0.5)") - r("axis(1, at=seq(0, " + max_break_str + ", by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)") - elif (standardize=='false'): - r("barplot(xz, ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = 1, xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')") - r("barplot(yz, space = 0, add=TRUE, width = 0.5, col=rgb(1, 0, 0, 1))") - r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)") - r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_penta_unit_str + "), labels=TRUE, tcl=-0.5)") - r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)") - - -def build_haw_snp_dictionary(haw_vcf = None): - haw_snps = {} - - i_file = open(haw_vcf, 'rU') - reader = csv.reader(i_file, delimiter = '\t') - - skip_headers(reader = reader, i_file = i_file) - - for row in reader: - #print row - chromosome = row[0].upper() - chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE) - chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE) - - if chromosome != 'MTDNA': - position = row[1] - haw_snp_id = row[2] - ref_allele = row[3] - alt_allele = row[4] - - info = row[7] - - mapping_unit = info.replace("MPU=", "") - - location = chromosome + ":" + position - haw_snps[location] = (alt_allele, haw_snp_id, mapping_unit) - - i_file.close() - - return haw_snps - -def calculate_normalized_histogram_bins_per_xbase(pileup_info = None, xbase = 1000000, normalize_bins = None): - normalized_histogram_bins_per_xbase = {} - - ref_snp_count_per_xbase = get_ref_snp_count_per_xbase(pileup_info = pileup_info, xbase = xbase) - mean_zero_snp_count_per_chromosome = get_mean_zero_snp_count_per_chromosome(pileup_info = pileup_info, xbase = xbase) - zero_snp_count_per_xbase = get_zero_snp_count_per_xbase(pileup_info = pileup_info, xbase = xbase) - - for location in ref_snp_count_per_xbase: - chromosome = location.split(':')[0] - mean_zero_snp_count = mean_zero_snp_count_per_chromosome[chromosome] - ref_snp_count = ref_snp_count_per_xbase[location] - - zero_snp_count = 0 - if location in zero_snp_count_per_xbase: - zero_snp_count = zero_snp_count_per_xbase[location] - - if normalize_bins == 'true': - if zero_snp_count == 0 or ref_snp_count == 0: - normalized_histogram_bins_per_xbase[location] = 0 - elif zero_snp_count == ref_snp_count: - normalized_histogram_bins_per_xbase[location] = 0 - else: - normalized_histogram_bins_per_xbase[location] = (Decimal(zero_snp_count) / (Decimal(ref_snp_count)-Decimal(zero_snp_count))) * Decimal(mean_zero_snp_count) - else: - normalized_histogram_bins_per_xbase[location] = zero_snp_count - - return normalized_histogram_bins_per_xbase - -def get_ref_snp_count_per_xbase(pileup_info = None, xbase = 1000000): - ref_snps_per_xbase = {} - - for location in pileup_info: - location_info = location.split(':') - - chromosome = location_info[0].upper() - chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE) - chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE) - - position = location_info[1] - xbase_position = (int(position) / xbase) + 1 - - location = chromosome + ":" + str(xbase_position) - if location in ref_snps_per_xbase: - ref_snps_per_xbase[location] = ref_snps_per_xbase[location] + 1 - else: - ref_snps_per_xbase[location] = 1 - - return ref_snps_per_xbase - -def get_mean_zero_snp_count_per_chromosome(pileup_info, xbase = 1000000): - sample_snp_count_per_xbase = {} - - for location in pileup_info: - alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location] - - location_info = location.split(':') - chromosome = location_info[0] - position = location_info[1] - xbase_position = (int(position) / xbase) + 1 - xbase_location = chromosome + ":" + str(xbase_position) - - if alt_allele_count == 0: - if xbase_location in sample_snp_count_per_xbase: - sample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1 - else: - sample_snp_count_per_xbase[xbase_location] = 1 - - elif alt_allele_count != 0 and xbase_location not in sample_snp_count_per_xbase: - sample_snp_count_per_xbase[xbase_location] = 0 - - mean_zero_snp_count_per_chromosome = {} - for location in sample_snp_count_per_xbase: - chromosome = location.split(':')[0] - sample_count = sample_snp_count_per_xbase[location] - if chromosome in mean_zero_snp_count_per_chromosome: - mean_zero_snp_count_per_chromosome[chromosome].append(sample_count) - else: - mean_zero_snp_count_per_chromosome[chromosome] = [sample_count] - - for chromosome in mean_zero_snp_count_per_chromosome: - summa = sum(mean_zero_snp_count_per_chromosome[chromosome]) - count = len(mean_zero_snp_count_per_chromosome[chromosome]) - - mean_zero_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count) - - return mean_zero_snp_count_per_chromosome - -def get_zero_snp_count_per_xbase(pileup_info = None, xbase = 1000000): - zero_snp_count_per_xbase = {} - - for location in pileup_info: - alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location] - - location_info = location.split(':') - chromosome = location_info[0] - position = location_info[1] - xbase_position = (int(position) / xbase) + 1 - xbase_location = chromosome + ":" + str(xbase_position) - - if alt_allele_count == 0: - if xbase_location in zero_snp_count_per_xbase: - zero_snp_count_per_xbase[xbase_location] = zero_snp_count_per_xbase[xbase_location] + 1 - else: - zero_snp_count_per_xbase[xbase_location] = 1 - - elif alt_allele_count != 0 and xbase_location not in zero_snp_count_per_xbase: - zero_snp_count_per_xbase[xbase_location] = 0 - - return zero_snp_count_per_xbase - -def parse_pileup(sample_pileup = None, haw_snps = None): - i_file = open(sample_pileup, 'rU') - reader = csv.reader(i_file, delimiter = '\t', quoting = csv.QUOTE_NONE) - - pileup_info = {} - - for row in reader: - chromosome = row[0].upper() - chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE) - chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE) - - position = row[1] - ref_allele = row[2] - read_depth = row[3] - read_bases = row[4] - - location = chromosome + ":" + position - if location in haw_snps: - alt_allele, haw_snp_id, mapping_unit = haw_snps[location] - ref_allele_count, alt_allele_count = parse_read_bases(read_bases = read_bases, alt_allele = alt_allele) - - if Decimal(read_depth!=0): - getcontext().prec = 6 - ratio = Decimal(alt_allele_count) / Decimal(read_depth) - #ratio = Decimal(alt_allele_count) / Decimal(ref_allele_count) - - pileup_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit) - - #debug line - #print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id - - i_file.close() - - return pileup_info - -def parse_read_bases(read_bases = None, alt_allele = None): - read_bases = re.sub('\$', '', read_bases) - read_bases = re.sub('\^[^\s]', '', read_bases) - - ref_allele_matches = re.findall("\.|\,", read_bases) - ref_allele_count = len(ref_allele_matches) - - alt_allele_matches = re.findall(alt_allele, read_bases, flags = re.IGNORECASE) - alt_allele_count = len(alt_allele_matches) - - #debug line - #print read_bases, alt_allele, alt_allele_count, ref_allele_count - - return ref_allele_count, alt_allele_count - -if __name__ == "__main__": - main() diff -r 98d409af683c -r 8fc36bd53f94 SNP_Mapping.xml --- a/SNP_Mapping.xml Thu Jun 28 14:21:31 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,141 +0,0 @@ - - Map a mutation by plotting recombination frequencies resulting from crossing to a highly polymorphic strain - - #if $source.source_select=="elegans" #SNP_Mapping.py --sample_pileup $sample_pileup --haw_vcf $haw_vcf --loess_span $loess_span --d_yaxis $d_yaxis --h_yaxis $h_yaxis --points_color "$points_color" --loess_color "$loess_color" --output $output --location_plot_output $location_plot_output --standardize $standardize --normalize_bins $normalize_bins --break_file $source.Celegans - #else if $source.source_select=="arabadopsis" #SNP_Mapping.py --sample_pileup $sample_pileup --haw_vcf $haw_vcf --loess_span $loess_span --d_yaxis $d_yaxis --h_yaxis $h_yaxis --points_color "$points_color" --loess_color "$loess_color" --output $output --location_plot_output $location_plot_output --standardize $standardize --normalize_bins $normalize_bins --break_file $source.Arabadop - #else if $source.source_select=="other" #SNP_Mapping.py --sample_pileup $sample_pileup --haw_vcf $haw_vcf --loess_span $loess_span --d_yaxis $d_yaxis --h_yaxis $h_yaxis --points_color "$points_color" --loess_color "$loess_color" --output $output --location_plot_output $location_plot_output --standardize $standardize --normalize_bins $normalize_bins --break_file $source.Other - #end if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - sys - optparse - csv - re - decimal - rpy - - - - - - - - -**What it does:** - -This tool is part of the CloudMap pipeline for analysis of mutant genome sequences. For further details, please see `Gregory Minevich, Danny Park, Richard J. Poole, Daniel Blankenberg, Anton Nekrutenko, and Oliver Hobert. CloudMap: A Cloud-based Pipeline for Analysis of Mutant Genome Sequences. (2012 In Preparation)`__ - - .. __: http://biochemistry.hs.columbia.edu/labs/hobert/literature.html - -CloudMap workflows, shared histories and reference datasets are available at the `CloudMap Galaxy page`__ - - .. __: https://test.g2.bx.psu.edu/u/gal40/p/cloudmap - -This tool improves upon the method described in Doitsidou et al., PLoS One 2010 for mapping causal mutations using whole genome sequencing data. - -Sample output for a linked chromosome: - -.. image:: http://biochemistry.hs.columbia.edu/labs/hobert/CloudMap/Linked_LG_500px.png - - -The polymorphic Hawaiian strain CB4856 is used as a mapping strain in most cases but in principle any sequenced nematode strain that is significantly different from the mutant strain can be used for mapping. The tool plots the ratio of mapping strain (Hawaiian)/mutant strain (N2) nucleotides at all SNP positions, reflecting the number of recombinants in the sequenced pool of animals. Chromosomes which contain regions of linkage to the causal mutation will have regions where the ratio of mapping strain (Hawaiian)/total reads will be equal to 0. The scatter plots for such linked regions will have a high number of data points lying exactly on the X axis. A loess regression line is plotted through all the points on a given chromosome giving further accuracy to the linked region. - -Each scatter plot has a corresponding frequency plot that displays regions of linked chromosome where 0 ratio SNP positions are concentrated. 1Mb bins for the 0 ratio SNP positions are colored gray by default and .5Mb bins are colored in red. - - -The experimental design required to generate data for the plots is described in Doitsidou et al., PLoS One 2010 Figure 1: - -.. image:: http://biochemistry.hs.columbia.edu/labs/hobert/CloudMap/Doitsidou_2010_PLoS_Fig.1_500px.png - - ------- - -**Input:** - - -The input pileup files are generated by the SAMTools mpileup tool. Default SAMTools mpileup (and Samtools filter pileup) parameters for mapping quality, base quality and coverage at each SNP position typically yield good results, though users may experiment with filtering SNP data by adjusting these parameters. In our testing, low threshold filtering on base pair quality has been useful in improving accuracy of plots while high threshold filtering on coverage has skewed plot accuracy. - -This tool requires a pileup that has been created at each SNP position using SAMTools mpileup (http://samtools.sourceforge.net/samtools.shtml) and a BED file of all Hawaiian SNP positions. Download Hawaiian SNP positions BED file here: -http://biochemistry.hs.columbia.edu/labs/hobert/protocols.html - -The required VCF of mapping strain (e.g. Hawaiian) SNPs is a reference file that contains mapping strain SNP positions and reference base pairs at each position. -(download Hawaiian SNPs VCF from: http://biochemistry.hs.columbia.edu/labs/hobert/protocols.html). You may also make your own VCF of SNP positions following the steps described in the CloudMAP paper. - - -**Output:** - -The tool also provides a tabular output file that contains a count of the number of reference and alternate SNPs at each mapping strain SNP position as well as the ratio of reference/alternate SNPs. The position of each mapping strain SNP in map units and physical coordinates is also provided in the output file. - - ------- - -**Settings:** - -.. class:: infomark - -Information on loess regression and the loess span parameter: -http://en.wikipedia.org/wiki/Local_regression - -.. class:: infomark - -Based on our testing, we've settled on .1 as a loess span default. Larger values result in smoothing of the line to reflect trends at a more macro level. Smaller values result in loess lines that more closely reflect local data fluctuations. - -.. class:: infomark - -Supported colors for data points and loess regression line: - -http://www.stat.columbia.edu/~tzheng/files/Rcolor.pdf - -http://research.stowers-institute.org/efg/R/Color/Chart/ColorChart.pdf - - - -.. class:: warningmark - -This tool requires that the statistical programming environment R has been installed on the system hosting Galaxy (http://www.r-project.org/). If you are accessing this tool on Galaxy via the Cloud, this does not apply to you. - - ------- - -**Citation:** - -This tool is part of the CloudMap package from the Hobert Lab. If you use this tool, please cite `Gregory Minevich, Danny Park, Richard J. Poole, Daniel Blankenberg, Anton Nekrutenko, and Oliver Hobert. CloudMap: A Cloud-based Pipeline for Analysis of Mutant Genome Sequences. (2012 In Preparation)`__ - - .. __: http://biochemistry.hs.columbia.edu/labs/hobert/literature.html - -Correspondence to gm2123@columbia.edu (G.M.) or or38@columbia.edu (O.H.) - - -