comparison SNP_Mapping.py @ 10:7d6bccd5f88c draft

Uploaded
author gregory-minevich
date Thu, 14 Jun 2012 20:34:20 -0400
parents 30fa4d84e84c
children
comparison
equal deleted inserted replaced
9:d4aa63bc2ef6 10:7d6bccd5f88c
3 import re 3 import re
4 import sys 4 import sys
5 import optparse 5 import optparse
6 import csv 6 import csv
7 import re 7 import re
8 import pprint
8 from decimal import * 9 from decimal import *
9 from rpy import * 10 from rpy import *
10 11
11 def main(): 12 def main():
12 csv.field_size_limit(1000000000) 13 csv.field_size_limit(1000000000)
14 parser = optparse.OptionParser() 15 parser = optparse.OptionParser()
15 parser.add_option('-p', '--sample_pileup', dest = 'sample_pileup', action = 'store', type = 'string', default = None, help = "Sample pileup from mpileup") 16 parser.add_option('-p', '--sample_pileup', dest = 'sample_pileup', action = 'store', type = 'string', default = None, help = "Sample pileup from mpileup")
16 parser.add_option('-v', '--haw_vcf', dest = 'haw_vcf', action = 'store', type = 'string', default = None, help = "vcf file of Hawaiian SNPs") 17 parser.add_option('-v', '--haw_vcf', dest = 'haw_vcf', action = 'store', type = 'string', default = None, help = "vcf file of Hawaiian SNPs")
17 parser.add_option('-l', '--loess_span', dest = 'loess_span', action = 'store', type = 'float', default = .01, help = "Loess span") 18 parser.add_option('-l', '--loess_span', dest = 'loess_span', action = 'store', type = 'float', default = .01, help = "Loess span")
18 parser.add_option('-d', '--d_yaxis', dest = 'd_yaxis', action = 'store', type = 'float', default = .7, help = "y-axis upper limit for dot plot") 19 parser.add_option('-d', '--d_yaxis', dest = 'd_yaxis', action = 'store', type = 'float', default = .7, help = "y-axis upper limit for dot plot")
19 parser.add_option('-y', '--h_yaxis', dest = 'h_yaxis', action = 'store', type = 'int', default = 500, help = "y-axis upper limit for dot plot") 20 parser.add_option('-y', '--h_yaxis', dest = 'h_yaxis', action = 'store', type = 'int', default = 5, help = "y-axis upper limit for histogram plot")
20 parser.add_option('-c', '--points_color', dest = 'points_color', action = 'store', type = 'string', default = "gray27", help = "Color for data points") 21 parser.add_option('-c', '--points_color', dest = 'points_color', action = 'store', type = 'string', default = "gray27", help = "Color for data points")
21 parser.add_option('-k', '--loess_color', dest = 'loess_color', action = 'store', type = 'string', default = "red", help = "Color for loess regression line") 22 parser.add_option('-k', '--loess_color', dest = 'loess_color', action = 'store', type = 'string', default = "red", help = "Color for loess regression line")
22 parser.add_option('-z', '--standardize', dest = 'standardize', default= 'false', help = "Standardize X-axis") 23 parser.add_option('-z', '--standardize', dest = 'standardize', default= 'true', help = "Standardize X-axis")
24 parser.add_option('-b', '--break_file', dest = 'break_file', action = 'store', type = 'string', default = 'C.elegans', help = "File defining the breaks per chromosome")
25 parser.add_option('-x', '--bin_size', dest = 'bin_size', action = 'store', type = 'int', default = 1000000, help = "Size of histogram bins, default is 1mb")
26 parser.add_option('-n', '--do_not_normalize_bin', dest = 'do_not_normalize_bin', action = 'store_true', help = "Do not Normalize histograms")
23 27
24 parser.add_option('-o', '--output', dest = 'output', action = 'store', type = 'string', default = None, help = "Output file name") 28 parser.add_option('-o', '--output', dest = 'output', action = 'store', type = 'string', default = None, help = "Output file name")
25 parser.add_option('-s', '--location_plot_output', dest = 'location_plot_output', action = 'store', type = 'string', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location") 29 parser.add_option('-s', '--location_plot_output', dest = 'location_plot_output', action = 'store', type = 'string', default = "SNP_Mapping_Plot.pdf", help = "Output file name of SNP plots by chromosomal location")
26 30
27 #For plotting with map units on the X-axis instead of physical distance 31 #For plotting with map units on the X-axis instead of physical distance
28 #parser.add_option('-u', '--mpu_plot_output', dest = 'mpu_plot_output', action = 'store', type = 'string', default = None, help = "Output file name of SNP plots by map unit location") 32 #parser.add_option('-u', '--mpu_plot_output', dest = 'mpu_plot_output', action = 'store', type = 'string', default = None, help = "Output file name of SNP plots by map unit location")
29 (options, args) = parser.parse_args() 33 (options, args) = parser.parse_args()
30 34
31 haw_snps = build_haw_snp_dictionary(haw_vcf = options.haw_vcf) 35 haw_snps = build_haw_snp_dictionary(haw_vcf = options.haw_vcf)
32 pileup_info = parse_pileup(sample_pileup = options.sample_pileup, haw_snps = haw_snps) 36 pileup_info = parse_pileup(sample_pileup = options.sample_pileup, haw_snps = haw_snps)
37
33 output_pileup_info(output = options.output, pileup_info = pileup_info) 38 output_pileup_info(output = options.output, pileup_info = pileup_info)
34 39
35 #output plot with all ratios 40 #output plot with all ratios
36 output_scatter_plots_by_location(location_plot_output = options.location_plot_output, pileup_info = pileup_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize) 41 rounded_bin_size = int(round((float(options.bin_size) / 1000000), 1) * 1000000)
42
43 normalized_histogram_bins_per_mb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = rounded_bin_size, do_not_normalize = options.do_not_normalize_bin)
44 normalized_histogram_bins_per_5kb = calculate_normalized_histogram_bins_per_xbase(pileup_info = pileup_info, xbase = (rounded_bin_size / 2), do_not_normalize = options.do_not_normalize_bin)
45
46 break_dict = parse_breaks(break_file = options.break_file)
47
48 output_scatter_plots_by_location(location_plot_output = options.location_plot_output, pileup_info = pileup_info, loess_span=options.loess_span, d_yaxis=options.d_yaxis, h_yaxis=options.h_yaxis, points_color=options.points_color, loess_color=options.loess_color, standardize =options.standardize, normalized_hist_per_mb = normalized_histogram_bins_per_mb, normalized_hist_per_5kb = normalized_histogram_bins_per_5kb, breaks = break_dict, rounded_bin_size = rounded_bin_size)
37 49
38 #For plotting with map units on the X-axis instead of physical distance) 50 #For plotting with map units on the X-axis instead of physical distance)
39 #output_scatter_plots_by_mapping_units(mpu_plot_output = options.mpu_plot_output, pileup_info = pileup_info) 51 #output_scatter_plots_by_mapping_units(mpu_plot_output = options.mpu_plot_output, pileup_info = pileup_info)
40 52
41 def skip_headers(reader = None, i_file = None): 53 def skip_headers(reader = None, i_file = None):
46 58
47 # skip headers 59 # skip headers
48 i_file.seek(0) 60 i_file.seek(0)
49 for i in range(0, comment): 61 for i in range(0, comment):
50 reader.next() 62 reader.next()
63
64 def parse_breaks(break_file = None):
65 if break_file == 'C.elegans':
66 break_dict = { 'I' : 16 , 'II' : 16, 'III' : 14, 'IV' : 18, 'V' : 21, 'X' : 18 }
67 return break_dict
68 elif break_file == 'Arabadopsis':
69 break_dict = { '1' : 16 , '2' : 16, '3' : 21, '4' : 18, '5' : 21 }
70 return break_dict
71 else:
72 i_file = open(break_file, 'rU')
73 break_dict = {}
74 reader = csv.reader(i_file, delimiter = '\t')
75 for row in reader:
76 chromosome = row[0].upper()
77 chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
78 chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
79 break_count = row[1]
80 break_dict[chromosome] = int(break_count)
81 return break_dict
82
51 83
52 def location_comparer(location_1, location_2): 84 def location_comparer(location_1, location_2):
53 chr_loc_1 = location_1.split(':')[0] 85 chr_loc_1 = location_1.split(':')[0]
54 pos_loc_1 = int(location_1.split(':')[1]) 86 pos_loc_1 = int(location_1.split(':')[1])
55 87
85 117
86 writer.writerow([chromosome, position, haw_snp_id, alt_allele_count, ref_allele_count, read_depth, ratio, mapping_unit]) 118 writer.writerow([chromosome, position, haw_snp_id, alt_allele_count, ref_allele_count, read_depth, ratio, mapping_unit])
87 119
88 o_file.close() 120 o_file.close()
89 121
90 def output_scatter_plots_by_location(location_plot_output = None, pileup_info = None, loess_span="", d_yaxis="", h_yaxis="", points_color="", loess_color="", standardize=None): 122 def output_scatter_plots_by_location(location_plot_output = None, pileup_info = None, loess_span="", d_yaxis="", h_yaxis="", points_color="", loess_color="", standardize=None, normalized_hist_per_mb = None, normalized_hist_per_5kb = None, breaks = None, rounded_bin_size = 1000000):
91 i = {} 123 positions = {}
92 ii = {} 124 current_chr = ""
93 iii = {} 125 prev_chr = ""
94 iv = {}
95 v = {}
96 x = {}
97
98 breaks = { 'I' : 16 , 'II' : 16, 'III' : 14, 'IV' : 18, 'V' : 21, 'X' : 18 }
99
100 for location in pileup_info:
101 chromosome = location.split(':')[0]
102 position = location.split(':')[1]
103
104 alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]
105
106 if chromosome == "I":
107 i[position] = ratio
108 elif chromosome == "II":
109 ii[position] = ratio
110 elif chromosome == "III":
111 iii[position] = ratio
112 elif chromosome == "IV":
113 iv[position] = ratio
114 elif chromosome == "V":
115 v[position] = ratio
116 elif chromosome == "X":
117 x[position] = ratio
118 126
119 x_label = "Location (Mb)" 127 x_label = "Location (Mb)"
120 filtered_label = '' 128 filtered_label = ''
121 129
130 location_sorted_pileup_info_keys = sorted(pileup_info.keys(), cmp=location_comparer)
131
132 break_unit = Decimal(rounded_bin_size) / Decimal(1000000)
133 max_breaks = max(breaks.values())
122 134
123 try: 135 try:
124 r.pdf(location_plot_output, 8, 8) 136 r.pdf(location_plot_output, 8, 8)
125 if i: 137
126 plot_data(chr_dict = i, chr = "I" + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks["I"], standardize=standardize) 138 for location in location_sorted_pileup_info_keys:
127 if ii: 139 current_chr = location.split(':')[0]
128 plot_data(chr_dict = ii, chr = "II" + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks["II"], standardize=standardize) 140 position = location.split(':')[1]
129 if iii: 141
130 plot_data(chr_dict = iii, chr = "III" + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks["III"], standardize=standardize) 142 alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]
131 if iv: 143
132 plot_data(chr_dict = iv, chr = "IV" + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks["IV"], standardize=standardize) 144 if prev_chr != current_chr:
133 if v: 145 if prev_chr != "":
134 plot_data(chr_dict = v, chr = "V" + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks["V"], standardize=standardize) 146 hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = prev_chr)
135 if x: 147 hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = prev_chr)
136 plot_data(chr_dict = x, chr = "X" + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks["X"], standardize=standardize) 148
137 149 plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = prev_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[prev_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit)
138 r.dev_off() 150
139 except Exception as inst: 151 prev_chr = current_chr
152 positions = {}
153
154 positions[position] = ratio
155
156 hist_dict_mb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_mb, chr = current_chr)
157 hist_dict_5kb = get_hist_dict_by_chr(normalized_hist_per_xbase = normalized_hist_per_5kb, chr = current_chr)
158
159 plot_data(chr_dict = positions, hist_dict_mb = hist_dict_mb, hist_dict_5kb = hist_dict_5kb, chr = current_chr + filtered_label, x_label = "Location (Mb)", divide_position = True, draw_secondary_grid_lines = True, loess_span=loess_span, d_yaxis=d_yaxis, h_yaxis=h_yaxis, points_color=points_color, loess_color=loess_color, breaks = breaks[current_chr], standardize=standardize, max_breaks = max_breaks, break_unit = break_unit)
160
161 r.dev_off()
162
163 except Exception as inst:
140 print inst 164 print inst
141 print "There was an error creating the location plot pdf... Please try again" 165 print "There was an error creating the location plot pdf... Please try again"
142 166
167 def get_hist_dict_by_chr(normalized_hist_per_xbase = None, chr = ''):
168 hist_dict = {}
169
170 for location in normalized_hist_per_xbase:
171 chromosome = location.split(':')[0]
172 if chromosome == chr:
173 position = int(location.split(':')[1])
174 hist_dict[position] = normalized_hist_per_xbase[location]
175
176 return hist_dict
177
178 '''
143 def output_scatter_plots_by_mapping_units(mpu_plot_output = None, pileup_info = None): 179 def output_scatter_plots_by_mapping_units(mpu_plot_output = None, pileup_info = None):
144 i = {} 180 i = {}
145 ii = {} 181 ii = {}
146 iii = {} 182 iii = {}
147 iv = {} 183 iv = {}
179 plot_data(chr_dict = x, chr = "X", x_label = "Map Units") 215 plot_data(chr_dict = x, chr = "X", x_label = "Map Units")
180 r.dev_off() 216 r.dev_off()
181 except Exception as inst: 217 except Exception as inst:
182 print inst 218 print inst
183 print "There was an error creating the map unit plot pdf... Please try again" 219 print "There was an error creating the map unit plot pdf... Please try again"
184 220 '''
185 221
186 def plot_data(chr_dict = None, chr = "", x_label = "", divide_position = False, draw_secondary_grid_lines = False, loess_span=None, d_yaxis=None, h_yaxis=None, points_color="", loess_color="", breaks = None, standardize= None): 222 def plot_data(chr_dict = None, hist_dict_mb = None, hist_dict_5kb = None, chr = "", x_label = "", divide_position = False, draw_secondary_grid_lines = False, loess_span=None, d_yaxis=None, h_yaxis=None, points_color="", loess_color="", breaks = None, standardize= None, max_breaks = 1, break_unit = 1):
187 ratios = "c(" 223 ratios = "c("
188 positions = "c(" 224 positions = "c("
189 z_ratios = "c(" 225
190 z_positions = "c("
191
192 for position in chr_dict: 226 for position in chr_dict:
193 ratio = chr_dict[position] 227 ratio = chr_dict[position]
194 if divide_position: 228 if divide_position:
195 position = float(position) / 1000000.0 229 position = float(position) / 1000000.0
196 positions = positions + str(position) + ", " 230 positions = positions + str(position) + ", "
197 ratios = ratios + str(ratio) + ", " 231 ratios = ratios + str(ratio) + ", "
198 if ratio == 0: 232
199 if divide_position:
200 z_position = float(position) / 1000000.0
201 z_positions = z_positions + str(position) + ", "
202 z_ratios = z_ratios + str(ratio) + ", "
203
204
205 if len(ratios) == 2: 233 if len(ratios) == 2:
206 ratios = ratios + ")" 234 ratios = ratios + ")"
207 else: 235 else:
208 ratios = ratios[0:len(ratios) - 2] + ")" 236 ratios = ratios[0:len(ratios) - 2] + ")"
209 237
210 if len(z_ratios) == 2:
211 z_ratios = z_ratios + ")"
212 else:
213 z_ratios = z_ratios[0:len(z_ratios) - 2] + ")"
214
215
216 if len(positions) == 2: 238 if len(positions) == 2:
217 positions = positions + ")" 239 positions = positions + ")"
218 else: 240 else:
219 positions = positions[0:len(positions) - 2] + ")" 241 positions = positions[0:len(positions) - 2] + ")"
220 242
221 if len(z_positions) == 2:
222 z_positions = z_positions + ")"
223 else:
224 z_positions = z_positions[0:len(z_positions) - 2] + ")"
225
226 r("x <- " + positions) 243 r("x <- " + positions)
227 r("y <- " + ratios) 244 r("y <- " + ratios)
228 245
229 r("xz <- " + z_positions) 246 hist_mb_values = "c("
230 r("yz <- " + z_ratios) 247 for position in sorted(hist_dict_mb):
231 248 hist_mb_values = hist_mb_values + str(hist_dict_mb[position]) + ", "
232 if (standardize=='true'): 249
233 r("plot(x, y, cex=0.60, xlim=c(0,21), main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='HA Ratio [Hawaiian Reads / Total Read Depth]', pch=18, col='"+ points_color +"')") 250 if len(hist_mb_values) == 2:
251 hist_mb_values = hist_mb_values + ")"
252 else:
253 hist_mb_values = hist_mb_values[0:len(hist_mb_values) - 2] + ")"
254
255 hist_5kb_values = "c("
256 for position in sorted(hist_dict_5kb):
257 hist_5kb_values = hist_5kb_values + str(hist_dict_5kb[position]) + ", "
258
259 if len(hist_5kb_values) == 2:
260 hist_5kb_values = hist_5kb_values + ")"
261 else:
262 hist_5kb_values = hist_5kb_values[0:len(hist_5kb_values) - 2] + ")"
263
264 r("xz <- " + hist_mb_values)
265 r("yz <- " + hist_5kb_values)
266
267 max_break_str = str(max_breaks)
268 break_unit_str = str(Decimal(break_unit))
269 half_break_unit_str = str(Decimal(break_unit) / Decimal(2))
270 break_penta_unit_str = str(Decimal(break_unit) * Decimal(5))
271
272 if (standardize=='true'):
273 r("plot(x, y, ,cex=0.60, xlim=c(0," + max_break_str + "), main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')")
234 r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')") 274 r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')")
235 r("axis(1, at=seq(0, 21, by=1), labels=FALSE, tcl=-0.5)") 275 r("axis(1, at=seq(0, " + max_break_str + ", by=" + break_unit_str + "), labels=FALSE, tcl=-0.5)")
236 r("axis(1, at=seq(0, 21, by=0.5), labels=FALSE, tcl=-0.25)") 276 r("axis(1, at=seq(0, " + max_break_str + ", by=" + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
237 r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)") 277 r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)")
238 elif (standardize=='false'): 278 elif (standardize=='false'):
239 r("plot(x, y, cex=0.60, main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='HA Ratio [Hawaiian Reads / Total Read Depth]', pch=18, col='"+ points_color +"')") 279 r("plot(x, y, cex=0.60, main='LG " + chr + "', xlab= '" + x_label + "', ylim = c(0, %f " %d_yaxis + "), ylab='Ratios of mapping strain alleles/total reads (at SNP positions)', pch=18, col='"+ points_color +"')")
240 r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')") 280 r("lines(loess.smooth(x, y, span = %f "%loess_span + "), lwd=5, col='"+ loess_color +"')")
241 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by=1), labels=FALSE, tcl=-0.5)") 281 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
242 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by=0.5), labels=FALSE, tcl=-0.25)") 282 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
243 r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)") 283 r("axis(2, at=seq(floor(min(y)), 1, by=0.1), labels=FALSE, tcl=-0.2)")
244 284
245
246 if draw_secondary_grid_lines: 285 if draw_secondary_grid_lines:
247 r("abline(h = seq(floor(min(y)), 1, by=0.1), v = seq(floor(min(x)), length(x), by= 1), col='gray')") 286 r("abline(h = seq(floor(min(y)), 1, by=0.1), v = seq(floor(min(x)), length(x), by= 1), col='gray')")
248 else: 287 else:
249 r("grid(lty = 1, col = 'gray')") 288 r("grid(lty = 1, col = 'gray')")
250 289
251 if (standardize=='true'): 290 if (standardize=='true'):
252 r("hist(xz, col='darkgray', xlim=c(0,21), xlab='Location (Mb)', ylab='Frequency SNP Positions Where HA Ratio=0 ', ylim=c(0, %f " %h_yaxis + "), breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=1), main='LG " + chr + "')") 291 r("barplot(xz, xlim=c(0, " + max_break_str + "), ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = " + break_unit_str + ", xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')")
253 r("hist(xz, add=TRUE, col=rgb(1, 0, 0, 1), xlim=c(0,21), xlab='Location (Mb)', ylab='Frequency SNP Positions Where HA Ratio=0 ', ylim=c(0, %f " %h_yaxis + "), breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=.5), main='Chr " + chr + "')") 292 r("barplot(yz, space = 0, add=TRUE, width = " + half_break_unit_str + ", col=rgb(1, 0, 0, 1))")
254 r("axis(1, at=seq(0, 21, by=1), labels=FALSE, tcl=-0.5)") 293 r("axis(1, hadj = 1, at=seq(0, " + max_break_str + ", by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
255 r("axis(1, at=seq(0, 21, by=0.5), labels=FALSE, tcl=-0.25)") 294 r("axis(1, at=seq(0, " + max_break_str + ", by= " + break_penta_unit_str + "), labels=TRUE, tcl=-0.5)")
295 r("axis(1, at=seq(0, " + max_break_str + ", by= " + half_break_unit_str + "), labels=FALSE, tcl=-0.25)")
256 elif (standardize=='false'): 296 elif (standardize=='false'):
257 r("hist(xz, col='darkgray', xlab='Location (Mb)', ylab='Frequency SNP Positions Where HA Ratio=0 ', ylim=c(0, %f " %h_yaxis + "), breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=1), main='LG " + chr + "')") 297 r("barplot(xz, ylim = c(0, " + str(h_yaxis) + "), yaxp=c(0, " + str(h_yaxis) + ", 1), space = 0, col='darkgray', width = 1, xlab='Location (Mb)', ylab='Normalized frequency of pure parental alleles ', main='LG " + chr + "')")
258 r("hist(xz, add=TRUE, col=rgb(1, 0, 0, 1), xlab='Location (Mb)', ylab='Frequency SNP Positions Where HA Ratio=0 ', ylim=c(0, %f " %h_yaxis + "), breaks = seq(0, as.integer( ' " + str(breaks) + " '), by=.5), main='Chr " + chr + "')") 298 r("barplot(yz, space = 0, add=TRUE, width = 0.5, col=rgb(1, 0, 0, 1))")
259 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by=1), labels=FALSE, tcl=-0.5)") 299 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_unit_str + "), labels=FALSE, tcl=-0.5)")
260 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by=0.5), labels=FALSE, tcl=-0.25)") 300 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_penta_unit_str + ", labels=TRUE, tcl=-0.5)")
301 r("axis(1, at=seq(0, as.integer( ' " + str(breaks) + " '), by= " + break_half_unit_str + "), labels=FALSE, tcl=-0.25)")
261 302
262 303
263 def build_haw_snp_dictionary(haw_vcf = None): 304 def build_haw_snp_dictionary(haw_vcf = None):
264 haw_snps = {} 305 haw_snps = {}
265 306
276 317
277 position = row[1] 318 position = row[1]
278 haw_snp_id = row[2] 319 haw_snp_id = row[2]
279 ref_allele = row[3] 320 ref_allele = row[3]
280 alt_allele = row[4] 321 alt_allele = row[4]
281 info = row[7] 322
323 info = row[7]
282 324
283 mapping_unit = info.replace("MPU=", "") 325 mapping_unit = info.replace("MPU=", "")
284 326
285 location = chromosome + ":" + position 327 location = chromosome + ":" + position
286 haw_snps[location] = (alt_allele, haw_snp_id, mapping_unit) 328 haw_snps[location] = (alt_allele, haw_snp_id, mapping_unit)
287 329
288 i_file.close() 330 i_file.close()
289 331
290 return haw_snps 332 return haw_snps
333
334 def calculate_normalized_histogram_bins_per_xbase(pileup_info = None, xbase = 1000000, do_not_normalize = False):
335 normalized_histogram_bins_per_xbase = {}
336
337 ref_snp_count_per_xbase = get_ref_snp_count_per_xbase(pileup_info = pileup_info, xbase = xbase)
338 mean_zero_snp_count_per_chromosome = get_mean_zero_snp_count_per_chromosome(pileup_info = pileup_info, xbase = xbase)
339 zero_snp_count_per_xbase = get_zero_snp_count_per_xbase(pileup_info = pileup_info, xbase = xbase)
340
341 for location in ref_snp_count_per_xbase:
342 chromosome = location.split(':')[0]
343 mean_zero_snp_count = mean_zero_snp_count_per_chromosome[chromosome]
344 ref_snp_count = ref_snp_count_per_xbase[location]
345
346 zero_snp_count = 0
347 if location in zero_snp_count_per_xbase:
348 zero_snp_count = zero_snp_count_per_xbase[location]
349
350 if do_not_normalize == True:
351 normalized_histogram_bins_per_xbase[location] = zero_snp_count
352 else:
353 if zero_snp_count == 0 or ref_snp_count == 0:
354 normalized_histogram_bins_per_xbase[location] = 0
355 elif zero_snp_count == ref_snp_count:
356 normalized_histogram_bins_per_xbase[location] = 0
357 else:
358 normalized_histogram_bins_per_xbase[location] = (Decimal(zero_snp_count) / (Decimal(ref_snp_count)-Decimal(zero_snp_count))) * Decimal(mean_zero_snp_count)
359
360 return normalized_histogram_bins_per_xbase
361
362 def get_ref_snp_count_per_xbase(pileup_info = None, xbase = 1000000):
363 ref_snps_per_xbase = {}
364
365 for location in pileup_info:
366 location_info = location.split(':')
367
368 chromosome = location_info[0].upper()
369 chromosome = re.sub("chr", "", chromosome, flags = re.IGNORECASE)
370 chromosome = re.sub("CHROMOSOME_", "", chromosome, flags = re.IGNORECASE)
371
372 position = location_info[1]
373 xbase_position = (int(position) / xbase) + 1
374
375 location = chromosome + ":" + str(xbase_position)
376 if location in ref_snps_per_xbase:
377 ref_snps_per_xbase[location] = ref_snps_per_xbase[location] + 1
378 else:
379 ref_snps_per_xbase[location] = 1
380
381 return ref_snps_per_xbase
382
383 def get_mean_zero_snp_count_per_chromosome(pileup_info, xbase = 1000000):
384 sample_snp_count_per_xbase = {}
385
386 for location in pileup_info:
387 alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]
388
389 location_info = location.split(':')
390 chromosome = location_info[0]
391 position = location_info[1]
392 xbase_position = (int(position) / xbase) + 1
393 xbase_location = chromosome + ":" + str(xbase_position)
394
395 if alt_allele_count == 0:
396 if xbase_location in sample_snp_count_per_xbase:
397 sample_snp_count_per_xbase[xbase_location] = sample_snp_count_per_xbase[xbase_location] + 1
398 else:
399 sample_snp_count_per_xbase[xbase_location] = 1
400
401 elif alt_allele_count != 0 and xbase_location not in sample_snp_count_per_xbase:
402 sample_snp_count_per_xbase[xbase_location] = 0
403
404 mean_zero_snp_count_per_chromosome = {}
405 for location in sample_snp_count_per_xbase:
406 chromosome = location.split(':')[0]
407 sample_count = sample_snp_count_per_xbase[location]
408 if chromosome in mean_zero_snp_count_per_chromosome:
409 mean_zero_snp_count_per_chromosome[chromosome].append(sample_count)
410 else:
411 mean_zero_snp_count_per_chromosome[chromosome] = [sample_count]
412
413 for chromosome in mean_zero_snp_count_per_chromosome:
414 summa = sum(mean_zero_snp_count_per_chromosome[chromosome])
415 count = len(mean_zero_snp_count_per_chromosome[chromosome])
416
417 mean_zero_snp_count_per_chromosome[chromosome] = Decimal(summa) / Decimal(count)
418
419 return mean_zero_snp_count_per_chromosome
420
421 def get_zero_snp_count_per_xbase(pileup_info = None, xbase = 1000000):
422 zero_snp_count_per_xbase = {}
423
424 for location in pileup_info:
425 alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit = pileup_info[location]
426
427 location_info = location.split(':')
428 chromosome = location_info[0]
429 position = location_info[1]
430 xbase_position = (int(position) / xbase) + 1
431 xbase_location = chromosome + ":" + str(xbase_position)
432
433 if alt_allele_count == 0:
434 if xbase_location in zero_snp_count_per_xbase:
435 zero_snp_count_per_xbase[xbase_location] = zero_snp_count_per_xbase[xbase_location] + 1
436 else:
437 zero_snp_count_per_xbase[xbase_location] = 1
438
439 elif alt_allele_count != 0 and xbase_location not in zero_snp_count_per_xbase:
440 zero_snp_count_per_xbase[xbase_location] = 0
441
442 return zero_snp_count_per_xbase
291 443
292 def parse_pileup(sample_pileup = None, haw_snps = None): 444 def parse_pileup(sample_pileup = None, haw_snps = None):
293 i_file = open(sample_pileup, 'rU') 445 i_file = open(sample_pileup, 'rU')
294 reader = csv.reader(i_file, delimiter = '\t', quoting = csv.QUOTE_NONE) 446 reader = csv.reader(i_file, delimiter = '\t', quoting = csv.QUOTE_NONE)
295 447
307 459
308 location = chromosome + ":" + position 460 location = chromosome + ":" + position
309 if location in haw_snps: 461 if location in haw_snps:
310 alt_allele, haw_snp_id, mapping_unit = haw_snps[location] 462 alt_allele, haw_snp_id, mapping_unit = haw_snps[location]
311 ref_allele_count, alt_allele_count = parse_read_bases(read_bases = read_bases, alt_allele = alt_allele) 463 ref_allele_count, alt_allele_count = parse_read_bases(read_bases = read_bases, alt_allele = alt_allele)
312 464
313 getcontext().prec = 6 465 if Decimal(read_depth!=0):
314 ratio = Decimal(alt_allele_count) / Decimal(read_depth) 466 getcontext().prec = 6
315 467 ratio = Decimal(alt_allele_count) / Decimal(read_depth)
316 pileup_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit) 468 #ratio = Decimal(alt_allele_count) / Decimal(ref_allele_count)
317 469
318 #debug line 470 pileup_info[location] = (alt_allele_count, ref_allele_count, read_depth, ratio, haw_snp_id, mapping_unit)
319 #print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id 471
472 #debug line
473 #print chromosome, position, read_depth, ref_allele_count, alt_allele_count, ratio, id
320 474
321 i_file.close() 475 i_file.close()
322 476
323 return pileup_info 477 return pileup_info
324 478