Mercurial > repos > gregory-minevich > ems_variant_density_mapping
comparison EMS_VariantDensityMapping.py @ 8:9a760d986268 draft
Uploaded
author | gregory-minevich |
---|---|
date | Mon, 25 Jun 2012 16:08:52 -0400 |
parents | 6d2dbdfa11e3 |
children |
comparison
equal
deleted
inserted
replaced
7:1bee9742af9b | 8:9a760d986268 |
---|---|
1 #!/usr/bin/python | |
2 | |
3 import re | |
4 import sys | |
5 import optparse | |
6 import csv | |
7 from rpy import * | |
8 | |
def main():
    """Command-line entry point: parse options, bin SNPs, write the plot PDF.

    Options:
        -s/--snp_vcf      path of the SNP VCF to read (required)
        -c/--hist_color   colour of the 1 Mb histogram bars (default "darkgray")
        -y/--ylim         upper limit of the Y axis (default 100)
        -z/--standardize  'true' or 'false' string; standardize the X axis
        -e/--ems          'true' or 'false' string; keep only EMS variants
        -o/--output       output PDF file name

    The -z and -e values are passed through unchanged as strings; the
    downstream functions compare them against the literals 'true'/'false'.

    Exits with status 2 (via ``parser.error``) when no VCF is supplied.
    """
    parser = optparse.OptionParser()
    parser.add_option('-s', '--snp_vcf', dest='snp_vcf', action='store', type='string', default=None, help="VCF of SNPs")
    parser.add_option('-c', '--hist_color', dest='hist_color', action='store', type='string', default="darkgray", help="Color for 1Mb histograms")
    parser.add_option('-y', '--ylim', dest='ylim', action='store', type='int', default=100, help="Upper limit of Y axis")
    parser.add_option('-z', '--standardize', dest='standardize', default='false', help="Standardize X-axis")
    parser.add_option('-e', '--ems', dest='ems', default='false', help="Whether EMS variants should be filtered for")
    parser.add_option('-o', '--output', dest='plot_output', action='store', type='string', default='EMS_Variant_Density_Plot.pdf', help="Output file name of plot")
    options, _args = parser.parse_args()

    # Without this check a missing -s reaches open(None) and fails with an
    # unhelpful TypeError instead of a usage message.
    if options.snp_vcf is None:
        parser.error("a SNP VCF file is required (-s/--snp_vcf)")

    i, ii, iii, iv, v, x = parse_snp_vcf(snp_vcf=options.snp_vcf, ems=options.ems)
    create_histograms(
        plot_output=options.plot_output,
        hist_color=options.hist_color,
        ylim=options.ylim,
        ems=options.ems,
        standardize=options.standardize,
        i=i, ii=ii, iii=iii, iv=iv, v=v, x=x,
    )
22 | |
def create_histograms(plot_output=None, hist_color=None, ylim=None, ems=None, standardize=None, i=None, ii=None, iii=None, iv=None, v=None, x=None):
    """Write one histogram page per non-empty chromosome to a PDF.

    Parameters:
        plot_output  path of the PDF file handed to R's ``pdf`` device
        hist_color, ylim, ems, standardize
                     forwarded unchanged to ``plot_data``
        i, ii, iii, iv, v, x
                     position lists for chromosomes I, II, III, IV, V and X

    Any exception raised while plotting is reported on stdout rather than
    propagated (best-effort behaviour kept from the original script).
    """
    # (chromosome label, upper bound handed to plot_data as ``breaks``,
    # positions), listed in plotting order.
    chromosomes = (
        ("I", 16, i),
        ("II", 16, ii),
        ("III", 14, iii),
        ("IV", 18, iv),
        ("V", 21, v),
        ("X", 18, x),
    )

    try:
        r.pdf(plot_output, 8, 8)
        try:
            for label, upper_bound, position_list in chromosomes:
                if len(position_list) > 0:
                    plot_data(position_list=position_list, chr=label, breaks=upper_bound, hist_color=hist_color, ylim=ylim, ems=ems, standardize=standardize)
        finally:
            # Close the device even when a plot fails; otherwise the PDF is
            # left open and the output file is never finalized.
            r.dev_off()
    except Exception as inst:
        print(inst)
        print("There was an error creating the plot pdf... Please try again")
44 | |
def parse_snp_vcf(snp_vcf=None, ems=None):
    """Collect SNP positions per chromosome from a tab-delimited VCF file.

    Parameters:
        snp_vcf  path of the VCF file to read
        ems      'true' keeps only G/C -> A/T substitutions, 'false' keeps
                 every record; any other value keeps nothing

    Returns:
        A tuple of six lists (chromosomes I, II, III, IV, V, X) holding the
        POS column values as strings, in file order. Records on any other
        chromosome are ignored.

    Chromosome names are matched case-insensitively after removing a leading
    "CHROMOSOME_" or "chr" prefix. Blank rows and rows whose first field
    starts with '#' (VCF headers) are skipped.
    """
    by_chromosome = dict((label, []) for label in ("I", "II", "III", "IV", "V", "X"))

    ems_only = (ems == 'true')
    keep_all = (ems == 'false')

    # The longer prefix must be tried first: stripping "CHR" first would turn
    # "CHROMOSOME_I" into "OMOSOME_I", which then matches no chromosome.
    prefix = re.compile(r'^(?:CHROMOSOME_|CHR)')

    # 'U' (universal newlines) is needed on Python 2 but rejected by
    # Python 3.11+, where universal newlines are already the default.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    with open(snp_vcf, mode) as i_file:
        reader = csv.reader(i_file, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row or row[0].startswith('#'):
                continue

            chromosome = prefix.sub("", row[0].upper())
            position = row[1]
            ref_allele = row[3]
            alt_allele = row[4]

            if ems_only:
                if not (ref_allele in ("G", "C") and alt_allele in ("A", "T")):
                    continue
            elif not keep_all:
                continue

            bucket = by_chromosome.get(chromosome)
            if bucket is not None:
                bucket.append(position)

    return (
        by_chromosome["I"],
        by_chromosome["II"],
        by_chromosome["III"],
        by_chromosome["IV"],
        by_chromosome["V"],
        by_chromosome["X"],
    )
96 | |
def skip_headers(reader=None, i_file=None):
    """Advance ``reader`` past the leading '#' header rows of ``i_file``.

    Parameters:
        reader  csv reader built on top of ``i_file``
        i_file  the open, seekable file object the reader consumes

    The header rows are counted first, which also consumes the first data
    row; the file is then rewound and exactly that many rows are skipped, so
    the next row the reader yields is the first non-header row.

    A blank row ends the header section, and a file that is empty or holds
    only header rows leaves the reader at end of file instead of raising.
    """
    # Count the leading header rows.
    header_rows = 0
    for row in reader:
        if not (row and row[0].startswith('#')):
            break
        header_rows += 1

    # Rewind and consume only the header rows.
    i_file.seek(0)
    for _ in range(header_rows):
        next(reader)
107 | |
def plot_data(position_list=None, chr=None, breaks=None, hist_color=None, ylim=None, ems=None, standardize=None):
    """Draw the two overlaid SNP histograms for one chromosome through R.

    Parameters:
        position_list  base-pair positions (anything ``float`` accepts)
        chr            chromosome label used in the plot titles
        breaks         upper bound of the bin sequence
        hist_color     fill colour of the coarse (by=1) histogram
        ylim           upper Y-axis limit, formatted with ``%d``
        ems            accepted but not used here
        standardize    'true' fixes the X axis at 0-21, 'false' scales it to
                       ``breaks``; any other value draws nothing
    """
    # Positions are divided by 1,000,000 (the axis label is "Location (Mb)")
    # and rendered as an R vector literal.
    scaled = [str(float(bp) / 1000000) for bp in position_list]
    positions = "c(" + ",".join(scaled) + ")"

    if standardize != 'true' and standardize != 'false':
        return

    # R expression for the end of the bin sequence; the X-axis upper bound is
    # either that same expression or the fixed value 21.
    bin_end = "as.integer( ' " + str(breaks) + " ')"
    x_end = "21" if standardize == 'true' else bin_end

    coarse_hist = (
        "hist(" + positions
        + ", xlim=c(0," + x_end
        + "), ylim=c(0, %d " % ylim
        + "),col='" + hist_color
        + "', breaks = seq(0, " + bin_end
        + ", by=1), main = 'LG " + chr
        + "', ylab = 'Frequency Of SNPs', xlab = 'Location (Mb)')"
    )
    fine_hist = (
        "hist(" + positions
        + ", xlim=c(0," + x_end
        + "), add=TRUE, ylim=c(0, %d " % ylim
        + "), col=rgb(1, 0, 0, 1), breaks = seq(0, " + bin_end
        + ", by=.5), main = 'Chr " + chr
        + "', ylab = 'Number Of SNPs', xlab = 'Location (Mb)')"
    )
    major_ticks = "axis(1, at=seq(0, " + x_end + ", by=1), labels=FALSE, tcl=-0.5)"
    minor_ticks = "axis(1, at=seq(0, " + x_end + ", by=0.5), labels=FALSE, tcl=-0.25)"

    for command in (coarse_hist, fine_hist, major_ticks, minor_ticks):
        r(command)
122 | |
123 | |
124 | |
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()