Mercurial > repos > kaymccoy > consolidate_fitnesses
comparison consol_fit.py @ 0:da1c63d00c1b draft
Uploaded
author | kaymccoy |
---|---|
date | Thu, 11 Aug 2016 18:07:29 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:da1c63d00c1b |
---|---|
1 # Consol_fit! It's a script & it'll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That's all. | |
2 | |
3 import math | |
4 import csv | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 ##### ARGUMENTS ##### | |
16 | |
17 def print_usage(): | |
18 print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n" | |
19 print "\033[1m" + "Required" + "\033[0m" + "\n" | |
20 print "-i" + "\t\t" + "The calc_fit file to be consolidated" + "\n" | |
21 print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n" | |
22 print "-out2" + "\t\t" + "Name of a file to put the percent blank score in (used in aggregate)." + "\n" | |
23 print "-calctxt" + "\t\t" + "The txt file output from calc_fit" + "\n" | |
24 print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n" | |
25 print "\n" | |
26 print "\033[1m" + "Optional" + "\033[0m" + "\n" | |
27 print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n" | |
28 print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n" | |
29 print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n" | |
30 print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n" | |
31 print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n" | |
32 print "\n" | |
33 | |
34 import argparse | |
35 parser = argparse.ArgumentParser() | |
36 parser.add_argument("-calctxt", action="store", dest="calctxt") | |
37 parser.add_argument("-normalize", action="store", dest="normalize") | |
38 parser.add_argument("-i", action="store", dest="input") | |
39 parser.add_argument("-out", action="store", dest="outfile") | |
40 parser.add_argument("-out2", action="store", dest="outfile2") | |
41 parser.add_argument("-cutoff", action="store", dest="cutoff") | |
42 parser.add_argument("-cutoff2", action="store", dest="cutoff2") | |
43 parser.add_argument("-wig", action="store", dest="wig") | |
44 parser.add_argument("-maxweight", action="store", dest="max_weight") | |
45 parser.add_argument("-multiply", action="store", dest="multiply") | |
46 arguments = parser.parse_args() | |
47 | |
48 if (not arguments.input or not arguments.outfile or not arguments.calctxt): | |
49 print_usage() | |
50 quit() | |
51 | |
52 if (not arguments.max_weight): | |
53 arguments.max_weight = 75 | |
54 | |
55 if (not arguments.cutoff): | |
56 arguments.cutoff = 0 | |
57 | |
58 # Cutoff2 only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff. | |
59 | |
60 if (not arguments.cutoff2): | |
61 arguments.cutoff2 = 10 | |
62 | |
63 #Gets total & refname from calc_fit outfile2 | |
64 | |
65 with open(arguments.calctxt) as file: | |
66 calctxt = file.readlines() | |
67 total = float(calctxt[1].split()[1]) | |
68 refname = calctxt[2].split()[1] | |
69 | |
70 | |
71 | |
72 | |
73 | |
74 | |
75 | |
76 | |
77 | |
78 | |
79 ##### CONSOLIDATING THE CALC_FIT FILE ##### | |
80 | |
81 with open(arguments.input) as file: | |
82 input = file.readlines() | |
83 results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]] | |
84 i = 1 | |
85 d = float(input[i].split(",")[10]) | |
86 while i < len(input): | |
87 position = float(input[i].split(",")[0]) | |
88 strands = input[i].split(",")[1] | |
89 c1 = float(input[i].split(",")[2]) | |
90 c2 = float(input[i].split(",")[3]) | |
91 gene = input[i].split(",")[9] | |
92 while i + 1 < len(input) and float(input[i+1].split(",")[0]) - position <= 4: | |
93 if i + 1 < len(input): | |
94 i += 1 | |
95 c1 += float(input[i].split(",")[2]) | |
96 c2 += float(input[i].split(",")[3]) | |
97 strands = input[i].split(",")[1] | |
98 if strands[0] == 'b': | |
99 new_strands = 'b/' | |
100 elif strands[0] == '+': | |
101 if input[i].split(",")[1][0] == 'b': | |
102 new_strands = 'b/' | |
103 elif input[i].split(",")[1][0] == '+': | |
104 new_strands = '+/' | |
105 elif input[i].split(",")[1][0] == '-': | |
106 new_strands = 'b/' | |
107 elif strands[0] == '-': | |
108 if input[i].split(",")[1][0] == 'b': | |
109 new_strands = 'b/' | |
110 elif input[i].split(",")[1][0] == '+': | |
111 new_strands = 'b/' | |
112 elif input[i].split(",")[1][0] == '-': | |
113 new_strands = '-/' | |
114 if len(strands) == 3: | |
115 if len(input[i].split(",")[1]) < 3: | |
116 new_strands += strands[2] | |
117 elif strands[0] == 'b': | |
118 new_strands += 'b' | |
119 elif strands[0] == '+': | |
120 if input[i].split(",")[1][2] == 'b': | |
121 new_strands += 'b' | |
122 elif input[i].split(",")[1][2] == '+': | |
123 new_strands += '+' | |
124 elif input[i].split(",")[1][2] == '-': | |
125 new_strands += 'b' | |
126 elif strands[0] == '-': | |
127 if input[i].split(",")[1][2] == 'b': | |
128 new_strands += 'b' | |
129 elif input[i].split(",")[1][2] == '+': | |
130 new_strands += 'b' | |
131 elif input[i].split(",")[1][2] == '-': | |
132 new_strands += '-' | |
133 else: | |
134 if len(input[i].split(",")[1]) == 3: | |
135 new_strands += input[i].split(",")[1][2] | |
136 strands = new_strands | |
137 i +=1 | |
138 if c2 != 0: | |
139 ratio = c2/c1 | |
140 else: | |
141 ratio = 0 | |
142 mt_freq_t1 = c1/total | |
143 mt_freq_t2 = c2/total | |
144 pop_freq_t1 = 1 - mt_freq_t1 | |
145 pop_freq_t2 = 1 - mt_freq_t2 | |
146 w = 0 | |
147 if mt_freq_t2 != 0: | |
148 top_w = math.log(mt_freq_t2*(d/mt_freq_t1)) | |
149 bot_w = math.log(pop_freq_t2*(d/pop_freq_t1)) | |
150 w = top_w/bot_w | |
151 row = [position, strands, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, d, w, w] | |
152 results.append(row) | |
153 with open(arguments.outfile, "wb") as csvfile: | |
154 writer = csv.writer(csvfile) | |
155 writer.writerows(results) | |
156 | |
157 | |
158 | |
159 | |
160 | |
161 | |
162 | |
163 | |
164 | |
165 | |
166 ##### REDOING NORMALIZATION ##### | |
167 | |
168 # The header below is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site. | |
169 | |
170 if (arguments.wig): | |
171 wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n" | |
172 | |
173 if (arguments.normalize): | |
174 with open(arguments.normalize) as file: | |
175 transposon_genes = file.read().splitlines() | |
176 print "Normalize genes loaded" + "\n" | |
177 blank_ws = 0 | |
178 sum = 0 | |
179 count = 0 | |
180 weights = [] | |
181 scores = [] | |
182 for list in results: | |
183 if list[9] != '' and list[9] in transposon_genes and list[11]: | |
184 c1 = list[2] | |
185 c2 = list[3] | |
186 score = list[11] | |
187 avg = (c1 + c2)/2 | |
188 | |
189 # Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers. | |
190 | |
191 if float(c1) >= float(arguments.cutoff2): | |
192 | |
193 # Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization. | |
194 | |
195 if (avg >= float(arguments.max_weight)): | |
196 avg = float(arguments.max_weight) | |
197 | |
198 # Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments. | |
199 # For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance! | |
200 | |
201 if score == 0: | |
202 blank_ws += 1 | |
203 sum += score | |
204 count += 1 | |
205 weights.append(avg) | |
206 scores.append(score) | |
207 | |
208 print str(list[9]) + " " + str(score) + " " + str(c1) | |
209 | |
210 # Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2. | |
211 | |
212 blank_count = 0 | |
213 original_count = len(scores) | |
214 i = 0 | |
215 while i < original_count: | |
216 w_value = scores[i] | |
217 if w_value == 0: | |
218 blank_count += 1 | |
219 weights.pop[i] | |
220 scores.pop[i] | |
221 i-=1 | |
222 i += 1 | |
223 | |
224 # If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2. | |
225 | |
226 if len(scores) == 0: | |
227 print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n" | |
228 quit() | |
229 | |
230 pc_blank_normals = float(blank_count) / float(original_count) | |
231 print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n" | |
232 with open(arguments.outfile2, "w") as f: | |
233 f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname) | |
234 | |
235 average = sum / count | |
236 i = 0 | |
237 weighted_sum = 0 | |
238 weight_sum = 0 | |
239 while i < len(weights): | |
240 weighted_sum += weights[i]*scores[i] | |
241 weight_sum += weights[i] | |
242 i += 1 | |
243 weighted_average = weighted_sum/weight_sum | |
244 | |
245 print "Normalization step:" + "\n" | |
246 print "Regular average: " + str(average) + "\n" | |
247 print "Weighted Average: " + str(weighted_average) + "\n" | |
248 print "Total Insertions: " + str(count) + "\n" | |
249 | |
250 old_ws = 0 | |
251 new_ws = 0 | |
252 wcount = 0 | |
253 for list in results: | |
254 if list[11] == 'W': | |
255 continue | |
256 new_w = float(list[11])/weighted_average | |
257 | |
258 # Sometimes you want to multiply all the fitness values by a constant; this does that. | |
259 # For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. | |
260 | |
261 if arguments.multiply: | |
262 new_w *= float(arguments.multiply) | |
263 | |
264 if float(list[11]) > 0: | |
265 old_ws += float(list[11]) | |
266 new_ws += new_w | |
267 wcount += 1 | |
268 | |
269 list[12] = new_w | |
270 | |
271 if (arguments.wig): | |
272 wigstring += str(list[0]) + " " + str(new_w) + "\n" | |
273 | |
274 old_w_mean = old_ws / wcount | |
275 new_w_mean = new_ws / wcount | |
276 print "Old W Average: " + str(old_w_mean) + "\n" | |
277 print "New W Average: " + str(new_w_mean) + "\n" | |
278 | |
279 with open(arguments.outfile, "wb") as csvfile: | |
280 writer = csv.writer(csvfile) | |
281 writer.writerows(results) | |
282 | |
283 if (arguments.wig): | |
284 if (arguments.normalize): | |
285 with open(arguments.wig, "wb") as wigfile: | |
286 wigfile.write(wigstring) | |
287 else: | |
288 for list in results: | |
289 wigstring += str(list[0]) + " " + str(list[11]) + "\n" | |
290 with open(arguments.wig, "wb") as wigfile: | |
291 wigfile.write(wigstring) | |
292 | |
293 | |
294 # ___ ___ ___ ___ ___ ___ ___ ___ | |
295 # /\__\ /\ \ /\__\ /\__\ /\ \ /\ \ /\ \ /\__\ | |
296 # /:/ _/_ /::\ \ |::L__L /::L_L_ /::\ \ /::\ \ /::\ \ |::L__L | |
297 # /::-"\__\ /::\:\__\ |:::\__\ /:/L:\__\ /:/\:\__\ /:/\:\__\ /:/\:\__\ |:::\__\ | |
298 # \;:;-",-" \/\::/ / /:;;/__/ \/_/:/ / \:\ \/__/ \:\ \/__/ \:\/:/ / /:;;/__/ | |
299 # |:| | /:/ / \/__/ /:/ / \:\__\ \:\__\ \::/ / \/__/ | |
300 # \|__| \/__/ \/__/ \/__/ \/__/ \/__/ |