7
|
1 # Consol_fit! It's a script & it'll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That's all.
|
|
2 # K. McCoy
|
|
3
|
|
4 import math
|
|
5 import csv
|
|
6
|
|
7
|
|
8
|
|
9
|
|
10
|
|
11
|
|
12
|
|
13
|
|
14
|
|
15
|
|
16 ##### ARGUMENTS #####
|
|
17
|
|
def print_usage():
    """Print the list of command-line flags understood by consol_fit.

    Called when a required flag is missing. Writes to standard output and
    returns None. Each entry is followed by a blank line, and the two
    section titles are wrapped in ANSI bold escape sequences.

    The text mirrors what the argument handling actually does: only -i,
    -out and -calctxt are checked at start-up, -cutoff2 falls back to 10
    and -maxweight falls back to 75.
    """
    bold_on = "\033[1m"
    bold_off = "\033[0m"

    # (flag, tab padding, description) - the padding keeps the descriptions
    # roughly aligned in a terminal.
    required_flags = [
        ("-i", "\t\t", "The calc_fit file to be consolidated"),
        ("-out", "\t\t", "Name of a file to enter the .csv output."),
        ("-calctxt", "\t\t", "The txt file output from calc_fit"),
    ]
    # NOTE(review): -cutoff is parsed and defaulted, but nothing in the
    # visible part of this script reads it afterwards - confirm whether the
    # description below still applies.
    optional_flags = [
        ("-normalize", "\t", "A file that contains a list of genes that should have a fitness of 1"),
        ("-out2", "\t\t", "Name of a file to put the percent blank score in (used in aggregate); required whenever -normalize is used."),
        ("-cutoff", "\t\t", "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)"),
        ("-cutoff2", "\t\t", "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 10)"),
        ("-wig", "\t\t", "Create a wiggle file for viewing in a genome browser. Provide a filename."),
        ("-maxweight", "\t", "The maximum weight a transposon gene can have in normalization calculations (default 75)"),
        ("-multiply", "\t", "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data."),
    ]

    print("\n" + "You are missing one or more required flags. A complete list of flags accepted by consol_fit is as follows:" + "\n\n")
    print(bold_on + "Required" + bold_off + "\n")
    for flag, padding, description in required_flags:
        print(flag + padding + description + "\n")
    print("\n")
    print(bold_on + "Optional" + bold_off + "\n")
    for flag, padding, description in optional_flags:
        print(flag + padding + description + "\n")
    print("\n")
|
|
34
|
|
import argparse
import sys

# Every option is stored as the raw string typed on the command line (or
# None when absent); numeric conversion happens at the point of use.
parser = argparse.ArgumentParser()
parser.add_argument("-calctxt", action="store", dest="calctxt")
parser.add_argument("-normalize", action="store", dest="normalize")
parser.add_argument("-i", action="store", dest="input")
parser.add_argument("-out", action="store", dest="outfile")
parser.add_argument("-out2", action="store", dest="outfile2")
parser.add_argument("-cutoff", action="store", dest="cutoff")
parser.add_argument("-cutoff2", action="store", dest="cutoff2")
parser.add_argument("-wig", action="store", dest="wig")
parser.add_argument("-maxweight", action="store", dest="max_weight")
parser.add_argument("-multiply", action="store", dest="multiply")
arguments = parser.parse_args()

# -i, -out and -calctxt are always needed. -out2 becomes mandatory as soon
# as -normalize is given, because the normalization step opens that file to
# record the blank percentage; catching it here avoids failing half-way
# through the run.
missing_core_flag = not (arguments.input and arguments.outfile and arguments.calctxt)
missing_out2 = bool(arguments.normalize) and not arguments.outfile2
if missing_core_flag or missing_out2:
    print_usage()
    # Non-zero status so that calling pipelines can detect the usage error.
    sys.exit(1)

# Fall back to the built-in defaults for any option left unset.
#
# Cutoff2 only has an effect if it's larger than cutoff, since the
# normalization step references a list of insertions already affected by
# cutoff.
# NOTE(review): -cutoff itself is not read anywhere else in the visible part
# of this script - confirm the statement above still holds here.
for option_name, fallback in (("max_weight", 75), ("cutoff", 0), ("cutoff2", 10)):
    if not getattr(arguments, option_name):
        setattr(arguments, option_name, fallback)
|
|
63
|
|
# Pull the total and the reference name out of the calc_fit text summary:
# the second line holds the total and the third line the reference name,
# each as the second whitespace-separated token on its line.
with open(arguments.calctxt) as summary_handle:
    calctxt = summary_handle.readlines()

total_line = calctxt[1]
total = float(total_line.split()[1])

refname_line = calctxt[2]
refname = refname_line.split()[1]
|
|
70
|
|
71
|
|
72
|
|
73
|
|
74
|
|
75
|
|
76
|
|
77
|
|
78
|
|
79
|
|
##### CONSOLIDATING THE CALC_FIT FILE #####

def _merge_strand_code(current, incoming):
    """Combine two one-character strand codes.

    An empty code (no strand recorded on that side) yields the other code.
    Identical codes are kept; any disagreement - '+' with '-', or either of
    them with 'b' - collapses to 'b'.
    """
    if not current:
        return incoming
    if not incoming:
        return current
    if current == incoming:
        return current
    return 'b'


def _merge_strand_fields(current, incoming):
    """Merge two strand fields shaped like 'X/' or 'X/Y'.

    The code before the slash and the optional code after it are merged
    independently, so the result reflects every row folded into the group
    rather than only the most recent one.
    """
    before_slash = _merge_strand_code(current[:1], incoming[:1])
    after_slash = _merge_strand_code(current[2:3], incoming[2:3])
    return before_slash + '/' + after_slash


# Parse with the csv module (the input is calc_fit's .csv output) and drop
# completely empty lines so a trailing newline cannot break the float()
# conversions below.
with open(arguments.input) as calc_fit_file:
    rows = [row for row in csv.reader(calc_fit_file) if row]

results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]

# The "D" column of the first data row is reused for every consolidated
# site. With a header-only file there is nothing to consolidate, and only
# the header is written out.
d = float(rows[1][10]) if len(rows) > 1 else None

i = 1
while i < len(rows):
    first_row = rows[i]
    position = float(first_row[0])
    strands = first_row[1]
    c1 = float(first_row[2])
    c2 = float(first_row[3])
    gene = first_row[9]

    # Fold in every following row that lies within 4 positions of the
    # group's FIRST row: counts are summed and strand information merged.
    while i + 1 < len(rows) and float(rows[i + 1][0]) - position <= 4:
        i += 1
        c1 += float(rows[i][2])
        c2 += float(rows[i][3])
        strands = _merge_strand_fields(strands, rows[i][1])
    i += 1

    # Recompute the derived columns from the summed counts.
    ratio = c2 / c1 if c2 != 0 else 0
    mt_freq_t1 = c1 / total
    mt_freq_t2 = c2 / total
    pop_freq_t1 = 1 - mt_freq_t1
    pop_freq_t2 = 1 - mt_freq_t2
    w = 0
    if mt_freq_t2 != 0:
        top_w = math.log(mt_freq_t2 * (d / mt_freq_t1))
        bot_w = math.log(pop_freq_t2 * (d / pop_freq_t1))
        w = top_w / bot_w

    # nW starts out equal to W; the normalization step overwrites it.
    results.append([position, strands, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, d, w, w])

with open(arguments.outfile, "wb") as csvfile:
    csv.writer(csvfile).writerows(results)
|
|
157
|
|
158
|
|
159
|
|
160
|
|
161
|
|
162
|
|
163
|
|
164
|
|
165
|
|
166
|
|
167 ##### REDOING NORMALIZATION #####
|
|
168
|
|
# Start the WIG text with the usual two header lines - a "track" line and a
# "variableStep" declaration. UCSC describes the WIG file format on their
# site if you'd like to look into this more.
if arguments.wig:
    header_parts = [
        "track type=wiggle_0 name=", arguments.wig, "\n",
        "variableStep chrom=", refname, "\n",
    ]
    wigstring = "".join(header_parts)
|
|
173
|
|
if arguments.normalize:
    import sys

    # One gene name per line; a set keeps the per-row membership test cheap.
    with open(arguments.normalize) as gene_file:
        transposon_genes = set(gene_file.read().splitlines())
    print("Normalize genes loaded" + "\n")

    cutoff2 = float(arguments.cutoff2)
    max_weight = float(arguments.max_weight)

    score_total = 0
    count = 0
    weights = []
    scores = []

    # results[0] is the header row, so only the data rows are scanned.
    for row in results[1:]:
        # NOTE(review): the truthiness test on W (row[11]) drops rows whose
        # fitness is 0 before they are tallied, so the blank count computed
        # below is always 0 - confirm whether that is intended.
        if row[9] != '' and row[9] in transposon_genes and row[11]:
            c1 = row[2]
            c2 = row[3]
            score = row[11]
            avg = (c1 + c2) / 2

            # Skips over those insertion locations with too few insertions -
            # their fitness values are less accurate because they're based
            # on such small insertion numbers.
            if float(c1) >= cutoff2:

                # Sets a max weight, to prevent insertion location scores
                # with huge weights from unbalancing the normalization.
                if avg >= max_weight:
                    avg = max_weight

                score_total += score
                count += 1
                weights.append(avg)
                scores.append(score)

                print(str(row[9]) + " " + str(score) + " " + str(c1))

    # Counts and removes all "blank" fitness values of normalization genes -
    # those that = 0 - because they most likely don't really have a fitness
    # value of 0, and you just happened to not get any reads from that
    # location at t2. You might get many of them if a bottleneck occurs,
    # which is especially common with in vivo experiments.
    original_count = len(scores)
    kept_pairs = [(weight, score) for weight, score in zip(weights, scores) if score != 0]
    blank_count = original_count - len(kept_pairs)
    weights = [weight for weight, _ in kept_pairs]
    scores = [score for _, score in kept_pairs]

    # If no normalization genes can pass the cutoff, normalization cannot
    # occur, so the script stops here and advises the user to try again
    # with a lower cutoff and/or cutoff2.
    if not scores:
        print('ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n")
        sys.exit(1)

    pc_blank_normals = float(blank_count) / float(original_count)
    print("# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n")
    with open(arguments.outfile2, "w") as f:
        f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)

    # Plain mean over every tallied score, and a mean weighted by the
    # (capped) average read count over the non-blank scores.
    average = score_total / count
    weighted_sum = sum(weight * score for weight, score in zip(weights, scores))
    weight_sum = sum(weights)
    weighted_average = weighted_sum / weight_sum

    print("Normalization step:" + "\n")
    print("Regular average: " + str(average) + "\n")
    print("Weighted Average: " + str(weighted_average) + "\n")
    print("Total Insertions: " + str(count) + "\n")

    old_ws = 0
    new_ws = 0
    wcount = 0
    wig_lines = []
    for row in results[1:]:
        old_w = float(row[11])
        new_w = old_w / weighted_average

        # Sometimes you want to multiply all the fitness values by a
        # constant; this does that. For example you might do so for a
        # genetic interaction screen - where Tn-Seq is performed as usual
        # except there's one background knockout all the mutants share.
        if arguments.multiply:
            new_w *= float(arguments.multiply)

        # Only positive fitness values feed the before/after averages.
        if old_w > 0:
            old_ws += old_w
            new_ws += new_w
            wcount += 1

        row[12] = new_w

        if arguments.wig:
            wig_lines.append(str(row[0]) + " " + str(new_w) + "\n")

    if arguments.wig:
        wigstring += "".join(wig_lines)

    # Guard the means against a data set without any positive fitness value.
    old_w_mean = old_ws / wcount if wcount else 0.0
    new_w_mean = new_ws / wcount if wcount else 0.0
    print("Old W Average: " + str(old_w_mean) + "\n")
    print("New W Average: " + str(new_w_mean) + "\n")

    # Rewrite the CSV so the nW column carries the normalized values.
    with open(arguments.outfile, "wb") as csvfile:
        csv.writer(csvfile).writerows(results)
|
|
283
|
|
if arguments.wig:
    # When -normalize was given, the normalized data lines were already
    # appended to wigstring during normalization. Otherwise add one
    # "position W" line per data row here; results[0] is the CSV header and
    # must not end up in the WIG data.
    if not arguments.normalize:
        wigstring += "".join(
            str(row[0]) + " " + str(row[11]) + "\n" for row in results[1:]
        )
    with open(arguments.wig, "wb") as wigfile:
        wigfile.write(wigstring)
|
|
293
|
|
294
|
|
295 # ___ ___ ___ ___ ___ ___ ___ ___
|
|
296 # /\__\ /\ \ /\__\ /\__\ /\ \ /\ \ /\ \ /\__\
|
|
297 # /:/ _/_ /::\ \ |::L__L /::L_L_ /::\ \ /::\ \ /::\ \ |::L__L
|
|
298 # /::-"\__\ /::\:\__\ |:::\__\ /:/L:\__\ /:/\:\__\ /:/\:\__\ /:/\:\__\ |:::\__\
|
|
299 # \;:;-",-" \/\::/ / /:;;/__/ \/_/:/ / \:\ \/__/ \:\ \/__/ \:\/:/ / /:;;/__/
|
|
300 # |:| | /:/ / \/__/ /:/ / \:\__\ \:\__\ \::/ / \/__/
|
|
301 # \|__| \/__/ \/__/ \/__/ \/__/ \/__/ |