# HG changeset patch # User kaymccoy # Date 1471034668 14400 # Node ID 4b2fc35b32b0f136e62171723e1a76b6903762e1 # Parent b2943ce019bbb82c993180f96fd257c452adc026 Uploaded diff -r b2943ce019bb -r 4b2fc35b32b0 consol_fit.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consol_fit.py Fri Aug 12 16:44:28 2016 -0400 @@ -0,0 +1,301 @@ +# Consol_fit! It's a script & it'll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That's all. +# K. McCoy + +import math +import csv + + + + + + + + + + +##### ARGUMENTS ##### + +def print_usage(): + print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n" + print "\033[1m" + "Required" + "\033[0m" + "\n" + print "-i" + "\t\t" + "The calc_fit file to be consolidated" + "\n" + print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n" + print "-out2" + "\t\t" + "Name of a file to put the percent blank score in (used in aggregate)." + "\n" + print "-calctxt" + "\t\t" + "The txt file output from calc_fit" + "\n" + print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n" + print "\n" + print "\033[1m" + "Optional" + "\033[0m" + "\n" + print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n" + print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n" + print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n" + print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n" + print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n" + print "\n" + +import argparse +parser = argparse.ArgumentParser() +parser.add_argument("-calctxt", action="store", dest="calctxt") +parser.add_argument("-normalize", action="store", dest="normalize") +parser.add_argument("-i", action="store", dest="input") +parser.add_argument("-out", action="store", dest="outfile") +parser.add_argument("-out2", action="store", dest="outfile2") +parser.add_argument("-cutoff", action="store", dest="cutoff") +parser.add_argument("-cutoff2", action="store", dest="cutoff2") +parser.add_argument("-wig", action="store", dest="wig") +parser.add_argument("-maxweight", action="store", dest="max_weight") +parser.add_argument("-multiply", action="store", dest="multiply") +arguments = parser.parse_args() + +if (not arguments.input or not arguments.outfile or not arguments.calctxt): + print_usage() + quit() + +if (not arguments.max_weight): + arguments.max_weight = 75 + +if (not arguments.cutoff): + arguments.cutoff = 0 + +# Cutoff2 only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff. + +if (not arguments.cutoff2): + arguments.cutoff2 = 10 + +#Gets total & refname from calc_fit outfile2 + +with open(arguments.calctxt) as file: + calctxt = file.readlines() +total = float(calctxt[1].split()[1]) +refname = calctxt[2].split()[1] + + + + + + + + + + +##### CONSOLIDATING THE CALC_FIT FILE ##### + +with open(arguments.input) as file: + input = file.readlines() +results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]] +i = 1 +d = float(input[i].split(",")[10]) +while i < len(input): + position = float(input[i].split(",")[0]) + strands = input[i].split(",")[1] + c1 = float(input[i].split(",")[2]) + c2 = float(input[i].split(",")[3]) + gene = input[i].split(",")[9] + while i + 1 < len(input) and float(input[i+1].split(",")[0]) - position <= 4: + if i + 1 < len(input): + i += 1 + c1 += float(input[i].split(",")[2]) + c2 += float(input[i].split(",")[3]) + strands = input[i].split(",")[1] + if strands[0] == 'b': + new_strands = 'b/' + elif strands[0] == '+': + if input[i].split(",")[1][0] == 'b': + new_strands = 'b/' + elif input[i].split(",")[1][0] == '+': + new_strands = '+/' + elif input[i].split(",")[1][0] == '-': + new_strands = 'b/' + elif strands[0] == '-': + if input[i].split(",")[1][0] == 'b': + new_strands = 'b/' + elif input[i].split(",")[1][0] == '+': + new_strands = 'b/' + elif input[i].split(",")[1][0] == '-': + new_strands = '-/' + if len(strands) == 3: + if len(input[i].split(",")[1]) < 3: + new_strands += strands[2] + elif strands[0] == 'b': + new_strands += 'b' + elif strands[0] == '+': + if input[i].split(",")[1][2] == 'b': + new_strands += 'b' + elif input[i].split(",")[1][2] == '+': + new_strands += '+' + elif input[i].split(",")[1][2] == '-': + new_strands += 'b' + elif strands[0] == '-': + if input[i].split(",")[1][2] == 'b': + new_strands += 'b' + elif input[i].split(",")[1][2] == '+': + new_strands += 'b' + elif input[i].split(",")[1][2] == '-': + new_strands += '-' + else: + if len(input[i].split(",")[1]) == 3: + new_strands += input[i].split(",")[1][2] + strands = new_strands + i +=1 + if c2 != 0: + ratio = c2/c1 + else: + ratio = 0 + mt_freq_t1 = c1/total + mt_freq_t2 = c2/total + pop_freq_t1 = 1 - mt_freq_t1 + pop_freq_t2 = 1 - mt_freq_t2 + w = 0 + if mt_freq_t2 != 0: + top_w = math.log(mt_freq_t2*(d/mt_freq_t1)) + bot_w = math.log(pop_freq_t2*(d/pop_freq_t1)) + w = top_w/bot_w + row = [position, strands, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, d, w, w] + results.append(row) +with open(arguments.outfile, "wb") as csvfile: + writer = csv.writer(csvfile) + writer.writerows(results) + + + + + + + + + + +##### REDOING NORMALIZATION ##### + +# The header below is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site. + +if (arguments.wig): + wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n" + +if (arguments.normalize): + with open(arguments.normalize) as file: + transposon_genes = file.read().splitlines() + print "Normalize genes loaded" + "\n" + blank_ws = 0 + sum = 0 + count = 0 + weights = [] + scores = [] + for list in results: + if list[9] != '' and list[9] in transposon_genes and list[11]: + c1 = list[2] + c2 = list[3] + score = list[11] + avg = (c1 + c2)/2 + +# Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers. + + if float(c1) >= float(arguments.cutoff2): + +# Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization. + + if (avg >= float(arguments.max_weight)): + avg = float(arguments.max_weight) + +# Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments. +# For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance! + + if score == 0: + blank_ws += 1 + sum += score + count += 1 + weights.append(avg) + scores.append(score) + + print str(list[9]) + " " + str(score) + " " + str(c1) + +# Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2. + + blank_count = 0 + original_count = len(scores) + i = 0 + while i < original_count: + w_value = scores[i] + if w_value == 0: + blank_count += 1 + weights.pop[i] + scores.pop[i] + i-=1 + i += 1 + +# If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2. + + if len(scores) == 0: + print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n" + quit() + + pc_blank_normals = float(blank_count) / float(original_count) + print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n" + with open(arguments.outfile2, "w") as f: + f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname) + + average = sum / count + i = 0 + weighted_sum = 0 + weight_sum = 0 + while i < len(weights): + weighted_sum += weights[i]*scores[i] + weight_sum += weights[i] + i += 1 + weighted_average = weighted_sum/weight_sum + + print "Normalization step:" + "\n" + print "Regular average: " + str(average) + "\n" + print "Weighted Average: " + str(weighted_average) + "\n" + print "Total Insertions: " + str(count) + "\n" + + old_ws = 0 + new_ws = 0 + wcount = 0 + for list in results: + if list[11] == 'W': + continue + new_w = float(list[11])/weighted_average + +# Sometimes you want to multiply all the fitness values by a constant; this does that. +# For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. + + if arguments.multiply: + new_w *= float(arguments.multiply) + + if float(list[11]) > 0: + old_ws += float(list[11]) + new_ws += new_w + wcount += 1 + + list[12] = new_w + + if (arguments.wig): + wigstring += str(list[0]) + " " + str(new_w) + "\n" + + old_w_mean = old_ws / wcount + new_w_mean = new_ws / wcount + print "Old W Average: " + str(old_w_mean) + "\n" + print "New W Average: " + str(new_w_mean) + "\n" + +with open(arguments.outfile, "wb") as csvfile: + writer = csv.writer(csvfile) + writer.writerows(results) + +if (arguments.wig): + if (arguments.normalize): + with open(arguments.wig, "wb") as wigfile: + wigfile.write(wigstring) + else: + for list in results: + wigstring += str(list[0]) + " " + str(list[11]) + "\n" + with open(arguments.wig, "wb") as wigfile: + wigfile.write(wigstring) + + +# ___ ___ ___ ___ ___ ___ ___ ___ +# /\__\ /\ \ /\__\ /\__\ /\ \ /\ \ /\ \ /\__\ +# /:/ _/_ /::\ \ |::L__L /::L_L_ /::\ \ /::\ \ /::\ \ |::L__L +# /::-"\__\ /::\:\__\ |:::\__\ /:/L:\__\ /:/\:\__\ /:/\:\__\ /:/\:\__\ |:::\__\ +# \;:;-",-" \/\::/ / /:;;/__/ \/_/:/ / \:\ \/__/ \:\ \/__/ \:\/:/ / /:;;/__/ +# |:| | /:/ / \/__/ /:/ / \:\__\ \:\__\ \::/ / \/__/ +# \|__| \/__/ \/__/ \/__/ \/__/ \/__/ \ No newline at end of file