Mercurial > repos > kaymccoy > consolidate_fitnesses
changeset 6:b2943ce019bb draft
Deleted selected files
author | kaymccoy |
---|---|
date | Fri, 12 Aug 2016 16:44:09 -0400 |
parents | deb5134655cb |
children | 4b2fc35b32b0 |
files | consol_fit.py consol_fit.xml |
diffstat | 2 files changed, 0 insertions(+), 425 deletions(-) [+] |
line wrap: on
line diff
--- a/consol_fit.py Thu Aug 11 18:34:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,344 +0,0 @@ -# Consol_fit! It's a script & it'll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That's it! - -# Test: python ../script/consol_fit.py -calctxt results/py_2_L3_2394eVI_Gluc.txt -wig gview/consol_L3_2394eVI_Gluc.wig -i results/py_L3_2394eVI_Gluc.csv -out results/consol_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.csv -normalize tigr4_normal.txt - -# Test: python ../script/consol_fit.py -calctxt results/py_2_L3_2394eVI_Gluc.txt -wig gview/consol_L3_2394eVI_Gluc.wig -i results/galaxy_test.csv -out results/consol_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.csv -normalize tigr4_normal.txt - -import math -import csv - - - - - - - - - - -##### ARGUMENTS ##### - -def print_usage(): - print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n" - print "\033[1m" + "Required" + "\033[0m" + "\n" - print "-i" + "\t\t" + "The calc_fit file to be consolidated" + "\n" - print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n" - print "-out2" + "\t\t" + "Name of a file to put the percent blank score in (used in aggregate)." + "\n" - print "-calctxt" + "\t\t" + "The txt file output from calc_fit" + "\n" - print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n" - print "\n" - print "\033[1m" + "Optional" + "\033[0m" + "\n" - print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n" - print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n" - print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n" - print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n" - print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n" - print "\n" - -import argparse -parser = argparse.ArgumentParser() -parser.add_argument("-calctxt", action="store", dest="calctxt") -parser.add_argument("-normalize", action="store", dest="normalize") -parser.add_argument("-i", action="store", dest="input") -parser.add_argument("-out", action="store", dest="outfile") -parser.add_argument("-out2", action="store", dest="outfile2") -parser.add_argument("-cutoff", action="store", dest="cutoff") -parser.add_argument("-cutoff2", action="store", dest="cutoff2") -parser.add_argument("-wig", action="store", dest="wig") -parser.add_argument("-maxweight", action="store", dest="max_weight") -parser.add_argument("-multiply", action="store", dest="multiply") -arguments = parser.parse_args() - -# Checks that all the required arguments have actually been entered - -if (not arguments.input or not arguments.outfile or not arguments.calctxt): - print_usage() - quit() - -# - -if (not arguments.max_weight): - arguments.max_weight = 75 - -# - -if (not arguments.cutoff): - arguments.cutoff = 0 - -# Sets the default value of cutoff2 to 10; cutoff2 exists to discard positions within normalization genes with a low number of counted transcripts, because fitnesses calculated from them similarly may not be very accurate. -# This only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff. - -if (not arguments.cutoff2): - arguments.cutoff2 = 10 - -#Gets total & refname from calc_fit outfile2 - -with open(arguments.calctxt) as file: - calctxt = file.readlines() -total = float(calctxt[1].split()[1]) -refname = calctxt[2].split()[1] - - - - - - - - - - -##### CONSOLIDATING THE CALC_FIT FILE ##### - -with open(arguments.input) as file: - input = file.readlines() -results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]] -i = 1 -d = float(input[i].split(",")[10]) -while i < len(input): - position = float(input[i].split(",")[0]) - strands = input[i].split(",")[1] - c1 = float(input[i].split(",")[2]) - c2 = float(input[i].split(",")[3]) - gene = input[i].split(",")[9] - while i + 1 < len(input) and float(input[i+1].split(",")[0]) - position <= 4: - if i + 1 < len(input): - i += 1 - c1 += float(input[i].split(",")[2]) - c2 += float(input[i].split(",")[3]) - strands = input[i].split(",")[1] - if strands[0] == 'b': - new_strands = 'b/' - elif strands[0] == '+': - if input[i].split(",")[1][0] == 'b': - new_strands = 'b/' - elif input[i].split(",")[1][0] == '+': - new_strands = '+/' - elif input[i].split(",")[1][0] == '-': - new_strands = 'b/' - elif strands[0] == '-': - if input[i].split(",")[1][0] == 'b': - new_strands = 'b/' - elif input[i].split(",")[1][0] == '+': - new_strands = 'b/' - elif input[i].split(",")[1][0] == '-': - new_strands = '-/' - if len(strands) == 3: - if len(input[i].split(",")[1]) < 3: - new_strands += strands[2] - elif strands[0] == 'b': - new_strands += 'b' - elif strands[0] == '+': - if input[i].split(",")[1][2] == 'b': - new_strands += 'b' - elif input[i].split(",")[1][2] == '+': - new_strands += '+' - elif input[i].split(",")[1][2] == '-': - new_strands += 'b' - elif strands[0] == '-': - if input[i].split(",")[1][2] == 'b': - new_strands += 'b' - elif input[i].split(",")[1][2] == '+': - new_strands += 'b' - elif input[i].split(",")[1][2] == '-': - new_strands += '-' - else: - if len(input[i].split(",")[1]) == 3: - new_strands += input[i].split(",")[1][2] - strands = new_strands - i +=1 - if c2 != 0: - ratio = c2/c1 - else: - ratio = 0 - mt_freq_t1 = c1/total - mt_freq_t2 = c2/total - pop_freq_t1 = 1 - mt_freq_t1 - pop_freq_t2 = 1 - mt_freq_t2 - w = 0 - if mt_freq_t2 != 0: - top_w = math.log(mt_freq_t2*(d/mt_freq_t1)) - bot_w = math.log(pop_freq_t2*(d/pop_freq_t1)) - w = top_w/bot_w - row = [position, strands, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, d, w, w] - results.append(row) -with open(arguments.outfile, "wb") as csvfile: - writer = csv.writer(csvfile) - writer.writerows(results) - - - - - - - - - - -##### REDOING NORMALIZATION ##### - -# If making a WIG file is requested in the arguments, starts a string to be added to and then written to the WIG file with a typical WIG file header. -# The header is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site. - -if (arguments.wig): - wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n" - -# If a file's given for normalization, starts normalization; this corrects for anything that would cause all the fitness values to be too high or too low. - -if (arguments.normalize): - -# Makes a list of the genes in the normalization file, which should all be transposon genes (these naturally ought to have a fitness value of exactly 1, because transposons are generally non-coding DNA) - - with open(arguments.normalize) as file: - transposon_genes = file.read().splitlines() - print "Normalize genes loaded" + "\n" - blank_ws = 0 - sum = 0 - count = 0 - weights = [] - scores = [] - for list in results: - -# Finds all insertions within one of the normalization genes that also have a w value; gets their c1 and c2 values (the number of insertions at t1 and t2) and takes the average of that! -# The average is later used as the "weight" of an insertion location's fitness - if it's had more insertions, it should weigh proportionally more towards the average fitness of insertions within the normalization genes. - - if list[9] != '' and list[9] in transposon_genes and list[11]: - c1 = list[2] - c2 = list[3] - score = list[11] - avg = (c1 + c2)/2 - -# Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers. - - if float(c1) >= float(arguments.cutoff2): - -# Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization. - - if (avg >= float(arguments.max_weight)): - avg = float(arguments.max_weight) - -# Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments. -# For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter -# of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance! - - if score == 0: - blank_ws += 1 - -# Adds the fitness values of the insertions within normalization genes together and increments count so their average fitness (sum/count) can be calculated later on - - sum += score - count += 1 - -# Records the weights of the fitness values of the insertion locations in corresponding lists - for example, weights[2] would be the weight of the fitness value at score[2] - - weights.append(avg) - scores.append(score) - - print str(list[9]) + " " + str(score) + " " + str(c1) - -# Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2. - - blank_count = 0 - original_count = len(scores) - i = 0 - while i < original_count: - w_value = scores[i] - if w_value == 0: - blank_count += 1 - weights.pop[i] - scores.pop[i] - i-=1 - i += 1 - -# If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2. - - if len(scores) == 0: - print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n" - quit() - -# Prints the number of of blank fitness values found and removed for reference. Writes the percentage to a file so it can be referenced for aggregate analysis. - - pc_blank_normals = float(blank_count) / float(original_count) - print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n" - with open(arguments.outfile2, "w") as f: - f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname) - - -# Finds "average" - the average fitness value for an insertion within the transposon genes - and "weighted_average" - the average fitness value for an insertion within the transposon genes weighted by how many insertions each had. - - average = sum / count - i = 0 - weighted_sum = 0 - weight_sum = 0 - while i < len(weights): - weighted_sum += weights[i]*scores[i] - weight_sum += weights[i] - i += 1 - weighted_average = weighted_sum/weight_sum - -# Prints the regular average, weighted average, and total insertions for reference - - print "Normalization step:" + "\n" - print "Regular average: " + str(average) + "\n" - print "Weighted Average: " + str(weighted_average) + "\n" - print "Total Insertions: " + str(count) + "\n" - -# The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1. -# For example, if the average fitness for genes was too low overall - let's say 0.97 within the normalization geness - every fitness would be proportionally raised. - - old_ws = 0 - new_ws = 0 - wcount = 0 - for list in results: - if list[11] == 'W': - continue - new_w = float(list[11])/weighted_average - -# Sometimes you want to multiply all the fitness values by a constant; this does that. -# For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. This is -# because independent mutations should have a fitness value that's equal to their individual fitness values multipled, but related mutations will deviate from that; to find those deviations you'd multiply -# all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library! - - if arguments.multiply: - new_w *= float(arguments.multiply) - -# Records the old w score for reference, and adds it to a total sum of all w scores (so that the old w mean and new w mean can be printed later). - - if float(list[11]) > 0: - old_ws += float(list[11]) - new_ws += new_w - wcount += 1 - -# Writes the new w score into the results list of lists. - - list[12] = new_w - -# Adds a line to wiglist for each insertion position, with the insertion position and its new w value. - - if (arguments.wig): - wigstring += str(list[0]) + " " + str(new_w) + "\n" - -# Prints the old w mean and new w mean for reference. - - old_w_mean = old_ws / wcount - new_w_mean = new_ws / wcount - print "Old W Average: " + str(old_w_mean) + "\n" - print "New W Average: " + str(new_w_mean) + "\n" - -# Overwrites the old file with the normalized file. - -with open(arguments.outfile, "wb") as csvfile: - writer = csv.writer(csvfile) - writer.writerows(results) - -# If a WIG file was requested, actually creates the WIG file and writes wiglist to it -# So what's written here is the WIG header plus each insertion position and it's new w value if normalization were called for, and each insertion position and its unnormalized w value if normalization were not called for. - -if (arguments.wig): - if (arguments.normalize): - with open(arguments.wig, "wb") as wigfile: - wigfile.write(wigstring) - else: - for list in results: - wigstring += str(list[0]) + " " + str(list[11]) + "\n" - with open(arguments.wig, "wb") as wigfile: - wigfile.write(wigstring) \ No newline at end of file
--- a/consol_fit.xml Thu Aug 11 18:34:10 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,81 +0,0 @@ -<tool id="consolidate_fitnesses" name="Consolidate Fitnesses"> - <description>of transposon insertion locations</description> - <requirements> - <requirement type="package" version="1.64">biopython</requirement> - </requirements> - <command interpreter="python"> - consol_fit.py - -i $input - -calctxt $calctxt - -out $output - -out2 $output2 - -maxweight $maxweight - -cutoff $cutoff - -cutoff2 $cutoff2 - -wig $output3 - #if $normalization.calculations == "yes": - -normalize $normalization.genes - #end if - #if $multiply.choice == "yes": - -multiply $multiply.factor - #end if - #if $reads.downstream == "yes": - -d 1 - #end if - </command> - <inputs> - <param name="input" type="data" label="Fitness file"/> - <param name="calctxt" type="data" label="the txt file output from calc_fitness"/> - <conditional name="normalization"> - <param name="calculations" type="select" label="Normalize fitness calculations?"> - <option value="no">No</option> - <option value="yes">Yes</option> - </param> - <when value="no"> - <!-- do nothing --> - </when> - <when value="yes"> - <param name="genes" type="data" label="Genes to normalize by" /> - </when> - </conditional> - <param name="cutoff" type="float" value="0.0" label="Cutoff"/> - <param name="cutoff2" type="float" value="0.0" label="Cutoff2"/> - <param name="maxweight" type="float" value="75" label="Maximum weight of a transposon gene in normalization calculations"/> - <conditional name="multiply"> - <param name="choice" type="select" label="Multiply fitness scores by a certain value?"> - <option value="no">No</option> - <option value="yes">Yes</option> - </param> - <when value="no"> - <!-- do nothing --> - </when> - <when value="yes"> - <param name="factor" type="float" value="0.0" label="Multiply by" /> - </when> - </conditional> - <conditional name="reads"> - <param name="downstream" type="select" label="Are all reads downstream of the transposon?"> - <option value="no">No</option> - <option value="yes">Yes</option> - </param> - <when value="no"> - <!-- do nothing --> - </when> - <when value="yes"> - <!-- do nothing --> - </when> - </conditional> - </inputs> - <outputs> - <data format="csv" name="output" /> - <data format="txt" name="output2" /> - <data format="wig" name="output3" /> - </outputs> - <help> - -**What it does** - -This tool consolidates the fitness values of transposon insertion mutations generated by Tn-Seq, and is mostly useful for consolidating values produced by a 4-cycle looping trimming pipeline. - -</help> -</tool> \ No newline at end of file