Mercurial > repos > kaymccoy > consolidate_fitnesses

--- a/consol_fit.py	Thu Aug 11 18:07:53 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,300 +0,0 @@
-# Consol_fit! It's a script & it'll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That's all.
-
-import math
-import csv
-
-
-
-
-
-
-
-
-
-
-##### ARGUMENTS #####
-
-def print_usage():
-	print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n"
-	print "\033[1m" + "Required" + "\033[0m" + "\n"
-	print "-i" + "\t\t" + "The calc_fit file to be consolidated" + "\n"
-	print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n"
-	print "-out2" + "\t\t" + "Name of a file to put the percent blank score in (used in aggregate)." + "\n"
-	print "-calctxt" + "\t\t" + "The txt file output from calc_fit" + "\n"
-	print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n"
-	print "\n"
-	print "\033[1m" + "Optional" + "\033[0m" + "\n"
-	print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
-	print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
-	print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n"
-	print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n"
-	print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n"
-	print "\n"
-
-import argparse
-parser = argparse.ArgumentParser()
-parser.add_argument("-calctxt", action="store", dest="calctxt")
-parser.add_argument("-normalize", action="store", dest="normalize")
-parser.add_argument("-i", action="store", dest="input")
-parser.add_argument("-out", action="store", dest="outfile")
-parser.add_argument("-out2", action="store", dest="outfile2")
-parser.add_argument("-cutoff", action="store", dest="cutoff")
-parser.add_argument("-cutoff2", action="store", dest="cutoff2")
-parser.add_argument("-wig", action="store", dest="wig")
-parser.add_argument("-maxweight", action="store", dest="max_weight")
-parser.add_argument("-multiply", action="store", dest="multiply")
-arguments = parser.parse_args()
-
-if (not arguments.input or not arguments.outfile or not arguments.calctxt):
-	print_usage()
-	quit()
-
-if (not arguments.max_weight):
-	arguments.max_weight = 75
-
-if (not arguments.cutoff):
-	arguments.cutoff = 0
-
-# Cutoff2 only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.
-
-if (not arguments.cutoff2):
-	arguments.cutoff2 = 10
-
-#Gets total & refname from calc_fit outfile2
-
-with open(arguments.calctxt) as file:
-	calctxt = file.readlines()
-total = float(calctxt[1].split()[1])
-refname = calctxt[2].split()[1]
-
-
-
-
-
-
-
-
-
-
-##### CONSOLIDATING THE CALC_FIT FILE #####
-
-with open(arguments.input) as file:
-	input = file.readlines()
-results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]
-i = 1
-d = float(input[i].split(",")[10])
-while i < len(input):
-	position = float(input[i].split(",")[0])
-	strands = input[i].split(",")[1]
-	c1 = float(input[i].split(",")[2])
-	c2 = float(input[i].split(",")[3])
-	gene = input[i].split(",")[9]
-	while i + 1 < len(input) and float(input[i+1].split(",")[0]) - position <= 4:
-		if i + 1 < len(input):
-			i += 1
-			c1 += float(input[i].split(",")[2])
-			c2 += float(input[i].split(",")[3])
-			strands = input[i].split(",")[1]
-			if strands[0] == 'b':
-				new_strands = 'b/'
-			elif strands[0] == '+':
-				if input[i].split(",")[1][0] == 'b':
-					new_strands = 'b/'
-				elif input[i].split(",")[1][0] == '+':
-					new_strands = '+/'
-				elif input[i].split(",")[1][0] == '-':
-					new_strands = 'b/'
-			elif strands[0] == '-':
-				if input[i].split(",")[1][0] == 'b':
-					new_strands = 'b/'
-				elif input[i].split(",")[1][0] == '+':
-					new_strands = 'b/'
-				elif input[i].split(",")[1][0] == '-':
-					new_strands = '-/'
-			if len(strands) == 3:
-				if len(input[i].split(",")[1]) < 3:
-					new_strands += strands[2]
-				elif strands[0] == 'b':
-					new_strands += 'b'
-				elif strands[0] == '+':
-					if input[i].split(",")[1][2] == 'b':
-						new_strands += 'b'
-					elif input[i].split(",")[1][2] == '+':
-						new_strands += '+'
-					elif input[i].split(",")[1][2] == '-':
-						new_strands += 'b'
-				elif strands[0] == '-':
-					if input[i].split(",")[1][2] == 'b':
-						new_strands += 'b'
-					elif input[i].split(",")[1][2] == '+':
-						new_strands += 'b'
-					elif input[i].split(",")[1][2] == '-':
-						new_strands += '-'
-			else:
-				if len(input[i].split(",")[1]) == 3:
-						new_strands += input[i].split(",")[1][2]
-			strands = new_strands
-	i +=1
-	if c2 != 0:
-		ratio = c2/c1
-	else:
-		ratio = 0
-	mt_freq_t1 = c1/total
-	mt_freq_t2 = c2/total
-	pop_freq_t1 = 1 - mt_freq_t1
-	pop_freq_t2 = 1 - mt_freq_t2
-	w = 0
-	if mt_freq_t2 != 0:
-		top_w = math.log(mt_freq_t2*(d/mt_freq_t1))
-		bot_w = math.log(pop_freq_t2*(d/pop_freq_t1))
-		w = top_w/bot_w
-	row = [position, strands, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, d, w, w]
-	results.append(row)
-with open(arguments.outfile, "wb") as csvfile:
-    writer = csv.writer(csvfile)
-    writer.writerows(results)
-
-
-
-
-
-
-
-
-
-
-##### REDOING NORMALIZATION #####
-
-# The header below is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site.
-
-if (arguments.wig):
-	wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n"
-
-if (arguments.normalize):
-	with open(arguments.normalize) as file:
-		transposon_genes = file.read().splitlines()
-	print "Normalize genes loaded" + "\n"
-	blank_ws = 0
-	sum = 0
-	count = 0
-	weights = []
-	scores = []
-	for list in results:
-		if list[9] != '' and list[9] in transposon_genes and list[11]:
-			c1 = list[2]
-			c2 = list[3]
-			score = list[11]
-			avg = (c1 + c2)/2
-
-# Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers.
-
-			if float(c1) >= float(arguments.cutoff2):
-
-# Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization.
-
-				if (avg >= float(arguments.max_weight)):
-					avg = float(arguments.max_weight)
-
-# Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments.
-# For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance!
-
-				if score == 0:
-					blank_ws += 1
-				sum += score
-				count += 1
-				weights.append(avg)
-				scores.append(score)
-
-				print str(list[9]) + " " + str(score) + " " + str(c1)
-
-# Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2.
-
-	blank_count = 0
-	original_count = len(scores)
-	i = 0
-	while i < original_count:
-		w_value = scores[i]
-		if w_value == 0:
-			blank_count += 1
-			weights.pop[i]
-			scores.pop[i]
-			i-=1
-		i += 1
-
-# If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2.
-
-	if len(scores) == 0:
-		print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n"
-		quit()
-
-	pc_blank_normals = float(blank_count) / float(original_count)
-	print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n"
-	with open(arguments.outfile2, "w") as f:
-		f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)
-
-	average = sum / count
-	i = 0
-	weighted_sum = 0
-	weight_sum = 0
-	while i < len(weights):
-		weighted_sum += weights[i]*scores[i]
-		weight_sum += weights[i]
-		i += 1
-	weighted_average = weighted_sum/weight_sum
-
-	print "Normalization step:" + "\n"
-	print "Regular average: " + str(average) + "\n"
-	print "Weighted Average: " + str(weighted_average) + "\n"
-	print "Total Insertions: " + str(count) + "\n"
-
-	old_ws = 0
-	new_ws = 0
-	wcount = 0
-	for list in results:
-		if list[11] == 'W':
-			continue
-		new_w = float(list[11])/weighted_average
-
-# Sometimes you want to multiply all the fitness values by a constant; this does that.
-# For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share.
-
-		if arguments.multiply:
-			new_w *= float(arguments.multiply)
-
-		if float(list[11]) > 0:
-			old_ws += float(list[11])
-			new_ws += new_w
-			wcount += 1
-
-		list[12] = new_w
-
-		if (arguments.wig):
-			wigstring += str(list[0]) + " " + str(new_w) + "\n"
-
-	old_w_mean = old_ws / wcount
-	new_w_mean = new_ws / wcount
-	print "Old W Average: " + str(old_w_mean) + "\n"
-	print "New W Average: " + str(new_w_mean) + "\n"
-
-with open(arguments.outfile, "wb") as csvfile:
-    writer = csv.writer(csvfile)
-    writer.writerows(results)
-
-if (arguments.wig):
-	if (arguments.normalize):
-		with open(arguments.wig, "wb") as wigfile:
-			wigfile.write(wigstring)
-	else:
-		for list in results:
-			wigstring += str(list[0]) + " " + str(list[11]) + "\n"
-		with open(arguments.wig, "wb") as wigfile:
-				wigfile.write(wigstring)
-
-
-#   ___       ___       ___            ___       ___       ___       ___       ___
-#   /\__\     /\  \     /\__\          /\__\     /\  \     /\  \     /\  \     /\__\
-#  /:/ _/_   /::\  \   |::L__L        /::L_L_   /::\  \   /::\  \   /::\  \   |::L__L
-# /::-"\__\ /::\:\__\  |:::\__\      /:/L:\__\ /:/\:\__\ /:/\:\__\ /:/\:\__\  |:::\__\
-# \;:;-",-" \/\::/  /  /:;;/__/      \/_/:/  / \:\ \/__/ \:\ \/__/ \:\/:/  /  /:;;/__/
-#  |:|  |     /:/  /   \/__/           /:/  /   \:\__\    \:\__\    \::/  /   \/__/
-#   \|__|     \/__/                    \/__/     \/__/     \/__/     \/__/
\ No newline at end of file
--- a/consol_fit.xml	Thu Aug 11 18:07:53 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-<tool id="consolidate_fitnesses" name="Consolidate Fitnesses">
-  <description>of transposon insertion locations</description>
-  <requirements>
-    <requirement type="package" version="1.64">biopython</requirement>
-  </requirements>
-  <command interpreter="python">
-    consol_fit.py
-    -i $input
-    -calctxt $calctxt
-    -out $output
-    -out2 $output2
-    -maxweight $maxweight
-    -cutoff $cutoff
-    -cutoff2 $cutoff2
-    -wig $output3
-    #if $normalization.calculations  == "yes":
-      -normalize $normalization.genes
-    #end if
-    #if $multiply.choice  == "yes":
-      -multiply $multiply.factor
-    #end if
-    #if $reads.downstream  == "yes":
-      -d 1
-    #end if
-  </command>
-  <inputs>
-    <param name="input" type="data" label="Fitness file"/>
-    <param name="calctxt" type="data" label="the txt file output from calc_fitness"/>
-    <conditional name="normalization">
-      <param name="calculations" type="select" label="Normalize fitness calculations?">
-        <option value="no">No</option>
-        <option value="yes">Yes</option>
-      </param>
-      <when value="no">
-        <!-- do nothing -->
-      </when>
-      <when value="yes">
-        <param name="genes" type="data" label="Genes to normalize by" />
-      </when>
-    </conditional>
-    <param name="cutoff" type="float" value="0.0" label="Cutoff"/>
-    <param name="cutoff2" type="float" value="0.0" label="Cutoff2"/>
-    <param name="maxweight" type="float" value="75" label="Maximum weight of a transposon gene in normalization calculations"/>
-    <conditional name="multiply">
-      <param name="choice" type="select" label="Multiply fitness scores by a certain value?">
-        <option value="no">No</option>
-        <option value="yes">Yes</option>
-      </param>
-      <when value="no">
-        <!-- do nothing -->
-      </when>
-      <when value="yes">
-        <param name="factor" type="float" value="0.0" label="Multiply by" />
-      </when>
-    </conditional>
-    <conditional name="reads">
-      <param name="downstream" type="select" label="Are all reads downstream of the transposon?">
-        <option value="no">No</option>
-        <option value="yes">Yes</option>
-      </param>
-      <when value="no">
-        <!-- do nothing -->
-      </when>
-      <when value="yes">
-        <!-- do nothing -->
-      </when>
-    </conditional>
-  </inputs>
-    <outputs>
-        <data format="csv" name="output" />
-        <data format="txt" name="output2" />
-        <data format="wig" name="output3" />
-    </outputs>
-  <help>
-
-**What it does**
-
-This tool consolidates the fitness values of transposon insertion mutations generated by Tn-Seq, and is mostly useful for consolidating values produced by a 4-cycle looping trimming pipeline.
-
-</help>
-</tool>
\ No newline at end of file