view consol_fit.py @ 4:7d2f2d1a23ee draft

Uploaded
author kaymccoy
date Thu, 11 Aug 2016 18:33:54 -0400
parents
children
line wrap: on
line source

# Consol_fit! It's a script & it'll consolidate your fitness values if you got them from a looping trimming pipeline instead of the standard split-by-transposon pipeline. That's it!

# Test: python ../script/consol_fit.py -calctxt results/py_2_L3_2394eVI_Gluc.txt -wig gview/consol_L3_2394eVI_Gluc.wig -i results/py_L3_2394eVI_Gluc.csv -out results/consol_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.csv -normalize tigr4_normal.txt

# Test: python ../script/consol_fit.py -calctxt results/py_2_L3_2394eVI_Gluc.txt -wig gview/consol_L3_2394eVI_Gluc.wig -i results/galaxy_test.csv -out results/consol_L3_2394eVI_Gluc.csv -out2 results/py_2_L3_2394eVI_Gluc.csv -normalize tigr4_normal.txt

import math
import csv










##### ARGUMENTS #####

def print_usage():
	print "\n" + "You are missing one or more required flags. A complete list of flags accepted by calc_fitness is as follows:" + "\n\n"
	print "\033[1m" + "Required" + "\033[0m" + "\n"
	print "-i" + "\t\t" + "The calc_fit file to be consolidated" + "\n"
	print "-out" + "\t\t" + "Name of a file to enter the .csv output." + "\n"
	print "-out2" + "\t\t" + "Name of a file to put the percent blank score in (used in aggregate)." + "\n"
	print "-calctxt" + "\t\t" + "The txt file output from calc_fit" + "\n"
	print "-normalize" + "\t" + "A file that contains a list of genes that should have a fitness of 1" + "\n"
	print "\n"
	print "\033[1m" + "Optional" + "\033[0m" + "\n"
	print "-cutoff" + "\t\t" + "Discard any positions where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
	print "-cutoff2" + "\t\t" + "Discard any positions within the normalization genes where the average of counted transcripts at time 0 and time 1 is below this number (default 0)" + "\n"
	print "-wig" + "\t\t" + "Create a wiggle file for viewing in a genome browser. Provide a filename." + "\n"
	print "-maxweight" + "\t" + "The maximum weight a transposon gene can have in normalization calculations" + "\n"
	print "-multiply" + "\t" + "Multiply all fitness scores by a certain value (e.g., the fitness of a knockout). You should normalize the data." + "\n"
	print "\n"

import argparse 
parser = argparse.ArgumentParser()
parser.add_argument("-calctxt", action="store", dest="calctxt")
parser.add_argument("-normalize", action="store", dest="normalize")
parser.add_argument("-i", action="store", dest="input")
parser.add_argument("-out", action="store", dest="outfile")
parser.add_argument("-out2", action="store", dest="outfile2")
parser.add_argument("-cutoff", action="store", dest="cutoff")
parser.add_argument("-cutoff2", action="store", dest="cutoff2")
parser.add_argument("-wig", action="store", dest="wig")
parser.add_argument("-maxweight", action="store", dest="max_weight")
parser.add_argument("-multiply", action="store", dest="multiply")
arguments = parser.parse_args()

# Checks that all the required arguments have actually been entered

if (not arguments.input or not arguments.outfile or not arguments.calctxt):
	print_usage() 
	quit()

#

if (not arguments.max_weight):
	arguments.max_weight = 75

#

if (not arguments.cutoff):
	arguments.cutoff = 0
	
# Sets the default value of cutoff2 to 10; cutoff2 exists to discard positions within normalization genes with a low number of counted transcripts, because fitnesses calculated from them similarly may not be very accurate.
# This only has an effect if it's larger than cutoff, since the normalization step references a list of insertions already affected by cutoff.
	
if (not arguments.cutoff2):
	arguments.cutoff2 = 10

#Gets total & refname from calc_fit outfile2

with open(arguments.calctxt) as file:
	calctxt = file.readlines()
total = float(calctxt[1].split()[1])
refname = calctxt[2].split()[1]









	
##### CONSOLIDATING THE CALC_FIT FILE #####

with open(arguments.input) as file:
	input = file.readlines()
results = [["position", "strand", "count_1", "count_2", "ratio", "mt_freq_t1", "mt_freq_t2", "pop_freq_t1", "pop_freq_t2", "gene", "D", "W", "nW"]]
i = 1
d = float(input[i].split(",")[10])
while i < len(input):
	position = float(input[i].split(",")[0])
	strands = input[i].split(",")[1]
	c1 = float(input[i].split(",")[2])
	c2 = float(input[i].split(",")[3])
	gene = input[i].split(",")[9]
	while i + 1 < len(input) and float(input[i+1].split(",")[0]) - position <= 4:
		if i + 1 < len(input):
			i += 1
			c1 += float(input[i].split(",")[2])
			c2 += float(input[i].split(",")[3])
			strands = input[i].split(",")[1]
			if strands[0] == 'b':
				new_strands = 'b/'
			elif strands[0] == '+':
				if input[i].split(",")[1][0] == 'b':
					new_strands = 'b/'
				elif input[i].split(",")[1][0] == '+':
					new_strands = '+/'
				elif input[i].split(",")[1][0] == '-':
					new_strands = 'b/'
			elif strands[0] == '-':
				if input[i].split(",")[1][0] == 'b':
					new_strands = 'b/'
				elif input[i].split(",")[1][0] == '+':
					new_strands = 'b/'
				elif input[i].split(",")[1][0] == '-':
					new_strands = '-/'
			if len(strands) == 3:
				if len(input[i].split(",")[1]) < 3:
					new_strands += strands[2]
				elif strands[0] == 'b':
					new_strands += 'b'
				elif strands[0] == '+':
					if input[i].split(",")[1][2] == 'b':
						new_strands += 'b'
					elif input[i].split(",")[1][2] == '+':
						new_strands += '+'
					elif input[i].split(",")[1][2] == '-':
						new_strands += 'b'
				elif strands[0] == '-':
					if input[i].split(",")[1][2] == 'b':
						new_strands += 'b'
					elif input[i].split(",")[1][2] == '+':
						new_strands += 'b'
					elif input[i].split(",")[1][2] == '-':
						new_strands += '-'
			else:
				if len(input[i].split(",")[1]) == 3:
						new_strands += input[i].split(",")[1][2]
			strands = new_strands		
	i +=1
	if c2 != 0:
		ratio = c2/c1
	else:
		ratio = 0
	mt_freq_t1 = c1/total
	mt_freq_t2 = c2/total
	pop_freq_t1 = 1 - mt_freq_t1
	pop_freq_t2 = 1 - mt_freq_t2
	w = 0
	if mt_freq_t2 != 0:
		top_w = math.log(mt_freq_t2*(d/mt_freq_t1))
		bot_w = math.log(pop_freq_t2*(d/pop_freq_t1))
		w = top_w/bot_w
	row = [position, strands, c1, c2, ratio, mt_freq_t1, mt_freq_t2, pop_freq_t1, pop_freq_t2, gene, d, w, w]
	results.append(row)
with open(arguments.outfile, "wb") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(results)	










##### REDOING NORMALIZATION #####

# If making a WIG file is requested in the arguments, starts a string to be added to and then written to the WIG file with a typical WIG file header.
# The header is just in a typical WIG file format; if you'd like to look into this more UCSC has notes on formatting WIG files on their site.

if (arguments.wig):
	wigstring = "track type=wiggle_0 name=" + arguments.wig + "\n" + "variableStep chrom=" + refname + "\n"

# If a file's given for normalization, starts normalization; this corrects for anything that would cause all the fitness values to be too high or too low.

if (arguments.normalize):

# Makes a list of the genes in the normalization file, which should all be transposon genes (these naturally ought to have a fitness value of exactly 1, because transposons are generally non-coding DNA)

	with open(arguments.normalize) as file:
		transposon_genes = file.read().splitlines()
	print "Normalize genes loaded" + "\n" 
	blank_ws = 0
	sum = 0
	count = 0
	weights = []
	scores = []
	for list in results:
	
# Finds all insertions within one of the normalization genes that also have a w value; gets their c1 and c2 values (the number of insertions at t1 and t2) and takes the average of that!
# The average is later used as the "weight" of an insertion location's fitness - if it's had more insertions, it should weigh proportionally more towards the average fitness of insertions within the normalization genes.

		if list[9] != '' and list[9] in transposon_genes and list[11]:
			c1 = list[2]
			c2 = list[3]
			score = list[11]
			avg = (c1 + c2)/2
			
# Skips over those insertion locations with too few insertions - their fitness values are less accurate because they're based on such small insertion numbers.
			
			if float(c1) >= float(arguments.cutoff2):
			
# Sets a max weight, to prevent insertion location scores with huge weights from unbalancing the normalization.
			
				if (avg >= float(arguments.max_weight)):
					avg = float(arguments.max_weight)
                
# Tallies how many w values are 0 within the blank_ws value; you might get many transposon genes with a w value of 0 if a bottleneck occurs, which is especially common with in vivo experiments. 
# For example, when studying a nasal infection in a mouse model, what bacteria "sticks" and is able to survive and what bacteria is swallowed and killed or otherwise flushed out tends to be a matter 
# of chance not fitness; all mutants with an insertion in a specific transposon gene could be flushed out by chance!

				if score == 0:
					blank_ws += 1
					
# Adds the fitness values of the insertions within normalization genes together and increments count so their average fitness (sum/count) can be calculated later on
					
				sum += score
				count += 1
				
# Records the weights of the fitness values of the insertion locations in corresponding lists - for example, weights[2] would be the weight of the fitness value at score[2]

				weights.append(avg)
				scores.append(score)
				
				print str(list[9]) + " " + str(score) + " " + str(c1)

# Counts and removes all "blank" fitness values of normalization genes - those that = 0 - because they most likely don't really have a fitness value of 0, and you just happened to not get any reads from that location at t2. 
    
	blank_count = 0
	original_count = len(scores)
	i = 0
	while i < original_count:
		w_value = scores[i]
		if w_value == 0:
			blank_count += 1
			weights.pop[i]
			scores.pop[i]
			i-=1
		i += 1

# If no normalization genes can pass the cutoff, normalization cannot occur, so this ends the script advises the user to try again and lower cutoff and/or cutoff2.
	
	if len(scores) == 0:
		print 'ERROR: The normalization genes do not have enough reads to pass cutoff and/or cutoff2; please lower one or both of those arguments.' + "\n"
		quit()
		
# Prints the number of of blank fitness values found and removed for reference. Writes the percentage to a file so it can be referenced for aggregate analysis.
	
	pc_blank_normals = float(blank_count) / float(original_count)
	print "# blank out of " + str(original_count) + ": " + str(pc_blank_normals) + "\n"
	with open(arguments.outfile2, "w") as f:
		f.write("blanks: " + str(pc_blank_normals) + "\n" + "total: " + str(total) + "\n" + "refname: " + refname)
    
    
# Finds "average" - the average fitness value for an insertion within the transposon genes - and "weighted_average" - the average fitness value for an insertion within the transposon genes weighted by how many insertions each had.

	average = sum / count
	i = 0
	weighted_sum = 0
	weight_sum = 0
	while i < len(weights):
		weighted_sum += weights[i]*scores[i]
		weight_sum += weights[i]
		i += 1
	weighted_average = weighted_sum/weight_sum
    
# Prints the regular average, weighted average, and total insertions for reference
   
	print "Normalization step:" + "\n"
	print "Regular average: " + str(average) + "\n"
	print "Weighted Average: " + str(weighted_average) + "\n"
	print "Total Insertions: " + str(count) + "\n"
    
# The actual normalization happens here; every fitness score is divided by the average fitness found for genes that should have a value of 1. 
# For example, if the average fitness for genes was too low overall - let's say 0.97 within the normalization geness - every fitness would be proportionally raised.

	old_ws = 0
	new_ws = 0
	wcount = 0
	for list in results:
		if list[11] == 'W':
			continue
		new_w = float(list[11])/weighted_average
		
# Sometimes you want to multiply all the fitness values by a constant; this does that.
# For example you might multiply all the values by a constant for a genetic interaction screen - where Tn-Seq is performed as usual except there's one background knockout all the mutants share. This is
# because independent mutations should have a fitness value that's equal to their individual fitness values multipled, but related mutations will deviate from that; to find those deviations you'd multiply
# all the fitness values from mutants from a normal library by the fitness of the background knockout and compare that to the fitness values found from the knockout library!
		
		if arguments.multiply:
			new_w *= float(arguments.multiply)
		
# Records the old w score for reference, and adds it to a total sum of all w scores (so that the old w mean and new w mean can be printed later).
		
		if float(list[11]) > 0:
			old_ws += float(list[11])
			new_ws += new_w
			wcount += 1
			
# Writes the new w score into the results list of lists.
			
		list[12] = new_w
		
# Adds a line to wiglist for each insertion position, with the insertion position and its new w value.
		
		if (arguments.wig):
			wigstring += str(list[0]) + " " + str(new_w) + "\n"
      
# Prints the old w mean and new w mean for reference.
   
	old_w_mean = old_ws / wcount
	new_w_mean = new_ws / wcount
	print "Old W Average: " + str(old_w_mean) + "\n"
	print "New W Average: " + str(new_w_mean) + "\n"

# Overwrites the old file with the normalized file.

with open(arguments.outfile, "wb") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(results)
    
# If a WIG file was requested, actually creates the WIG file and writes wiglist to it
# So what's written here is the WIG header plus each insertion position and it's new w value if normalization were called for, and each insertion position and its unnormalized w value if normalization were not called for.
		
if (arguments.wig):
	if (arguments.normalize):
		with open(arguments.wig, "wb") as wigfile:
			wigfile.write(wigstring)
	else:
		for list in results:
			wigstring += str(list[0]) + " " + str(list[11]) + "\n"
		with open(arguments.wig, "wb") as wigfile:
				wigfile.write(wigstring)