Mercurial > repos > fcaramia > contra
diff Contra/baseline.py @ 0:7564f3b1e675
Uploaded
author | fcaramia |
---|---|
date | Thu, 13 Sep 2012 02:31:43 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Contra/baseline.py Thu Sep 13 02:31:43 2012 -0400 @@ -0,0 +1,253 @@ +#!/usr/bin/python + +import os +import sys +import fnmatch +import shlex +import subprocess +import shutil +import math +from optparse import OptionParser +from scripts.split_chromosome import splitByChromosome3 as splitByChromosome +from scripts.get_chr_length import * +from scripts.convert_targeted_regions import * +from scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate +from multiprocessing import Pool + +class Params: + """ + Class for top-level system parameters: + """ + + def __init__(self): + + def mult_files(option, opt_str, value, parser): + args=[] + for arg in parser.rargs: + if arg[0] != "-": + args.append(arg) + else: + del parser.rargs[:len(args)] + break + if getattr(parser.values, option.dest): + args.extend(getattr(parser.values, option.dest)) + setattr(parser.values, option.dest, args) + + + #command-line option definition + self.parser = OptionParser() + self.parser.add_option("-t", "--target", + help="Target region definition file [REQUIRED] [BED Format]", + action="store", type="string", dest="target") + self.parser.add_option("-f", "--files", + help="Files to be converted to baselines [REQUIRED] [BAM]", + action="callback", callback=mult_files, dest="files") + self.parser.add_option("-o", "--output", + help="Output folder [REQUIRED]", + action="store", type="string", dest="output") + self.parser.add_option("-c","--trim", + help="Portion of outliers to be removed before calculating average [Default: 0.2]", + action="store", dest="trim", default="0.2") + self.parser.add_option("-n","--name", + help="Output baseline name [Default: baseline]", + action="store", dest="name", default="baseline") + + # required parameters list: + self.ERRORLIST = [] + + #change system parameters based on any command line arguments + (options, args) = self.parser.parse_args() + if options.target: + self.TARGET=options.target + else: + self.ERRORLIST.append("target") + + if options.files: + self.FILES=options.files + else: + self.ERRORLIST.append("files") + + if options.output: + self.OUTPUT=options.output + else: + self.ERRORLIST.append("output") + + if options.trim: + self.TRIM=float(options.trim) + + if options.name: + self.NAME=options.name + + if len(self.ERRORLIST) != 0: + self.parser.print_help() + self.parser.error("Missing required parameters") + + + def repeat(self): + #params test + print "target :", self.TARGET + print "files :", self.FILES + print "output :", self.OUTPUT + print "trim :", self.TRIM + print "name :", self.NAME + + +# option handling +params = Params() +#params.repeat() +targetFile = params.TARGET +infiles = params.FILES +output_dir = params.OUTPUT + +#Debug +print " ------ baseline.py ------- " +print "Target:", targetFile +for files in infiles: + print "File:", files +print "Output Directory: ", output_dir + +def make_new_directory(outdir): + print outdir + if not os.path.exists(outdir): + os.mkdir(outdir) + +print " ----- creating output directory -----" +make_new_directory(output_dir) +outdir = os.path.join(output_dir, "buf") +make_new_directory(outdir) + +targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted") +os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2)) + +def processInFile(infile): + infilename = os.path.basename(infile) + s_outdir = os.path.join(outdir, infilename) + make_new_directory(s_outdir) + genomeFile = os.path.join(s_outdir,infilename +'.chrsummary') + get_genome(infile, genomeFile) + + bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") + args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(infile, genomeFile)) + iOutFile = open(bedgraph, "w") + output = subprocess.Popen(args, stdout = iOutFile).wait() + iOutFile.close() + + targetList = convertTarget(targetFile2) + + splitByChromosome(bedgraph) + + convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/") + bedgraph_tgtonly = bedgraph+".TARGETONLY" + bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE" + os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly) + os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg) + shutil.copy(bedgraph_tgtonly,outdir) + shutil.copy(bedgraph_tgtonly_avg,outdir) + +print "----- Processing Files -----" +pool = Pool(5) +pool.map(processInFile,infiles) + + +# STEP 2 -> Create Union BedGraph +allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")] +allfiles_path = [os.path.join(outdir,x) for x in allfiles_names] + +args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names + +print (str(args)) +fo = os.path.join(outdir,"TARGETONLY.union.txt") +foh = open(fo,"w") + +subprocess.Popen(args,stdout=foh).wait() +foh.close() + +# STEP 3 -> POOL METHOD +TRIM = params.TRIM #TRIM = 0.2 +f = fo +fh = open(f) + +fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt") +fo_libsize=fo+".LIBSIZE" + + +foh=open(fo,'w') +fo_libsize_h=open(fo_libsize,'w') + + +fh.readline() + +Ns = None +nSamples = None + +lineCount=0 +for line in fh: + lineCount+=1 + line_elems=line.rstrip().split("\t") + rangeLen=int(line_elems[2])-int(line_elems[1]) + if not Ns: + nSamples=len(line_elems)-3 + Ns = [0 for x in range(nSamples)] + for i in range(nSamples): + ind=i+3 + Ns[i] += int(line_elems[ind])*rangeLen + + +fh.close() +fh=open(f) +fh.readline() + + +lineCount=0 +nSamples_exclude = int(math.floor(TRIM*nSamples)) + +def gm_mean(xs): + tmpprod = 1 + p = 1.0/len(xs) + for x in xs: + tmpprod = tmpprod * math.pow(x,p) + return tmpprod + +Ns_gmean = gm_mean(Ns) + +def meanstdv(x): + from math import sqrt + n, mean, std = len(x), 0, 0 + for a in x: + mean = mean+a + mean = mean / float(n) + for a in x: + std = std+(a-mean)**2 + std=sqrt(std/float(n-1)) + return mean, std + + +libsize = 0 +for line in fh: + lineCount+=1 + line_elems=line.rstrip().split("\t") + rangeLen=int(line_elems[2])-int(line_elems[1]) + xs_tmp=[int(x) for x in line_elems[3:]] + xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))] + xs.sort() + xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)] + #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed)) + trimmed_mean,std=meanstdv(xs_trimmed) + libsize+=rangeLen*trimmed_mean + foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n") + + +fo_libsize_h.write(str(int(round(libsize)))) + +fh.close() +foh.close() +fo_libsize_h.close() + +def removeTempFolder(tempFolderPath): + import shutil + + shutil.rmtree(tempFolderPath) + print "Temp Folder Removed" + +# Removed Temp Folder +removeTempFolder(outdir)