Mercurial > repos > fcaramia > contra
comparison Contra/baseline.py @ 0:7564f3b1e675
Uploaded
| author | fcaramia |
|---|---|
| date | Thu, 13 Sep 2012 02:31:43 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7564f3b1e675 |
|---|---|
| 1 #!/usr/bin/python | |
| 2 | |
| 3 import os | |
| 4 import sys | |
| 5 import fnmatch | |
| 6 import shlex | |
| 7 import subprocess | |
| 8 import shutil | |
| 9 import math | |
| 10 from optparse import OptionParser | |
| 11 from scripts.split_chromosome import splitByChromosome3 as splitByChromosome | |
| 12 from scripts.get_chr_length import * | |
| 13 from scripts.convert_targeted_regions import * | |
| 14 from scripts.convert_gene_coordinate import convertGeneCoordinate2 as convertGeneCoordinate | |
| 15 from multiprocessing import Pool | |
| 16 | |
| 17 class Params: | |
| 18 """ | |
| 19 Class for top-level system parameters: | |
| 20 """ | |
| 21 | |
| 22 def __init__(self): | |
| 23 | |
| 24 def mult_files(option, opt_str, value, parser): | |
| 25 args=[] | |
| 26 for arg in parser.rargs: | |
| 27 if arg[0] != "-": | |
| 28 args.append(arg) | |
| 29 else: | |
| 30 del parser.rargs[:len(args)] | |
| 31 break | |
| 32 if getattr(parser.values, option.dest): | |
| 33 args.extend(getattr(parser.values, option.dest)) | |
| 34 setattr(parser.values, option.dest, args) | |
| 35 | |
| 36 | |
| 37 #command-line option definition | |
| 38 self.parser = OptionParser() | |
| 39 self.parser.add_option("-t", "--target", | |
| 40 help="Target region definition file [REQUIRED] [BED Format]", | |
| 41 action="store", type="string", dest="target") | |
| 42 self.parser.add_option("-f", "--files", | |
| 43 help="Files to be converted to baselines [REQUIRED] [BAM]", | |
| 44 action="callback", callback=mult_files, dest="files") | |
| 45 self.parser.add_option("-o", "--output", | |
| 46 help="Output folder [REQUIRED]", | |
| 47 action="store", type="string", dest="output") | |
| 48 self.parser.add_option("-c","--trim", | |
| 49 help="Portion of outliers to be removed before calculating average [Default: 0.2]", | |
| 50 action="store", dest="trim", default="0.2") | |
| 51 self.parser.add_option("-n","--name", | |
| 52 help="Output baseline name [Default: baseline]", | |
| 53 action="store", dest="name", default="baseline") | |
| 54 | |
| 55 # required parameters list: | |
| 56 self.ERRORLIST = [] | |
| 57 | |
| 58 #change system parameters based on any command line arguments | |
| 59 (options, args) = self.parser.parse_args() | |
| 60 if options.target: | |
| 61 self.TARGET=options.target | |
| 62 else: | |
| 63 self.ERRORLIST.append("target") | |
| 64 | |
| 65 if options.files: | |
| 66 self.FILES=options.files | |
| 67 else: | |
| 68 self.ERRORLIST.append("files") | |
| 69 | |
| 70 if options.output: | |
| 71 self.OUTPUT=options.output | |
| 72 else: | |
| 73 self.ERRORLIST.append("output") | |
| 74 | |
| 75 if options.trim: | |
| 76 self.TRIM=float(options.trim) | |
| 77 | |
| 78 if options.name: | |
| 79 self.NAME=options.name | |
| 80 | |
| 81 if len(self.ERRORLIST) != 0: | |
| 82 self.parser.print_help() | |
| 83 self.parser.error("Missing required parameters") | |
| 84 | |
| 85 | |
| 86 def repeat(self): | |
| 87 #params test | |
| 88 print "target :", self.TARGET | |
| 89 print "files :", self.FILES | |
| 90 print "output :", self.OUTPUT | |
| 91 print "trim :", self.TRIM | |
| 92 print "name :", self.NAME | |
| 93 | |
| 94 | |
| 95 # option handling | |
| 96 params = Params() | |
| 97 #params.repeat() | |
| 98 targetFile = params.TARGET | |
| 99 infiles = params.FILES | |
| 100 output_dir = params.OUTPUT | |
| 101 | |
| 102 #Debug | |
| 103 print " ------ baseline.py ------- " | |
| 104 print "Target:", targetFile | |
| 105 for files in infiles: | |
| 106 print "File:", files | |
| 107 print "Output Directory: ", output_dir | |
| 108 | |
| 109 def make_new_directory(outdir): | |
| 110 print outdir | |
| 111 if not os.path.exists(outdir): | |
| 112 os.mkdir(outdir) | |
| 113 | |
| 114 print " ----- creating output directory -----" | |
| 115 make_new_directory(output_dir) | |
| 116 outdir = os.path.join(output_dir, "buf") | |
| 117 make_new_directory(outdir) | |
| 118 | |
| 119 targetFile2 = os.path.join(outdir, os.path.basename(targetFile)+".sorted") | |
| 120 os.system("sort -k1,1 -k2n %s > %s" %(targetFile,targetFile2)) | |
| 121 | |
| 122 def processInFile(infile): | |
| 123 infilename = os.path.basename(infile) | |
| 124 s_outdir = os.path.join(outdir, infilename) | |
| 125 make_new_directory(s_outdir) | |
| 126 genomeFile = os.path.join(s_outdir,infilename +'.chrsummary') | |
| 127 get_genome(infile, genomeFile) | |
| 128 | |
| 129 bedgraph = os.path.join(s_outdir, infilename + ".BEDGRAPH") | |
| 130 args = shlex.split("genomeCoverageBed -ibam %s -bga -g %s" %(infile, genomeFile)) | |
| 131 iOutFile = open(bedgraph, "w") | |
| 132 output = subprocess.Popen(args, stdout = iOutFile).wait() | |
| 133 iOutFile.close() | |
| 134 | |
| 135 targetList = convertTarget(targetFile2) | |
| 136 | |
| 137 splitByChromosome(bedgraph) | |
| 138 | |
| 139 convertGeneCoordinate(targetList, os.path.dirname(bedgraph)+"/") | |
| 140 bedgraph_tgtonly = bedgraph+".TARGETONLY" | |
| 141 bedgraph_tgtonly_avg = bedgraph+".TARGETONLY.AVERAGE" | |
| 142 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage.txt"),bedgraph_tgtonly) | |
| 143 os.rename(os.path.join(os.path.dirname(bedgraph),"geneRefCoverage_targetAverage.txt"),bedgraph_tgtonly_avg) | |
| 144 shutil.copy(bedgraph_tgtonly,outdir) | |
| 145 shutil.copy(bedgraph_tgtonly_avg,outdir) | |
| 146 | |
| 147 print "----- Processing Files -----" | |
| 148 pool = Pool(5) | |
| 149 pool.map(processInFile,infiles) | |
| 150 | |
| 151 | |
| 152 # STEP 2 -> Create Union BedGraph | |
| 153 allfiles_names = [x for x in os.listdir(outdir) if x.endswith("TARGETONLY")] | |
| 154 allfiles_path = [os.path.join(outdir,x) for x in allfiles_names] | |
| 155 | |
| 156 args=["unionBedGraphs","-header","-i"]+allfiles_path+["-names"]+allfiles_names | |
| 157 | |
| 158 print (str(args)) | |
| 159 fo = os.path.join(outdir,"TARGETONLY.union.txt") | |
| 160 foh = open(fo,"w") | |
| 161 | |
| 162 subprocess.Popen(args,stdout=foh).wait() | |
| 163 foh.close() | |
| 164 | |
| 165 # STEP 3 -> POOL METHOD | |
| 166 TRIM = params.TRIM #TRIM = 0.2 | |
| 167 f = fo | |
| 168 fh = open(f) | |
| 169 | |
| 170 fo=os.path.join(output_dir, params.NAME+".pooled2_TRIM"+str(TRIM)+".txt") | |
| 171 fo_libsize=fo+".LIBSIZE" | |
| 172 | |
| 173 | |
| 174 foh=open(fo,'w') | |
| 175 fo_libsize_h=open(fo_libsize,'w') | |
| 176 | |
| 177 | |
| 178 fh.readline() | |
| 179 | |
| 180 Ns = None | |
| 181 nSamples = None | |
| 182 | |
| 183 lineCount=0 | |
| 184 for line in fh: | |
| 185 lineCount+=1 | |
| 186 line_elems=line.rstrip().split("\t") | |
| 187 rangeLen=int(line_elems[2])-int(line_elems[1]) | |
| 188 if not Ns: | |
| 189 nSamples=len(line_elems)-3 | |
| 190 Ns = [0 for x in range(nSamples)] | |
| 191 for i in range(nSamples): | |
| 192 ind=i+3 | |
| 193 Ns[i] += int(line_elems[ind])*rangeLen | |
| 194 | |
| 195 | |
| 196 fh.close() | |
| 197 fh=open(f) | |
| 198 fh.readline() | |
| 199 | |
| 200 | |
| 201 lineCount=0 | |
| 202 nSamples_exclude = int(math.floor(TRIM*nSamples)) | |
| 203 | |
| 204 def gm_mean(xs): | |
| 205 tmpprod = 1 | |
| 206 p = 1.0/len(xs) | |
| 207 for x in xs: | |
| 208 tmpprod = tmpprod * math.pow(x,p) | |
| 209 return tmpprod | |
| 210 | |
| 211 Ns_gmean = gm_mean(Ns) | |
| 212 | |
| 213 def meanstdv(x): | |
| 214 from math import sqrt | |
| 215 n, mean, std = len(x), 0, 0 | |
| 216 for a in x: | |
| 217 mean = mean+a | |
| 218 mean = mean / float(n) | |
| 219 for a in x: | |
| 220 std = std+(a-mean)**2 | |
| 221 std=sqrt(std/float(n-1)) | |
| 222 return mean, std | |
| 223 | |
| 224 | |
| 225 libsize = 0 | |
| 226 for line in fh: | |
| 227 lineCount+=1 | |
| 228 line_elems=line.rstrip().split("\t") | |
| 229 rangeLen=int(line_elems[2])-int(line_elems[1]) | |
| 230 xs_tmp=[int(x) for x in line_elems[3:]] | |
| 231 xs = [float(xs_tmp[i])*Ns_gmean/Ns[i] for i in range(len(xs_tmp))] | |
| 232 xs.sort() | |
| 233 xs_trimmed=xs[nSamples_exclude:(nSamples-nSamples_exclude)] | |
| 234 #trimmed_mean=sum(xs_trimmed)/float(len(xs_trimmed)) | |
| 235 trimmed_mean,std=meanstdv(xs_trimmed) | |
| 236 libsize+=rangeLen*trimmed_mean | |
| 237 foh.write("\t".join(line_elems[0:3]+[str(trimmed_mean),str(std)])+"\n") | |
| 238 | |
| 239 | |
| 240 fo_libsize_h.write(str(int(round(libsize)))) | |
| 241 | |
| 242 fh.close() | |
| 243 foh.close() | |
| 244 fo_libsize_h.close() | |
| 245 | |
| 246 def removeTempFolder(tempFolderPath): | |
| 247 import shutil | |
| 248 | |
| 249 shutil.rmtree(tempFolderPath) | |
| 250 print "Temp Folder Removed" | |
| 251 | |
| 252 # Removed Temp Folder | |
| 253 removeTempFolder(outdir) |
