Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgQC.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 # oct 15 rpy replaced - temp fix until we get gnuplot working | |
| 2 # rpy deprecated - replace with RRun | |
| 3 # fixes to run functional test! oct1 2009 | |
| 4 # needed to expand our path with os.path.realpath to get newpath working with | |
| 5 # all the fancy pdfnup stuff | |
| 6 # and a fix to pruneld to write output to where it should be | |
| 7 # smallish data in test-data/smallwga in various forms | |
| 8 # python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga | |
| 9 # child files are deprecated and broken as at july 19 2009 | |
| 10 # need to move them to the html file extrafiles path | |
| 11 # found lots of corner cases with some illumina data where cnv markers were | |
| 12 # included | |
| 13 # and where affection status was all missing ! | |
| 14 # added links to tab files showing worst 1/keepfrac markers and subjects | |
| 15 # ross lazarus january 2008 | |
| 16 # | |
| 17 # added named parameters | |
| 18 # to ensure no silly slippages if non required parameter in the most general case | |
| 19 # some potentially useful things here reusable in complex scripts | |
| 20 # with lots'o'html (TM) | |
| 21 # aug 17 2007 rml | |
| 22 # | |
| 23 # added marker and subject and parenting april 14 rml | |
| 24 # took a while to get the absolute paths right for all the file munging | |
| 25 # as of april 16 seems to work.. | |
| 26 # getting galaxy to serve images in html reports is a little tricky | |
| 27 # we don't want QC reports to be dozens of individual files, so need | |
| 28 # to use the url /static/rg/... since galaxy's web server will happily serve images | |
| 29 # from there | |
| 30 # galaxy passes output files as relative paths | |
| 31 # these have to be munged by rgQC.py before calling this | |
| 32 # galaxy will pass in 2 file names - one for the log | |
| 33 # and one for the final html report | |
| 34 # of the form './database/files/dataset_66.dat' | |
| 35 # we need to be working in that directory so our plink output files are there | |
| 36 # so these have to be munged by rgQC.py before calling this | |
| 37 # note no ped file passed so had to remove the -l option | |
| 38 # for plinkParse.py that makes a heterozygosity report from the ped | |
| 39 # file - needs fixing... | |
| 40 # new: importing manhattan/qqplot plotter | |
| 41 # def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir): | |
| 42 # """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0 | |
| 43 # contains some R scripts as text strings - we substitute defaults into the calls | |
| 44 # to make them do our bidding - and save the resulting code for posterity | |
| 45 # this can be called externally, I guess...for QC eg? | |
| 46 # """ | |
| 47 # | |
| 48 # rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey)) | |
| 49 # rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir) | |
| 50 # return rlog,flist | |
| 51 | |
| 52 | |
| 53 from optparse import OptionParser | |
| 54 | |
| 55 import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string | |
| 56 from os.path import abspath | |
| 57 from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD | |
| 58 import rgManQQ | |
| 59 | |
| 60 prog = os.path.split(sys.argv[0])[1] | |
| 61 vers = '0.4 april 2009 rml' | |
| 62 idjoiner = '_~_~_' # need something improbable.. | |
| 63 # many of these may need fixing for a new install | |
| 64 | |
| 65 myversion = vers | |
| 66 keepfrac = 20 # fraction to keep after sorting by each interesting value | |
| 67 | |
| 68 missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change! | |
| 69 | |
| 70 mogresize = "x300" # this controls the width for jpeg thumbnails | |
| 71 | |
| 72 | |
| 73 | |
| 74 | |
| 75 def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''): | |
| 76 """ | |
| 77 marker rhead = ['snp','chrom','maf','a1','a2','missfrac', | |
| 78 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] | |
| 79 subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest'] | |
| 80 """ | |
| 81 | |
| 82 | |
| 83 def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50): | |
| 84 """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50) | |
| 85 # generic histogram and vertical boxplot in a 3:1 layout | |
| 86 # returns the graphic file name for inclusion in the web page | |
| 87 # broken out here for reuse | |
| 88 # ross lazarus march 2007 | |
| 89 """ | |
| 90 screenmat = (1,2,1,2) # create a 2x2 cabvas | |
| 91 widthlist = (80,20) # change to 4:1 ratio for histo and boxplot | |
| 92 rpy.r.pdf( outfname, height , width ) | |
| 93 #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot | |
| 94 m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True) | |
| 95 # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T) | |
| 96 rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot | |
| 97 maint = 'QC for %s - %s' % (basename,title) | |
| 98 rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8) | |
| 99 rpy.r.boxplot(plotme,main='',col="maroon",outline=False) | |
| 100 rpy.r.dev_off() | |
| 101 | |
| 102 def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100): | |
| 103 """ | |
| 104 Useful to see what various cutoffs yield - plot percentiles | |
| 105 """ | |
| 106 n = len(plotme) | |
| 107 maxveclen = 1000.0 # for reasonable pdf sizes! | |
| 108 yvec = copy.copy(plotme) | |
| 109 # arrives already in decending order of importance missingness or mendel count by subj or marker | |
| 110 xvec = range(n) | |
| 111 xvec = [100.0*(n-x)/n for x in xvec] # convert to centile | |
| 112 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 113 if n > maxveclen: # oversample part of the distribution | |
| 114 always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5% | |
| 115 skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points | |
| 116 samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values | |
| 117 yvec = [yvec[i] for i in samplei] # always get first and last | |
| 118 xvec = [xvec[i] for i in samplei] # always get first and last | |
| 119 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure | |
| 120 rpy.r.pdf( outfname, height , width ) | |
| 121 maint = 'QC for %s - %s' % (basename,title) | |
| 122 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 | |
| 123 rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8) | |
| 124 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") | |
| 125 rpy.r.dev_off() | |
| 126 | |
| 127 def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 128 """ | |
| 129 y is data for a qq plot and ends up on the x axis go figure | |
| 130 if sampling, oversample low values - all the top 1% ? | |
| 131 this version called with -log10 transformed hwe | |
| 132 """ | |
| 133 nrows = len(plotme) | |
| 134 fn = float(nrows) | |
| 135 xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))] | |
| 136 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line | |
| 137 maxveclen = 3000 | |
| 138 yvec = copy.copy(plotme) | |
| 139 if nrows > maxveclen: | |
| 140 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 141 # oversample part of the distribution | |
| 142 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% | |
| 143 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points | |
| 144 samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)] | |
| 145 # always oversample first sorted (here lowest) values | |
| 146 yvec = [yvec[i] for i in samplei] # always get first and last | |
| 147 xvec = [xvec[i] for i in samplei] # and sample xvec same way | |
| 148 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) | |
| 149 else: | |
| 150 maint='Log QQ Plot(n=%d)' % (nrows) | |
| 151 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line | |
| 152 ylab = '%s' % xlabname | |
| 153 xlab = '-log10(Uniform 0-1)' | |
| 154 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure | |
| 155 rpy.r.pdf( outfname, height , width ) | |
| 156 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 | |
| 157 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) | |
| 158 rpy.r.points(mx,mx,type='l') | |
| 159 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") | |
| 160 rpy.r.dev_off() | |
| 161 | |
| 162 def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 163 """ | |
| 164 data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a | |
| 165 grid of qq plots to show departure from null at extremes of data quality | |
| 166 Need to plot qqplot(p,unif) where the p's come from one x and one y quantile | |
| 167 and ends up on the x axis go figure | |
| 168 if sampling, oversample low values - all the top 1% ? | |
| 169 """ | |
| 170 data = copy.copy(plotme) | |
| 171 nvals = len(data) | |
| 172 stepsize = nvals/nsplits | |
| 173 logstep = math.log10(stepsize) # so is 3 for steps of 1000 | |
| 174 quints = range(0,nvals,stepsize) # quintile cutpoints for each layer | |
| 175 data.sort(key=itertools.itemgetter(1)) # into x order | |
| 176 rpy.r.pdf( outfname, height , width ) | |
| 177 rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits)) | |
| 178 yvec = [-math.log10(random.random()) for x in range(stepsize)] | |
| 179 yvec.sort() # size of each step is expected range for xvec under null?! | |
| 180 for rowstart in quints: | |
| 181 rowend = rowstart + stepsize | |
| 182 if nvals - rowend < stepsize: # finish last split | |
| 183 rowend = nvals | |
| 184 row = data[rowstart:rowend] | |
| 185 row.sort(key=itertools.itemgetter(2)) # into y order | |
| 186 for colstart in quints: | |
| 187 colend = colstart + stepsize | |
| 188 if nvals - colend < stepsize: # finish last split | |
| 189 colend = nvals | |
| 190 cell = row[colstart:colend] | |
| 191 xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell | |
| 192 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8) | |
| 193 rpy.r.points(c(0,logstep),c(0,logstep),type='l') | |
| 194 rpy.r.dev_off() | |
| 195 | |
| 196 | |
| 197 def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 198 """ | |
| 199 y is data for a qqnorm plot | |
| 200 if sampling, oversample low values - all the top 1% ? | |
| 201 """ | |
| 202 rangeunif = len(plotme) | |
| 203 nunif = 1000 | |
| 204 maxveclen = 3000 | |
| 205 nrows = len(plotme) | |
| 206 data = copy.copy(plotme) | |
| 207 if nrows > maxveclen: | |
| 208 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 209 # oversample part of the distribution | |
| 210 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% | |
| 211 skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points | |
| 212 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] | |
| 213 # always oversample first sorted (here lowest) values | |
| 214 yvec = [data[i] for i in samplei] # always get first and last | |
| 215 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) | |
| 216 else: | |
| 217 yvec = data | |
| 218 maint='Log QQ Plot(n=%d)' % (nrows) | |
| 219 n = 1000 | |
| 220 ylab = '%s' % xlabname | |
| 221 xlab = 'Normal' | |
| 222 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure | |
| 223 rpy.r.pdf( outfname, height , width ) | |
| 224 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 | |
| 225 rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) | |
| 226 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") | |
| 227 rpy.r.dev_off() | |
| 228 | |
| 229 def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 230 """ | |
| 231 layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness | |
| 232 like the GAIN qc tools | |
| 233 y is data for a qq plot and ends up on the x axis go figure | |
| 234 if sampling, oversample low values - all the top 1% ? | |
| 235 """ | |
| 236 rangeunif = len(plotme) | |
| 237 nunif = 1000 | |
| 238 fn = float(rangeunif) | |
| 239 xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))] | |
| 240 skip = max(int(rangeunif/fn),1) | |
| 241 # force include last points | |
| 242 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line | |
| 243 maxveclen = 2000 | |
| 244 nrows = len(plotme) | |
| 245 data = copy.copy(plotme) | |
| 246 data.sort() # low to high - oversample low values | |
| 247 if nrows > maxveclen: | |
| 248 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 249 # oversample part of the distribution | |
| 250 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% | |
| 251 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points | |
| 252 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] | |
| 253 # always oversample first sorted (here lowest) values | |
| 254 yvec = [data[i] for i in samplei] # always get first and last | |
| 255 xvec = [xvec[i] for i in samplei] # and sample xvec same way | |
| 256 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) | |
| 257 else: | |
| 258 yvec = data | |
| 259 maint='Log QQ Plot(n=%d)' % (nrows) | |
| 260 n = 1000 | |
| 261 mx = [0,log10(fn)] # if 1000, becomes 3 for the null line | |
| 262 ylab = '%s' % xlabname | |
| 263 xlab = '-log10(Uniform 0-1)' | |
| 264 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure | |
| 265 rpy.r.pdf( outfname, height , width ) | |
| 266 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 | |
| 267 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) | |
| 268 rpy.r.points(mx,mx,type='l') | |
| 269 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") | |
| 270 rpy.r.dev_off() | |
| 271 | |
| 272 | |
| 273 fdsto,stofile = tempfile.mkstemp() | |
| 274 sto = open(stofile,'w') | |
| 275 import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb | |
| 276 mog = 'mogrify' | |
| 277 pdfnup = 'pdfnup' | |
| 278 pdfjoin = 'pdfjoin' | |
| 279 shead = subjects.pop(0) # get rid of head | |
| 280 mhead = markers.pop(0) | |
| 281 maf = mhead.index('maf') | |
| 282 missfrac = mhead.index('missfrac') | |
| 283 logphweall = mhead.index('logp_hwe_all') | |
| 284 logphweunaff = mhead.index('logp_hwe_unaff') | |
| 285 # check for at least some unaffected rml june 2009 | |
| 286 m_mendel = mhead.index('N_Mendel') | |
| 287 fracmiss = shead.index('FracMiss') | |
| 288 s_mendel = shead.index('Mendel_errors') | |
| 289 s_het = shead.index('F_Stat') | |
| 290 params = {} | |
| 291 hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff | |
| 292 and x[logphweunaff].upper() <> 'NA'] | |
| 293 if len(hweres) <> 0: | |
| 294 xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] | |
| 295 # plot for each of these cols | |
| 296 else: # try hwe all instead - maybe no affection status available | |
| 297 xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] | |
| 298 ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything! | |
| 299 oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled | |
| 300 qqplotme = [1,0,0,0,0,0,0] # | |
| 301 qnplotme = [0,0,0,0,0,0,1] # | |
| 302 nplots = len(xs) | |
| 303 xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency', | |
| 304 'Marker Mendel Error Count', 'Missing Rate: Subjects', | |
| 305 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic'] | |
| 306 plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het'] | |
| 307 ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames | |
| 308 ordplotnames = ['%s_cum' % x for x in plotnames] | |
| 309 ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames | |
| 310 outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)] | |
| 311 ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)] | |
| 312 datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table | |
| 313 titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel", | |
| 314 "Subject Missing Genotype","Subject Mendel",'Subject F Statistic'] | |
| 315 html = [] | |
| 316 pdflist = [] | |
| 317 for n,column in enumerate(xs): | |
| 318 dat = [float(x[column]) for x in datasources[n] if len(x) >= column | |
| 319 and x[column][:2].upper() <> 'NA'] # plink gives both! | |
| 320 if sum(dat) <> 0: # eg nada for mendel if case control? | |
| 321 rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n], | |
| 322 title=titles[n],basename=basename,nbreaks=nbreaks) | |
| 323 row = [titles[n],ploturls[n],outfnames[n] ] | |
| 324 html.append(row) | |
| 325 pdflist.append(outfnames[n]) | |
| 326 if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up | |
| 327 otitle = 'Ranked %s' % titles[n] | |
| 328 dat.sort() | |
| 329 if oreverseme[n]: | |
| 330 dat.reverse() | |
| 331 rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n], | |
| 332 title=otitle,basename=basename,nbreaks=1000) | |
| 333 row = [otitle,ordploturls[n],ordoutfnames[n]] | |
| 334 html.append(row) | |
| 335 pdflist.append(ordoutfnames[n]) | |
| 336 if qqplotme[n]: # | |
| 337 otitle = 'LogQQ plot %s' % titles[n] | |
| 338 dat.sort() | |
| 339 dat.reverse() | |
| 340 ofn = os.path.split(ordoutfnames[n]) | |
| 341 ofn = os.path.join(ofn[0],'QQ%s' % ofn[1]) | |
| 342 ofu = os.path.split(ordploturls[n]) | |
| 343 ofu = os.path.join(ofu[0],'QQ%s' % ofu[1]) | |
| 344 rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n], | |
| 345 title=otitle,basename=basename) | |
| 346 row = [otitle,ofu,ofn] | |
| 347 html.append(row) | |
| 348 pdflist.append(ofn) | |
| 349 elif qnplotme[n]: | |
| 350 otitle = 'F Statistic %s' % titles[n] | |
| 351 dat.sort() | |
| 352 dat.reverse() | |
| 353 ofn = os.path.split(ordoutfnames[n]) | |
| 354 ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1]) | |
| 355 ofu = os.path.split(ordploturls[n]) | |
| 356 ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1]) | |
| 357 rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n], | |
| 358 title=otitle,basename=basename) | |
| 359 row = [otitle,ofu,ofn] | |
| 360 html.append(row) | |
| 361 pdflist.append(ofn) | |
| 362 else: | |
| 363 print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10]) | |
| 364 if nup>0: | |
| 365 # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf` | |
| 366 # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf | |
| 367 filestojoin = ' '.join(pdflist) # all the file names so far | |
| 368 afname = '%s_All_Paged.pdf' % (basename) | |
| 369 fullafname = os.path.join(newfpath,afname) | |
| 370 expl = 'All %s QC Plots joined into a single pdf' % basename | |
| 371 vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname) | |
| 372 # make single page pdf | |
| 373 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto) | |
| 374 retval = x.wait() | |
| 375 row = [expl,afname,fullafname] | |
| 376 html.insert(0,row) # last rather than second | |
| 377 nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup) | |
| 378 fullnfname = os.path.join(newfpath,nfname) | |
| 379 expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup) | |
| 380 vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname) | |
| 381 # make thumbnail images | |
| 382 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto) | |
| 383 retval = x.wait() | |
| 384 row = [expl,nfname,fullnfname] | |
| 385 html.insert(1,row) # this goes second | |
| 386 vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf')) | |
| 387 # make thumbnail images | |
| 388 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto) | |
| 389 retval = x.wait() | |
| 390 sto.close() | |
| 391 cruft = open(stofile,'r').readlines() | |
| 392 return html,cruft # elements for an ordered list of urls or whatever.. | |
| 393 | |
| 394 | |
| 395 def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''): | |
| 396 """ | |
| 397 nice try but the R scripts are huge and take forever to run if there's a lot of data | |
| 398 marker rhead = ['snp','chrom','maf','a1','a2','missfrac', | |
| 399 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] | |
| 400 subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest'] | |
| 401 """ | |
| 402 colour = "maroon" | |
| 403 | |
| 404 def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks): | |
| 405 """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50) | |
| 406 # generic histogram and vertical boxplot in a 3:1 layout | |
| 407 # returns the graphic file name for inclusion in the web page | |
| 408 # broken out here for reuse | |
| 409 # ross lazarus march 2007 | |
| 410 """ | |
| 411 R = [] | |
| 412 maint = 'QC for %s - %s' % (basename,title) | |
| 413 screenmat = (1,2,1,2) # create a 2x2 canvas | |
| 414 widthlist = (80,20) # change to 4:1 ratio for histo and boxplot | |
| 415 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) | |
| 416 R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") | |
| 417 R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme) | |
| 418 R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour)) | |
| 419 R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) ) | |
| 420 R.append('dev.off()') | |
| 421 return R | |
| 422 | |
| 423 def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100): | |
| 424 """ | |
| 425 Useful to see what various cutoffs yield - plot percentiles | |
| 426 """ | |
| 427 R = [] | |
| 428 n = len(plotme) | |
| 429 R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme) | |
| 430 # arrives already in decending order of importance missingness or mendel count by subj or marker | |
| 431 maint = 'QC for %s - %s' % (basename,title) | |
| 432 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) | |
| 433 R.append("par(lab=c(10,10,10))") | |
| 434 R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour)) | |
| 435 R.append('dev.off()') | |
| 436 return R | |
| 437 | |
| 438 def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 439 """ | |
| 440 y is data for a qq plot and ends up on the x axis go figure | |
| 441 if sampling, oversample low values - all the top 1% ? | |
| 442 this version called with -log10 transformed hwe | |
| 443 """ | |
| 444 R = [] | |
| 445 nrows = len(plotme) | |
| 446 fn = float(nrows) | |
| 447 xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))] | |
| 448 #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line | |
| 449 maxveclen = 3000 | |
| 450 yvec = copy.copy(plotme) | |
| 451 if nrows > maxveclen: | |
| 452 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 453 # oversample part of the distribution | |
| 454 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% | |
| 455 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points | |
| 456 if skip < 2: | |
| 457 skip = 2 | |
| 458 samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)] | |
| 459 # always oversample first sorted (here lowest) values | |
| 460 yvec = [yvec[i] for i in samplei] # always get first and last | |
| 461 xvec = [xvec[i] for i in samplei] # and sample xvec same way | |
| 462 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) | |
| 463 else: | |
| 464 maint='Log QQ Plot(n=%d)' % (nrows) | |
| 465 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line | |
| 466 ylab = '%s' % xlabname | |
| 467 xlab = '-log10(Uniform 0-1)' | |
| 468 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure | |
| 469 x = ['%f' % x for x in xvec] | |
| 470 R.append('xvec = c(%s)' % ','.join(x)) | |
| 471 y = ['%f' % x for x in yvec] | |
| 472 R.append('yvec = c(%s)' % ','.join(y)) | |
| 473 R.append('mx = c(0,%f)' % (math.log10(fn))) | |
| 474 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) | |
| 475 R.append("par(lab=c(10,10,10))") | |
| 476 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) | |
| 477 R.append('points(mx,mx,type="l")') | |
| 478 R.append('grid(col="lightgray",lty="dotted")') | |
| 479 R.append('dev.off()') | |
| 480 return R | |
| 481 | |
| 482 def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 483 """ | |
| 484 data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a | |
| 485 grid of qq plots to show departure from null at extremes of data quality | |
| 486 Need to plot qqplot(p,unif) where the p's come from one x and one y quantile | |
| 487 and ends up on the x axis go figure | |
| 488 if sampling, oversample low values - all the top 1% ? | |
| 489 """ | |
| 490 data = copy.copy(plotme) | |
| 491 nvals = len(data) | |
| 492 stepsize = nvals/nsplits | |
| 493 logstep = math.log10(stepsize) # so is 3 for steps of 1000 | |
| 494 R.append('mx = c(0,%f)' % logstep) | |
| 495 quints = range(0,nvals,stepsize) # quintile cutpoints for each layer | |
| 496 data.sort(key=itertools.itemgetter(1)) # into x order | |
| 497 #rpy.r.pdf( outfname, h , w ) | |
| 498 #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits)) | |
| 499 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) | |
| 500 yvec = [-math.log10(random.random()) for x in range(stepsize)] | |
| 501 yvec.sort() # size of each step is expected range for xvec under null?! | |
| 502 y = ['%f' % x for x in yvec] | |
| 503 R.append('yvec = c(%s)' % ','.join(y)) | |
| 504 for rowstart in quints: | |
| 505 rowend = rowstart + stepsize | |
| 506 if nvals - rowend < stepsize: # finish last split | |
| 507 rowend = nvals | |
| 508 row = data[rowstart:rowend] | |
| 509 row.sort(key=itertools.itemgetter(2)) # into y order | |
| 510 for colstart in quints: | |
| 511 colend = colstart + stepsize | |
| 512 if nvals - colend < stepsize: # finish last split | |
| 513 colend = nvals | |
| 514 cell = row[colstart:colend] | |
| 515 xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell | |
| 516 x = ['%f' % x for x in xvec] | |
| 517 R.append('xvec = c(%s)' % ','.join(x)) | |
| 518 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) | |
| 519 R.append('points(mx,mx,type="l")') | |
| 520 R.append('grid(col="lightgray",lty="dotted")') | |
| 521 #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8) | |
| 522 #rpy.r.points(c(0,logstep),c(0,logstep),type='l') | |
| 523 R.append('dev.off()') | |
| 524 #rpy.r.dev_off() | |
| 525 return R | |
| 526 | |
| 527 | |
| 528 def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 529 """ | |
| 530 y is data for a qqnorm plot | |
| 531 if sampling, oversample low values - all the top 1% ? | |
| 532 """ | |
| 533 rangeunif = len(plotme) | |
| 534 nunif = 1000 | |
| 535 maxveclen = 3000 | |
| 536 nrows = len(plotme) | |
| 537 data = copy.copy(plotme) | |
| 538 if nrows > maxveclen: | |
| 539 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 540 # oversample part of the distribution | |
| 541 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% | |
| 542 skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points | |
| 543 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] | |
| 544 # always oversample first sorted (here lowest) values | |
| 545 yvec = [data[i] for i in samplei] # always get first and last | |
| 546 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) | |
| 547 else: | |
| 548 yvec = data | |
| 549 maint='Log QQ Plot(n=%d)' % (nrows) | |
| 550 n = 1000 | |
| 551 ylab = '%s' % xlabname | |
| 552 xlab = 'Normal' | |
| 553 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure | |
| 554 #rpy.r.pdf( outfname, h , w ) | |
| 555 #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5 | |
| 556 #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8) | |
| 557 #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted") | |
| 558 #rpy.r.dev_off() | |
| 559 y = ['%f' % x for x in yvec] | |
| 560 R.append('yvec = c(%s)' % ','.join(y)) | |
| 561 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) | |
| 562 R.append("par(lab=c(10,10,10))") | |
| 563 R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) | |
| 564 R.append('grid(col="lightgray",lty="dotted")') | |
| 565 R.append('dev.off()') | |
| 566 return R | |
| 567 | |
| 568 def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''): | |
| 569 """ | |
| 570 layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness | |
| 571 like the GAIN qc tools | |
| 572 y is data for a qq plot and ends up on the x axis go figure | |
| 573 if sampling, oversample low values - all the top 1% ? | |
| 574 """ | |
| 575 rangeunif = len(plotme) | |
| 576 nunif = 1000 | |
| 577 fn = float(rangeunif) | |
| 578 xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))] | |
| 579 skip = max(int(rangeunif/fn),1) | |
| 580 # force include last points | |
| 581 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line | |
| 582 maxveclen = 2000 | |
| 583 nrows = len(plotme) | |
| 584 data = copy.copy(plotme) | |
| 585 data.sort() # low to high - oversample low values | |
| 586 if nrows > maxveclen: | |
| 587 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 588 # oversample part of the distribution | |
| 589 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5% | |
| 590 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points | |
| 591 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] | |
| 592 # always oversample first sorted (here lowest) values | |
| 593 yvec = [data[i] for i in samplei] # always get first and last | |
| 594 xvec = [xvec[i] for i in samplei] # and sample xvec same way | |
| 595 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows) | |
| 596 else: | |
| 597 yvec = data | |
| 598 maint='Log QQ Plot(n=%d)' % (nrows) | |
| 599 n = 1000 | |
| 600 mx = [0,log10(fn)] # if 1000, becomes 3 for the null line | |
| 601 ylab = '%s' % xlabname | |
| 602 xlab = '-log10(Uniform 0-1)' | |
| 603 R.append('mx = c(0,%f)' % (math.log10(fn))) | |
| 604 x = ['%f' % x for x in xvec] | |
| 605 R.append('xvec = c(%s)' % ','.join(x)) | |
| 606 y = ['%f' % x for x in yvec] | |
| 607 R.append('yvec = c(%s)' % ','.join(y)) | |
| 608 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width)) | |
| 609 R.append("par(lab=c(10,10,10))") | |
| 610 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour)) | |
| 611 R.append('points(mx,mx,type="l")') | |
| 612 R.append('grid(col="lightgray",lty="dotted")') | |
| 613 R.append('dev.off()') | |
| 614 | |
| 615 | |
| 616 shead = subjects.pop(0) # get rid of head | |
| 617 mhead = markers.pop(0) | |
| 618 maf = mhead.index('maf') | |
| 619 missfrac = mhead.index('missfrac') | |
| 620 logphweall = mhead.index('logp_hwe_all') | |
| 621 logphweunaff = mhead.index('logp_hwe_unaff') | |
| 622 # check for at least some unaffected rml june 2009 | |
| 623 m_mendel = mhead.index('N_Mendel') | |
| 624 fracmiss = shead.index('FracMiss') | |
| 625 s_mendel = shead.index('Mendel_errors') | |
| 626 s_het = shead.index('F_Stat') | |
| 627 params = {} | |
| 628 h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff | |
| 629 and x[logphweunaff].upper() <> 'NA'] | |
| 630 if len(h) <> 0: | |
| 631 xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] | |
| 632 # plot for each of these cols | |
| 633 else: # try hwe all instead - maybe no affection status available | |
| 634 xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] | |
| 635 ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything! | |
| 636 oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled | |
| 637 qqplotme = [1,0,0,0,0,0,0] # | |
| 638 qnplotme = [0,0,0,0,0,0,1] # | |
| 639 nplots = len(xs) | |
| 640 xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency', | |
| 641 'Marker Mendel Error Count', 'Missing Rate: Subjects', | |
| 642 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic'] | |
| 643 plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het'] | |
| 644 ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames | |
| 645 ordplotnames = ['%s_cum' % x for x in plotnames] | |
| 646 ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames | |
| 647 outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)] | |
| 648 ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)] | |
| 649 datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table | |
| 650 titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel", | |
| 651 "Subject Missing Genotype","Subject Mendel",'Subject F Statistic'] | |
| 652 html = [] | |
| 653 pdflist = [] | |
| 654 R = [] | |
| 655 for n,column in enumerate(xs): | |
| 656 dfn = '%d_%s.txt' % (n,titles[n]) | |
| 657 dfilepath = os.path.join(newfpath,dfn) | |
| 658 dat = [float(x[column]) for x in datasources[n] if len(x) >= column | |
| 659 and x[column][:2].upper() <> 'NA'] # plink gives both! | |
| 660 if sum(dat) <> 0: # eg nada for mendel if case control? | |
| 661 plotme = file(dfilepath,'w') | |
| 662 plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout! | |
| 663 tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n], | |
| 664 title=titles[n],basename=basename,nbreaks=nbreaks) | |
| 665 R += tR | |
| 666 row = [titles[n],ploturls[n],outfnames[n] ] | |
| 667 html.append(row) | |
| 668 pdflist.append(outfnames[n]) | |
| 669 if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up | |
| 670 otitle = 'Ranked %s' % titles[n] | |
| 671 dat.sort() | |
| 672 if oreverseme[n]: | |
| 673 dat.reverse() | |
| 674 ndat = len(dat) | |
| 675 xvec = range(ndat) | |
| 676 xvec = [100.0*(n-x)/n for x in xvec] # convert to centile | |
| 677 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points | |
| 678 maxveclen = 1000.0 # for reasonable pdf sizes! | |
| 679 if ndat > maxveclen: # oversample part of the distribution | |
| 680 always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5% | |
| 681 skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points | |
| 682 samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values | |
| 683 yvec = [yvec[i] for i in samplei] # always get first and last | |
| 684 xvec = [xvec[i] for i in samplei] # always get first and last | |
| 685 plotme = file(dfilepath,'w') | |
| 686 plotme.write('xvec\tyvec\n') | |
| 687 plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout! | |
| 688 tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n], | |
| 689 title=otitle,basename=basename,nbreaks=nbreaks) | |
| 690 R += tR | |
| 691 row = [otitle,ordploturls[n],ordoutfnames[n]] | |
| 692 html.append(row) | |
| 693 pdflist.append(ordoutfnames[n]) | |
| 694 if qqplotme[n]: # | |
| 695 otitle = 'LogQQ plot %s' % titles[n] | |
| 696 dat.sort() | |
| 697 dat.reverse() | |
| 698 ofn = os.path.split(ordoutfnames[n]) | |
| 699 ofn = os.path.join(ofn[0],'QQ%s' % ofn[1]) | |
| 700 ofu = os.path.split(ordploturls[n]) | |
| 701 ofu = os.path.join(ofu[0],'QQ%s' % ofu[1]) | |
| 702 tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n], | |
| 703 title=otitle,basename=basename) | |
| 704 R += tR | |
| 705 row = [otitle,ofu,ofn] | |
| 706 html.append(row) | |
| 707 pdflist.append(ofn) | |
| 708 elif qnplotme[n]: | |
| 709 otitle = 'F Statistic %s' % titles[n] | |
| 710 dat.sort() | |
| 711 dat.reverse() | |
| 712 ofn = os.path.split(ordoutfnames[n]) | |
| 713 ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1]) | |
| 714 ofu = os.path.split(ordploturls[n]) | |
| 715 ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1]) | |
| 716 tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n], | |
| 717 title=otitle,basename=basename) | |
| 718 R += tR | |
| 719 row = [otitle,ofu,ofn] | |
| 720 html.append(row) | |
| 721 pdflist.append(ofn) | |
| 722 else: | |
| 723 print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10]) | |
| 724 rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath) | |
| 725 if nup>0: | |
| 726 # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf` | |
| 727 # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf | |
| 728 filestojoin = ' '.join(pdflist) # all the file names so far | |
| 729 afname = '%s_All_Paged.pdf' % (basename) | |
| 730 fullafname = os.path.join(newfpath,afname) | |
| 731 expl = 'All %s QC Plots joined into a single pdf' % basename | |
| 732 vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname) | |
| 733 # make single page pdf | |
| 734 x=subprocess.Popen(vcl,shell=True,cwd=newfpath) | |
| 735 retval = x.wait() | |
| 736 row = [expl,afname,fullafname] | |
| 737 html.insert(0,row) # last rather than second | |
| 738 nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup) | |
| 739 fullnfname = os.path.join(newfpath,nfname) | |
| 740 expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup) | |
| 741 vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname) | |
| 742 # make thumbnail images | |
| 743 x=subprocess.Popen(vcl,shell=True,cwd=newfpath) | |
| 744 retval = x.wait() | |
| 745 row = [expl,nfname,fullnfname] | |
| 746 html.insert(1,row) # this goes second | |
| 747 vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf')) | |
| 748 # make thumbnail images | |
| 749 x=subprocess.Popen(vcl,shell=True,cwd=newfpath) | |
| 750 retval = x.wait() | |
| 751 return html # elements for an ordered list of urls or whatever.. | |
| 752 | |
| 753 def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'): | |
| 754 """ | |
| 755 NO LONGER USED - historical interest | |
| 756 count het loci for each subject to look for outliers = ? contamination | |
| 757 assume ped file is linkage format | |
| 758 need to make a ped file from the bed file we were passed | |
| 759 """ | |
| 760 vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file | |
| 761 p=subprocess.Popen(' '.join(vcl),shell=True) | |
| 762 retval = p.wait() | |
| 763 f = open('%s_recode.recode.ped' % froot,'r') | |
| 764 if not linkageped: | |
| 765 head = f.next() # throw away header | |
| 766 hets = [] # simple count of het loci per subject. Expect poisson? | |
| 767 n = 1 | |
| 768 for l in f: | |
| 769 n += 1 | |
| 770 ll = l.strip().split() | |
| 771 if len(ll) > 6: | |
| 772 iid = idjoiner.join(ll[:2]) # fam_iid | |
| 773 gender = ll[4] | |
| 774 alleles = ll[6:] | |
| 775 nallele = len(alleles) | |
| 776 nhet = 0 | |
| 777 for i in range(nallele/2): | |
| 778 a1=alleles[2*i] | |
| 779 a2=alleles[2*i+1] | |
| 780 if alleles[2*i] <> alleles[2*i+1]: # must be het | |
| 781 if not missvals.get(a1,None) and not missvals.get(a2,None): | |
| 782 nhet += 1 | |
| 783 hets.append((nhet,iid,gender)) # for sorting later | |
| 784 f.close() | |
| 785 hets.sort() | |
| 786 hets.reverse() # biggest nhet now on top | |
| 787 f = open(outfname ,'w') | |
| 788 res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions | |
| 789 res.insert(0,'nhetloci\tfamid_iid\tgender') | |
| 790 res.append('') | |
| 791 f.write('\n'.join(res)) | |
| 792 f.close() | |
| 793 | |
| 794 | |
| 795 | |
| 796 def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None): | |
| 797 """by subject (missingness = .imiss, mendel = .imendel) | |
| 798 assume replicates have an underscore in family id for | |
| 799 hapmap testing | |
| 800 For sorting, we need floats and integers | |
| 801 """ | |
| 802 isexfile = '%s.sexcheck' % froot | |
| 803 imissfile = '%s.imiss' % froot | |
| 804 imendfile = '%s.imendel' % froot | |
| 805 ihetfile = '%s.het' % froot | |
| 806 logf.write('## subject reports starting at %s\n' % timenow()) | |
| 807 outfile = os.path.join(newfpath,outfname) | |
| 808 idlist = [] | |
| 809 imissdict = {} | |
| 810 imenddict = {} | |
| 811 isexdict = {} | |
| 812 ihetdict = {} | |
| 813 Tops = {} | |
| 814 Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel', | |
| 815 'Ranked Sex check', 'Ranked Inbreeding (het) F statistic'] | |
| 816 Tsorts = [2,3,6,8] | |
| 817 Treverse = [True,True,True,False] # so first values are worser | |
| 818 #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest'] | |
| 819 ## FID IID MISS_PHENO N_MISS N_GENO F_MISS | |
| 820 ## 1552042370_A 1552042370_A N 5480 549883 0.009966 | |
| 821 ## 1552042410_A 1552042410_A N 1638 549883 0.002979 | |
| 822 | |
| 823 # ------------------missing-------------------- | |
| 824 # imiss has FID IID MISS_PHENO N_MISS F_MISS | |
| 825 # we want F_MISS | |
| 826 try: | |
| 827 f = open(imissfile,'r') | |
| 828 except: | |
| 829 logf.write('# file %s is missing - talk about irony\n' % imissfile) | |
| 830 f = None | |
| 831 if f: | |
| 832 for n,line in enumerate(f): | |
| 833 ll = line.strip().split() | |
| 834 if n == 0: | |
| 835 head = [x.upper() for x in ll] # expect above | |
| 836 fidpos = head.index('FID') | |
| 837 iidpos = head.index('IID') | |
| 838 fpos = head.index('F_MISS') | |
| 839 elif len(ll) >= fpos: # full line | |
| 840 fid = ll[fidpos] | |
| 841 #if fid.find('_') == -1: # not replicate! - icondb ids have these... | |
| 842 iid = ll[iidpos] | |
| 843 fmiss = ll[fpos] | |
| 844 id = '%s%s%s' % (fid,idjoiner,iid) | |
| 845 imissdict[id] = fmiss | |
| 846 idlist.append(id) | |
| 847 f.close() | |
| 848 logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist))) | |
| 849 # ------------------mend------------------- | |
| 850 # *.imendel has FID IID N | |
| 851 # we want N | |
| 852 gotmend = True | |
| 853 try: | |
| 854 f = open(imendfile,'r') | |
| 855 except: | |
| 856 gotmend = False | |
| 857 for id in idlist: | |
| 858 imenddict[id] = '0' | |
| 859 if gotmend: | |
| 860 for n,line in enumerate(f): | |
| 861 ll = line.strip().split() | |
| 862 if n == 0: | |
| 863 head = [x.upper() for x in ll] # expect above | |
| 864 npos = head.index('N') | |
| 865 fidpos = head.index('FID') | |
| 866 iidpos = head.index('IID') | |
| 867 elif len(ll) >= npos: # full line | |
| 868 fid = ll[fidpos] | |
| 869 iid = ll[iidpos] | |
| 870 id = '%s%s%s' % (fid,idjoiner,iid) | |
| 871 nmend = ll[npos] | |
| 872 imenddict[id] = nmend | |
| 873 f.close() | |
| 874 else: | |
| 875 logf.write('## error No %s file - assuming not family data\n' % imendfile) | |
| 876 # ------------------sex check------------------ | |
| 877 #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck | |
| 878 # sexcheck has FID IID PEDSEX SNPSEX STATUS F | |
| 879 ## | |
| 880 ## FID Family ID | |
| 881 ## IID Individual ID | |
| 882 ## PEDSEX Sex as determined in pedigree file (1=male, 2=female) | |
| 883 ## SNPSEX Sex as determined by X chromosome | |
| 884 ## STATUS Displays "PROBLEM" or "OK" for each individual | |
| 885 ## F The actual X chromosome inbreeding (homozygosity) estimate | |
| 886 ## | |
| 887 ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are | |
| 888 ## ambiguous with regard to sex. | |
| 889 ## A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2. | |
| 890 try: | |
| 891 f = open(isexfile,'r') | |
| 892 got_sexcheck = 1 | |
| 893 except: | |
| 894 got_sexcheck = 0 | |
| 895 if got_sexcheck: | |
| 896 for n,line in enumerate(f): | |
| 897 ll = line.strip().split() | |
| 898 if n == 0: | |
| 899 head = [x.upper() for x in ll] # expect above | |
| 900 fidpos = head.index('FID') | |
| 901 iidpos = head.index('IID') | |
| 902 pedsexpos = head.index('PEDSEX') | |
| 903 snpsexpos = head.index('SNPSEX') | |
| 904 statuspos = head.index('STATUS') | |
| 905 fpos = head.index('F') | |
| 906 elif len(ll) >= fpos: # full line | |
| 907 fid = ll[fidpos] | |
| 908 iid = ll[iidpos] | |
| 909 fest = ll[fpos] | |
| 910 pedsex = ll[pedsexpos] | |
| 911 snpsex = ll[snpsexpos] | |
| 912 stat = ll[statuspos] | |
| 913 id = '%s%s%s' % (fid,idjoiner,iid) | |
| 914 isexdict[id] = (pedsex,snpsex,stat,fest) | |
| 915 f.close() | |
| 916 else: | |
| 917 # this only happens if there are no subjects! | |
| 918 logf.write('No %s file - assuming no sex errors' % isexfile) | |
| 919 ## | |
| 920 ## FID IID O(HOM) E(HOM) N(NM) F | |
| 921 ## 457 2 490665 4.928e+05 722154 -0.009096 | |
| 922 ## 457 3 464519 4.85e+05 710986 -0.0908 | |
| 923 ## 1037 2 461632 4.856e+05 712025 -0.106 | |
| 924 ## 1037 1 491845 4.906e+05 719353 0.005577 | |
| 925 try: | |
| 926 f = open(ihetfile,'r') | |
| 927 except: | |
| 928 f = None | |
| 929 logf.write('## No %s file - did we run --het in plink?\n' % ihetfile) | |
| 930 if f: | |
| 931 for i,line in enumerate(f): | |
| 932 ll = line.strip().split() | |
| 933 if i == 0: | |
| 934 head = [x.upper() for x in ll] # expect above | |
| 935 fidpos = head.index('FID') | |
| 936 iidpos = head.index('IID') | |
| 937 fpos = head.index('F') | |
| 938 n = 0 | |
| 939 elif len(ll) >= fpos: # full line | |
| 940 fid = ll[fidpos] | |
| 941 iid = ll[iidpos] | |
| 942 fhet = ll[fpos] | |
| 943 id = '%s%s%s' % (fid,idjoiner,iid) | |
| 944 ihetdict[id] = fhet | |
| 945 f.close() # now assemble and output result list | |
| 946 rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat'] | |
| 947 res = [] | |
| 948 fres = [] # floats for sorting | |
| 949 for id in idlist: # for each snp in found order | |
| 950 fid,iid = id.split(idjoiner) # recover keys | |
| 951 f_missing = imissdict.get(id,'0.0') | |
| 952 nmend = imenddict.get(id,'0') | |
| 953 (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0')) | |
| 954 fhet = ihetdict.get(id,'0.0') | |
| 955 res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet]) | |
| 956 try: | |
| 957 ff_missing = float(f_missing) | |
| 958 except: | |
| 959 ff_missing = 0.0 | |
| 960 try: | |
| 961 inmend = int(nmend) | |
| 962 except: | |
| 963 inmend = 0 | |
| 964 try: | |
| 965 ffest = float(fest) | |
| 966 except: | |
| 967 fest = 0.0 | |
| 968 try: | |
| 969 ffhet = float(fhet) | |
| 970 except: | |
| 971 ffhet = 0.0 | |
| 972 fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet]) | |
| 973 ntokeep = max(20,len(res)/keepfrac) | |
| 974 for i,col in enumerate(Tsorts): | |
| 975 fres.sort(key=operator.itemgetter(col)) | |
| 976 if Treverse[i]: | |
| 977 fres.reverse() | |
| 978 repname = Tnames[i] | |
| 979 Tops[repname] = fres[0:ntokeep] | |
| 980 Tops[repname] = [map(str,x) for x in Tops[repname]] | |
| 981 Tops[repname].insert(0,rhead) | |
| 982 res.sort() | |
| 983 res.insert(0,rhead) | |
| 984 logf.write('### writing %s report with %s' % ( outfile,res[0])) | |
| 985 f = open(outfile,'w') | |
| 986 f.write('\n'.join(['\t'.join(x) for x in res])) | |
| 987 f.write('\n') | |
| 988 f.close() | |
| 989 return res,Tops | |
| 990 | |
| 991 def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ): | |
| 992 """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq) | |
| 993 keep a list of marker order but keep all stats in dicts | |
| 994 write out a fake xls file for R or SAS etc | |
| 995 kinda clunky, but.. | |
| 996 TODO: ensure stable if any file not found? | |
| 997 """ | |
| 998 mapdict = {} | |
| 999 if maplist <> None: | |
| 1000 rslist = [x[1] for x in maplist] | |
| 1001 offset = [(x[0],x[3]) for x in maplist] | |
| 1002 mapdict = dict(zip(rslist,offset)) | |
| 1003 hwefile = '%s.hwe' % froot | |
| 1004 lmissfile = '%s.lmiss' % froot | |
| 1005 freqfile = '%s.frq' % froot | |
| 1006 lmendfile = '%s.lmendel' % froot | |
| 1007 outfile = os.path.join(newfpath,outfname) | |
| 1008 markerlist = [] | |
| 1009 chromlist = [] | |
| 1010 hwedict = {} | |
| 1011 lmissdict = {} | |
| 1012 freqdict = {} | |
| 1013 lmenddict = {} | |
| 1014 Tops = {} | |
| 1015 Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel'] | |
| 1016 Tsorts = [3,6,10,11] | |
| 1017 Treverse = [False,True,True,True] # so first values are worse(r) | |
| 1018 #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend]) | |
| 1019 #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] | |
| 1020 # -------------------hwe-------------------------- | |
| 1021 # hwe has SNP TEST GENO O(HET) E(HET) P_HWD | |
| 1022 # we want all hwe where P_HWD <> NA | |
| 1023 # ah changed in 1.04 to | |
| 1024 ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P | |
| 1025 ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644 | |
| 1026 ## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA | |
| 1027 ## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644 | |
| 1028 ## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975 | |
| 1029 ## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA | |
| 1030 ## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975 | |
| 1031 ## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067 | |
| 1032 ## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA | |
| 1033 ## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067 | |
| 1034 logf.write('## marker reports starting at %s\n' % timenow()) | |
| 1035 try: | |
| 1036 f = open(hwefile,'r') | |
| 1037 except: | |
| 1038 f = None | |
| 1039 logf.write('## error - no hwefile %s found\n' % hwefile) | |
| 1040 if f: | |
| 1041 for i,line in enumerate(f): | |
| 1042 ll = line.strip().split() | |
| 1043 if i == 0: # head | |
| 1044 head = [x.upper() for x in ll] # expect above | |
| 1045 try: | |
| 1046 testpos = head.index('TEST') | |
| 1047 except: | |
| 1048 testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST') | |
| 1049 try: | |
| 1050 ppos = head.index('P') | |
| 1051 except: | |
| 1052 ppos = 8 # patch - for head.index('P') | |
| 1053 snppos = head.index('SNP') | |
| 1054 chrpos = head.index('CHR') | |
| 1055 logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos)) | |
| 1056 elif len(ll) >= ppos: # full line | |
| 1057 ps = ll[ppos].upper() | |
| 1058 rs = ll[snppos] | |
| 1059 chrom = ll[chrpos] | |
| 1060 test = ll[testpos] | |
| 1061 if not hwedict.get(rs,None): | |
| 1062 hwedict[rs] = {} | |
| 1063 markerlist.append(rs) | |
| 1064 chromlist.append(chrom) # one place to find it? | |
| 1065 lpvals = 0 | |
| 1066 if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping | |
| 1067 lpvals = '0' | |
| 1068 if ps <> '1': | |
| 1069 try: | |
| 1070 pval = float(ps) | |
| 1071 lpvals = '%f' % -math.log10(pval) | |
| 1072 except: | |
| 1073 pass | |
| 1074 hwedict[rs][test] = (ps,lpvals) | |
| 1075 else: | |
| 1076 logf.write('short line #%d = %s\n' % (i,ll)) | |
| 1077 f.close() | |
| 1078 # ------------------missing-------------------- | |
| 1079 """lmiss has | |
| 1080 CHR SNP N_MISS N_GENO F_MISS | |
| 1081 1 rs12354060 0 73 0 | |
| 1082 1 rs4345758 1 73 0.0137 | |
| 1083 1 rs2691310 73 73 1 | |
| 1084 1 rs2531266 73 73 1 | |
| 1085 # we want F_MISS""" | |
| 1086 try: | |
| 1087 f = open(lmissfile,'r') | |
| 1088 except: | |
| 1089 f = None | |
| 1090 if f: | |
| 1091 for i,line in enumerate(f): | |
| 1092 ll = line.strip().split() | |
| 1093 if i == 0: | |
| 1094 head = [x.upper() for x in ll] # expect above | |
| 1095 fracpos = head.index('F_MISS') | |
| 1096 npos = head.index('N_MISS') | |
| 1097 snppos = head.index('SNP') | |
| 1098 elif len(ll) >= fracpos: # full line | |
| 1099 rs = ll[snppos] | |
| 1100 fracval = ll[fracpos] | |
| 1101 lmissdict[rs] = fracval # for now, just that? | |
| 1102 else: | |
| 1103 lmissdict[rs] = 'NA' | |
| 1104 f.close() | |
| 1105 # ------------------freq------------------- | |
| 1106 # frq has CHR SNP A1 A2 MAF NM | |
| 1107 # we want maf | |
| 1108 try: | |
| 1109 f = open(freqfile,'r') | |
| 1110 except: | |
| 1111 f = None | |
| 1112 if f: | |
| 1113 for i,line in enumerate(f): | |
| 1114 ll = line.strip().split() | |
| 1115 if i == 0: | |
| 1116 head = [x.upper() for x in ll] # expect above | |
| 1117 mafpos = head.index('MAF') | |
| 1118 a1pos = head.index('A1') | |
| 1119 a2pos = head.index('A2') | |
| 1120 snppos = head.index('SNP') | |
| 1121 elif len(ll) >= mafpos: # full line | |
| 1122 rs = ll[snppos] | |
| 1123 a1 = ll[a1pos] | |
| 1124 a2 = ll[a2pos] | |
| 1125 maf = ll[mafpos] | |
| 1126 freqdict[rs] = (maf,a1,a2) | |
| 1127 f.close() | |
| 1128 # ------------------mend------------------- | |
| 1129 # lmend has CHR SNP N | |
| 1130 # we want N | |
| 1131 gotmend = True | |
| 1132 try: | |
| 1133 f = open(lmendfile,'r') | |
| 1134 except: | |
| 1135 gotmend = False | |
| 1136 for rs in markerlist: | |
| 1137 lmenddict[rs] = '0' | |
| 1138 if gotmend: | |
| 1139 for i,line in enumerate(f): | |
| 1140 ll = line.strip().split() | |
| 1141 if i == 0: | |
| 1142 head = [x.upper() for x in ll] # expect above | |
| 1143 npos = head.index('N') | |
| 1144 snppos = head.index('SNP') | |
| 1145 elif len(ll) >= npos: # full line | |
| 1146 rs = ll[snppos] | |
| 1147 nmend = ll[npos] | |
| 1148 lmenddict[rs] = nmend | |
| 1149 f.close() | |
| 1150 else: | |
| 1151 logf.write('No %s file - assuming not family data\n' % lmendfile) | |
| 1152 # now assemble result list | |
| 1153 rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] | |
| 1154 res = [] | |
| 1155 fres = [] | |
| 1156 for rs in markerlist: # for each snp in found order | |
| 1157 f_missing = lmissdict.get(rs,'NA') | |
| 1158 maf,a1,a2 = freqdict.get(rs,('NA','NA','NA')) | |
| 1159 hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change... | |
| 1160 hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA')) | |
| 1161 nmend = lmenddict.get(rs,'NA') | |
| 1162 (chrom,offset)=mapdict.get(rs,('?','0')) | |
| 1163 res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend]) | |
| 1164 ntokeep = max(10,len(res)/keepfrac) | |
| 1165 | |
| 1166 def msortk(item=None): | |
| 1167 """ | |
| 1168 deal with non numeric sorting | |
| 1169 """ | |
| 1170 try: | |
| 1171 return float(item) | |
| 1172 except: | |
| 1173 return item | |
| 1174 | |
| 1175 for i,col in enumerate(Tsorts): | |
| 1176 res.sort(key=msortk(lambda x:x[col])) | |
| 1177 if Treverse[i]: | |
| 1178 res.reverse() | |
| 1179 repname = Tnames[i] | |
| 1180 Tops[repname] = res[0:ntokeep] | |
| 1181 Tops[repname].insert(0,rhead) | |
| 1182 res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order | |
| 1183 res.insert(0,rhead) | |
| 1184 f = open(outfile,'w') | |
| 1185 f.write('\n'.join(['\t'.join(x) for x in res])) | |
| 1186 f.close() | |
| 1187 return res,Tops | |
| 1188 | |
| 1189 | |
| 1190 | |
| 1191 | |
| 1192 def getfSize(fpath,outpath): | |
| 1193 """ | |
| 1194 format a nice file size string | |
| 1195 """ | |
| 1196 size = '' | |
| 1197 fp = os.path.join(outpath,fpath) | |
| 1198 if os.path.isfile(fp): | |
| 1199 n = float(os.path.getsize(fp)) | |
| 1200 if n > 2**20: | |
| 1201 size = ' (%1.1f MB)' % (n/2**20) | |
| 1202 elif n > 2**10: | |
| 1203 size = ' (%1.1f KB)' % (n/2**10) | |
| 1204 elif n > 0: | |
| 1205 size = ' (%d B)' % (int(n)) | |
| 1206 return size | |
| 1207 | |
| 1208 | |
| 1209 if __name__ == "__main__": | |
| 1210 u = """ called in xml as | |
| 1211 <command interpreter="python"> | |
| 1212 rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix" | |
| 1213 -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink' | |
| 1214 -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R' | |
| 1215 </command> | |
| 1216 | |
| 1217 Prepare a qc report - eg: | |
| 1218 print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog) | |
| 1219 | |
| 1220 """ | |
| 1221 progname = os.path.basename(sys.argv[0]) | |
| 1222 if len(sys.argv) < 9: | |
| 1223 print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv) | |
| 1224 sys.exit(1) | |
| 1225 parser = OptionParser(usage=u, version="%prog 0.01") | |
| 1226 a = parser.add_option | |
| 1227 a("-i","--infile",dest="infile") | |
| 1228 a("-o","--oprefix",dest="opref") | |
| 1229 a("-l","--plinkexe",dest="plinke", default=plinke) | |
| 1230 a("-r","--rexe",dest="rexe", default=rexe) | |
| 1231 a("-s","--snps",dest="htmlf") | |
| 1232 #a("-m","--markerRaw",dest="markf") | |
| 1233 #a("-r","--rawsubject",dest="subjf") | |
| 1234 a("-p","--patho",dest="newfpath") | |
| 1235 (options,args) = parser.parse_args() | |
| 1236 basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below | |
| 1237 killme = string.punctuation + string.whitespace | |
| 1238 trantab = string.maketrans(killme,'_'*len(killme)) | |
| 1239 opref = options.opref.translate(trantab) | |
| 1240 alogh,alog = tempfile.mkstemp(suffix='.txt') | |
| 1241 plogh,plog = tempfile.mkstemp(suffix='.txt') | |
| 1242 alogf = open(alog,'w') | |
| 1243 plogf = open(plog,'w') | |
| 1244 ahtmlf = options.htmlf | |
| 1245 amarkf = 'MarkerDetails_%s.xls' % opref | |
| 1246 asubjf = 'SubjectDetails_%s.xls' % opref | |
| 1247 newfpath = options.newfpath | |
| 1248 newfpath = os.path.realpath(newfpath) | |
| 1249 try: | |
| 1250 os.makedirs(newfpath) | |
| 1251 except: | |
| 1252 pass | |
| 1253 ofn = basename | |
| 1254 bfn = options.infile | |
| 1255 try: | |
| 1256 mapf = '%s.bim' % bfn | |
| 1257 maplist = file(mapf,'r').readlines() | |
| 1258 maplist = [x.split() for x in maplist] | |
| 1259 except: | |
| 1260 maplist = None | |
| 1261 alogf.write('## error - cannot open %s to read map - no offsets will be available for output files') | |
| 1262 #rerla@beast galaxy]$ head test-data/tinywga.bim | |
| 1263 #22 rs2283802 0 21784722 4 2 | |
| 1264 #22 rs2267000 0 21785366 4 2 | |
| 1265 rgbin = os.path.split(rexe)[0] # get our rg bin path | |
| 1266 #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug! | |
| 1267 # if we could, do all at once? Nope. Probably never. | |
| 1268 plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'], | |
| 1269 ['--mendel',],['--check-sex',]] | |
| 1270 vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0'] | |
| 1271 runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase) | |
| 1272 plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename], | |
| 1273 ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)], | |
| 1274 ['--bfile ldp_%s --het --out %s' % (basename,basename)]] | |
| 1275 # subset of ld independent markers for eigenstrat and other requirements | |
| 1276 vclbase = [options.plinke,'--noweb'] | |
| 1277 plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase) | |
| 1278 plogf.write('\n'.join(plogout)) | |
| 1279 plogf.write('\n') | |
| 1280 repout = os.path.join(newfpath,basename) | |
| 1281 subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath, | |
| 1282 logf=alogf) # writes the subject_froot.xls file | |
| 1283 markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath, | |
| 1284 logf=alogf,maplist=maplist) # marker_froot.xls | |
| 1285 nbreaks = 100 | |
| 1286 s = '## starting plotpage, newfpath=%s,m=%s,s=%s/n' % (newfpath,markers[:2],subjects[:2]) | |
| 1287 alogf.write(s) | |
| 1288 print s | |
| 1289 plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath, | |
| 1290 basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin) | |
| 1291 #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe) | |
| 1292 | |
| 1293 # [titles[n],plotnames[n],outfnames[n] ] | |
| 1294 html = [] | |
| 1295 html.append('<table cellpadding="5" border="0">') | |
| 1296 size = getfSize(amarkf,newfpath) | |
| 1297 html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \ | |
| 1298 (amarkf,'Click here to download the Marker QC Detail report file',size)) | |
| 1299 size = getfSize(asubjf,newfpath) | |
| 1300 html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \ | |
| 1301 (asubjf,'Click here to download the Subject QC Detail report file',size)) | |
| 1302 for (title,url,ofname) in plotpage: | |
| 1303 ttitle = 'Ranked %s' % title | |
| 1304 dat = subjectTops.get(ttitle,None) | |
| 1305 if not dat: | |
| 1306 dat = markerTops.get(ttitle,None) | |
| 1307 imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf | |
| 1308 thumbnail = os.path.join(newfpath,imghref) | |
| 1309 if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack | |
| 1310 imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg | |
| 1311 thumbnail = os.path.join(newfpath,imghref) | |
| 1312 if not os.path.exists(thumbnail): | |
| 1313 html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title)) | |
| 1314 else: | |
| 1315 html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \ | |
| 1316 % (url,imghref,title)) | |
| 1317 if dat: # one or the other - write as an extra file and make a link here | |
| 1318 t = '%s.xls' % (ttitle.replace(' ','_')) | |
| 1319 fname = os.path.join(newfpath,t) | |
| 1320 f = file(fname,'w') | |
| 1321 f.write('\n'.join(['\t'.join(x) for x in dat])) # the report | |
| 1322 size = getfSize(t,newfpath) | |
| 1323 html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size)) | |
| 1324 else: | |
| 1325 html.append('</a></td><td>%s</td><td> </td></tr>' % (title)) | |
| 1326 html.append('</table><hr><h3>All output files from the QC run are available below</h3>') | |
| 1327 html.append('<table cellpadding="5" border="0">\n') | |
| 1328 flist = os.listdir(newfpath) # we want to catch 'em all | |
| 1329 flist.sort() | |
| 1330 for f in flist: | |
| 1331 fname = os.path.split(f)[-1] | |
| 1332 size = getfSize(fname,newfpath) | |
| 1333 html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size)) | |
| 1334 html.append('</table>') | |
| 1335 alogf.close() | |
| 1336 plogf.close() | |
| 1337 llog = open(alog,'r').readlines() | |
| 1338 plogfile = open(plog,'r').readlines() | |
| 1339 os.unlink(alog) | |
| 1340 os.unlink(plog) | |
| 1341 llog += plogfile # add lines from pruneld log | |
| 1342 lf = file(ahtmlf,'w') # galaxy will show this as the default view | |
| 1343 lf.write(galhtmlprefix % progname) | |
| 1344 s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow()) | |
| 1345 lf.write('<h4>%s</h4>\n' % s) | |
| 1346 lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n') | |
| 1347 lf.write('\n'.join(html)) | |
| 1348 lf.write('<h4>QC run log contents</h4>') | |
| 1349 lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs | |
| 1350 if len(cruft) > 0: | |
| 1351 lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup | |
| 1352 lf.write('%s\n<hr>\n' % galhtmlpostfix) | |
| 1353 lf.close() | |
| 1354 |
