Mercurial repository: xuebing / sharplabtool
File: tools/rgenetics/rgQC.py @ changeset 2:c2a356708570 ("Uploaded")

author   | xuebing
date     | Fri, 09 Mar 2012 19:45:42 -0500
parents  | 9071e359b9a3
children | (none)
# oct 15 rpy replaced - temp fix until we get gnuplot working
# rpy deprecated - replace with RRun
# fixes to run functional test! oct 1 2009
# needed to expand our path with os.path.realpath to get newpath working with
# all the fancy pdfnup stuff
# and a fix to pruneld to write output to where it should be
# smallish data in test-data/smallwga in various forms
# python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
# child files are deprecated and broken as at july 19 2009
# need to move them to the html file extrafiles path
# found lots of corner cases with some illumina data where cnv markers were
# included
# and where affection status was all missing !
# added links to tab files showing worst 1/keepfrac markers and subjects
# ross lazarus january 2008
#
# added named parameters
# to ensure no silly slippages if non required parameter in the most general case
# some potentially useful things here reusable in complex scripts
# with lots'o'html (TM)
# aug 17 2007 rml
#
# added marker and subject and parenting april 14 rml
# took a while to get the absolute paths right for all the file munging
# as of april 16 seems to work..
# getting galaxy to serve images in html reports is a little tricky
# we don't want QC reports to be dozens of individual files, so need
# to use the url /static/rg/... since galaxy's web server will happily serve images
# from there
# galaxy passes output files as relative paths
# these have to be munged by rgQC.py before calling this
# galaxy will pass in 2 file names - one for the log
# and one for the final html report
# of the form './database/files/dataset_66.dat'
# we need to be working in that directory so our plink output files are there
# so these have to be munged by rgQC.py before calling this
# note no ped file passed so had to remove the -l option
# for plinkParse.py that makes a heterozygosity report from the ped
# file - needs fixing...
# new: importing manhattan/qqplot plotter
#
# def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
#     """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
#     contains some R scripts as text strings - we substitute defaults into the calls
#     to make them do our bidding - and save the resulting code for posterity
#     this can be called externally, I guess...for QC eg?
#     """
#
#     rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
#     rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
#     return rlog,flist

from optparse import OptionParser
import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
from os.path import abspath
from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
import rgManQQ

prog = os.path.split(sys.argv[0])[1]
vers = '0.4 april 2009 rml'
idjoiner = '_~_~_' # need something improbable..
# many of these may need fixing for a new install
myversion = vers
keepfrac = 20 # fraction to keep after sorting by each interesting value
missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
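
# --- Illustrative sketch (editor's addition, not part of the original tool) ---
# The plotting helpers below repeatedly thin very large vectors before plotting:
# keep every skip-th point, but always keep the first `always` points so the most
# extreme (most interesting) values are never dropped. A minimal standalone
# version of that idea, assuming the input arrives sorted with the extreme values
# first; the name thin_sorted is hypothetical:
def thin_sorted(values, maxlen=3000, always=1000):
    """Subsample to roughly maxlen points, always retaining the first `always` entries."""
    n = len(values)
    if n <= maxlen:
        return list(values)
    always = min(always, n // 20)          # oversample at most the lowest 5%
    skip = max(2, int(n / float(maxlen)))  # take 1 in skip elsewhere
    return [v for i, v in enumerate(values) if i < always or i % skip == 0]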
mogresize = "x300" # this controls the width for jpeg thumbnails


def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
    """
    marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
    'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
    subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
    """

    def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
        """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
        # generic histogram and vertical boxplot in a 3:1 layout
        # returns the graphic file name for inclusion in the web page
        # broken out here for reuse
        # ross lazarus march 2007
        """
        screenmat = (1,2,1,2) # create a 2x2 canvas
        widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
        rpy.r.pdf( outfname, height , width )
        #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
        m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
        # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
        rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
        maint = 'QC for %s - %s' % (basename,title)
        rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
        rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
        rpy.r.dev_off()

    def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
        """ Useful to see what various cutoffs yield - plot percentiles
        """
        n = len(plotme)
        maxveclen = 1000.0 # for reasonable pdf sizes!
        yvec = copy.copy(plotme) # arrives already in descending order of importance missingness or mendel count by subj or marker
        xvec = range(n)
        xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
        # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
        if n > maxveclen: # oversample part of the distribution
            always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
            skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
            samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
            yvec = [yvec[i] for i in samplei] # always get first and last
            xvec = [xvec[i] for i in samplei] # always get first and last
        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
        rpy.r.pdf( outfname, height , width )
        maint = 'QC for %s - %s' % (basename,title)
        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
        rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        rpy.r.dev_off()

    def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """ y is data for a qq plot and ends up on the x axis go figure
        if sampling, oversample low values - all the top 1% ?
        this version called with -log10 transformed hwe
        """
        nrows = len(plotme)
        fn = float(nrows)
        xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        maxveclen = 3000
        yvec = copy.copy(plotme)
        if nrows > maxveclen:
            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
            # oversample part of the distribution
            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
            samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)] # always oversample first sorted (here lowest) values
            yvec = [yvec[i] for i in samplei] # always get first and last
            xvec = [xvec[i] for i in samplei] # and sample xvec same way
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
        rpy.r.pdf( outfname, height , width )
        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
        rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
        rpy.r.points(mx,mx,type='l')
        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        rpy.r.dev_off()

    def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
        """
        data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to
        create a grid of qq plots to show departure from null at extremes of data quality
        Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
        and ends up on the x axis go figure
        if sampling, oversample low values - all the top 1% ?
        """
        data = copy.copy(plotme)
        nvals = len(data)
        stepsize = nvals/nsplits
        logstep = math.log10(stepsize) # so is 3 for steps of 1000
        quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
        data.sort(key=operator.itemgetter(1)) # into x order
        rpy.r.pdf( outfname, height , width )
        rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
        yvec = [-math.log10(random.random()) for x in range(stepsize)]
        yvec.sort() # size of each step is expected range for xvec under null?!
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        for rowstart in quints:
            rowend = rowstart + stepsize
            if nvals - rowend < stepsize: # finish last split
                rowend = nvals
            row = data[rowstart:rowend]
            row.sort(key=operator.itemgetter(2)) # into y order
            for colstart in quints:
                colend = colstart + stepsize
                if nvals - colend < stepsize: # finish last split
                    colend = nvals
                cell = row[colstart:colend]
                xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
                rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
                rpy.r.points([0,logstep],[0,logstep],type='l') # null line
        rpy.r.dev_off()

    def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """ y is data for a qqnorm plot
        if sampling, oversample low values - all the top 1% ?
        """
        rangeunif = len(plotme)
        nunif = 1000
        maxveclen = 3000
        nrows = len(plotme)
        data = copy.copy(plotme)
        if nrows > maxveclen:
            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
            # oversample part of the distribution
            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
            skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
            if skip < 2:
                skip = 2
            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] # always oversample first sorted (here lowest) values
            yvec = [data[i] for i in samplei] # always get first and last
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            yvec = data
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        n = 1000
        ylab = '%s' % xlabname
        xlab = 'Normal'
        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
        rpy.r.pdf( outfname, height , width )
        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
        rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        rpy.r.dev_off()

    def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """ layout qq plots for pvalues within rows of increasing MAF and columns of
        increasing missingness like the GAIN qc tools
        y is data for a qq plot and ends up on the x axis go figure
        if sampling, oversample low values - all the top 1% ?
        """
        rangeunif = len(plotme)
        nunif = 1000
        fn = float(rangeunif)
        xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
        skip = max(int(rangeunif/fn),1) # force include last points
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        maxveclen = 2000
        nrows = len(plotme)
        data = copy.copy(plotme)
        data.sort() # low to high - oversample low values
        if nrows > maxveclen:
            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
            # oversample part of the distribution
            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] # always oversample first sorted (here lowest) values
            yvec = [data[i] for i in samplei] # always get first and last
            xvec = [xvec[i] for i in samplei] # and sample xvec same way
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            yvec = data
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        n = 1000
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
        rpy.r.pdf( outfname, height , width )
        rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
        rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
        rpy.r.points(mx,mx,type='l')
        rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        rpy.r.dev_off()

    fdsto,stofile = tempfile.mkstemp()
    sto = open(stofile,'w')
    import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
    mog = 'mogrify'
    pdfnup = 'pdfnup'
    pdfjoin = 'pdfjoin'
    shead = subjects.pop(0) # get rid of head
    mhead = markers.pop(0)
    maf = mhead.index('maf')
    missfrac = mhead.index('missfrac')
    logphweall = mhead.index('logp_hwe_all')
    logphweunaff = mhead.index('logp_hwe_unaff') # check for at least some unaffected rml june 2009
    m_mendel = mhead.index('N_Mendel')
    fracmiss = shead.index('FracMiss')
    s_mendel = shead.index('Mendel_errors')
    s_het = shead.index('F_Stat')
    params = {}
    hweres = [float(x[logphweunaff]) for x in markers if len(x) > logphweunaff and x[logphweunaff].upper() != 'NA']
    if len(hweres) != 0:
        xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] # plot for each of these cols
    else: # try hwe all instead - maybe no affection status available
        xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
    ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
    oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
    qqplotme = [1,0,0,0,0,0,0] #
    qnplotme = [0,0,0,0,0,0,1] #
    nplots = len(xs)
    xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
                 'Marker Mendel Error Count', 'Missing Rate: Subjects',
                 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
    plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
    ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
    ordplotnames = ['%s_cum' % x for x in plotnames]
    ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
    outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
    ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
    datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
    titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
              "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
    html = []
    pdflist = []
    for n,column in enumerate(xs):
        dat = [float(x[column]) for x in datasources[n] if len(x) >= column and x[column][:2].upper() != 'NA'] # plink gives both!
        if sum(dat) != 0: # eg nada for mendel if case control?
            rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
                  title=titles[n],basename=basename,nbreaks=nbreaks)
            row = [titles[n],ploturls[n],outfnames[n] ]
            html.append(row)
            pdflist.append(outfnames[n])
            if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
                otitle = 'Ranked %s' % titles[n]
                dat.sort()
                if oreverseme[n]:
                    dat.reverse()
                rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
                     title=otitle,basename=basename,nbreaks=1000)
                row = [otitle,ordploturls[n],ordoutfnames[n]]
                html.append(row)
                pdflist.append(ordoutfnames[n])
            if qqplotme[n]: #
                otitle = 'LogQQ plot %s' % titles[n]
                dat.sort()
                dat.reverse()
                ofn = os.path.split(ordoutfnames[n])
                ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
                ofu = os.path.split(ordploturls[n])
                ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
                rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
                    title=otitle,basename=basename)
                row = [otitle,ofu,ofn]
                html.append(row)
                pdflist.append(ofn)
            elif qnplotme[n]:
                otitle = 'F Statistic %s' % titles[n]
                dat.sort()
                dat.reverse()
                ofn = os.path.split(ordoutfnames[n])
                ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
                ofu = os.path.split(ordploturls[n])
                ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
                rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
                        title=otitle,basename=basename)
                row = [otitle,ofu,ofn]
                html.append(row)
                pdflist.append(ofn)
        else:
            print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
    if nup > 0:
        # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
        # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
        filestojoin = ' '.join(pdflist) # all the file names so far
        afname = '%s_All_Paged.pdf' % (basename)
        fullafname = os.path.join(newfpath,afname)
        expl = 'All %s QC Plots joined into a single pdf' % basename
        vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname) # make single page pdf
        x = subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
        retval = x.wait()
        row = [expl,afname,fullafname]
        html.insert(0,row) # last rather than second
        nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
        fullnfname = os.path.join(newfpath,nfname)
        expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
        vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname) # make nup x nup pages
        x = subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
        retval = x.wait()
        row = [expl,nfname,fullnfname]
        html.insert(1,row) # this goes second
    vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf')) # make thumbnail images
    x = subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
    retval = x.wait()
    sto.close()
    cruft = open(stofile,'r').readlines()
    return html,cruft # elements for an ordered list of urls or whatever..
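
# --- Illustrative sketch (editor's addition, not part of the original tool) ---
# makePlots() finishes by shelling out to pdfjoin, pdfnup and ImageMagick's
# mogrify to collate the individual PDFs and build jpg thumbnails for the HTML
# report. A minimal standalone version of that post-processing step, assuming
# the same three external binaries are on PATH and that `pdfs` lists file names
# inside `outdir`; the name collate_pdfs is hypothetical and it reuses the
# module-level subprocess import:
def collate_pdfs(pdfs, outdir, basename='qc', nup=3, thumbsize='x300'):
    joined = '%s_All_Paged.pdf' % basename             # one plot per page
    paged = '%s_All_%dx%d.pdf' % (basename, nup, nup)  # nup x nup panels per page
    cmds = ['pdfjoin %s --outfile %s' % (' '.join(pdfs), joined),
            'pdfnup %s --nup %dx%d --frame true --outfile %s' % (joined, nup, nup, paged),
            'mogrify -format jpg -resize %s *.pdf' % thumbsize]
    for cmd in cmds:
        subprocess.check_call(cmd, shell=True, cwd=outdir)
    return joined, paged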
def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
    """ nice try but the R scripts are huge and take forever to run if there's a lot of data
    marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
    'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
    subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
    """
    colour = "maroon"

    def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
        """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
        # generic histogram and vertical boxplot in a 3:1 layout
        # returns the graphic file name for inclusion in the web page
        # broken out here for reuse
        # ross lazarus march 2007
        """
        R = []
        maint = 'QC for %s - %s' % (basename,title)
        screenmat = (1,2,1,2) # create a 2x2 canvas
        widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
        R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
        R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
        R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
        R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
        R.append('dev.off()')
        return R

    def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
        """ Useful to see what various cutoffs yield - plot percentiles
        """
        R = []
        n = len(plotme)
        R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme) # arrives already in descending order of importance missingness or mendel count by subj or marker
        maint = 'QC for %s - %s' % (basename,title)
        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
        R.append("par(lab=c(10,10,10))")
        R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
        R.append('dev.off()')
        return R

    def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
        """ y is data for a qq plot and ends up on the x axis go figure
        if sampling, oversample low values - all the top 1% ?
        this version called with -log10 transformed hwe
        """
        R = []
        nrows = len(plotme)
        fn = float(nrows)
        xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
        #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        maxveclen = 3000
        yvec = copy.copy(plotme)
        if nrows > maxveclen:
            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
            # oversample part of the distribution
            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
            if skip < 2:
                skip = 2
            samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)] # always oversample first sorted (here lowest) values
            yvec = [yvec[i] for i in samplei] # always get first and last
            xvec = [xvec[i] for i in samplei] # and sample xvec same way
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
        x = ['%f' % x for x in xvec]
        R.append('xvec = c(%s)' % ','.join(x))
        y = ['%f' % x for x in yvec]
        R.append('yvec = c(%s)' % ','.join(y))
        R.append('mx = c(0,%f)' % (math.log10(fn)))
        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
        R.append("par(lab=c(10,10,10))")
        R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
        R.append('points(mx,mx,type="l")')
        R.append('grid(col="lightgray",lty="dotted")')
        R.append('dev.off()')
        return R

    def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
        """
        data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to
        create a grid of qq plots to show departure from null at extremes of data quality
        Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
        and ends up on the x axis go figure
        if sampling, oversample low values - all the top 1% ?
        """
        R = [] # R script lines accumulated here
        data = copy.copy(plotme)
        nvals = len(data)
        stepsize = nvals/nsplits
        logstep = math.log10(stepsize) # so is 3 for steps of 1000
        R.append('mx = c(0,%f)' % logstep)
        quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
        data.sort(key=operator.itemgetter(1)) # into x order
        #rpy.r.pdf( outfname, h , w )
        #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
        R.append('par(mfrow = c(%d,%d))' % (nsplits,nsplits)) # one qq panel per cell in an nsplits x nsplits grid
        yvec = [-math.log10(random.random()) for x in range(stepsize)]
        yvec.sort() # size of each step is expected range for xvec under null?!
        y = ['%f' % x for x in yvec]
        R.append('yvec = c(%s)' % ','.join(y))
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        maint = 'QC for %s - %s' % (basename,title)
        for rowstart in quints:
            rowend = rowstart + stepsize
            if nvals - rowend < stepsize: # finish last split
                rowend = nvals
            row = data[rowstart:rowend]
            row.sort(key=operator.itemgetter(2)) # into y order
            for colstart in quints:
                colend = colstart + stepsize
                if nvals - colend < stepsize: # finish last split
                    colend = nvals
                cell = row[colstart:colend]
                xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
                x = ['%f' % x for x in xvec]
                R.append('xvec = c(%s)' % ','.join(x))
                R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
                R.append('points(mx,mx,type="l")')
                R.append('grid(col="lightgray",lty="dotted")')
                #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
                #rpy.r.points(c(0,logstep),c(0,logstep),type='l')
        R.append('dev.off()')
        #rpy.r.dev_off()
        return R

    def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """ y is data for a qqnorm plot
        if sampling, oversample low values - all the top 1% ?
        """
        R = [] # R script lines accumulated here
        rangeunif = len(plotme)
        nunif = 1000
        maxveclen = 3000
        nrows = len(plotme)
        data = copy.copy(plotme)
        if nrows > maxveclen:
            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
            # oversample part of the distribution
            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
            skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
            if skip < 2:
                skip = 2
            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] # always oversample first sorted (here lowest) values
            yvec = [data[i] for i in samplei] # always get first and last
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            yvec = data
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        n = 1000
        ylab = '%s' % xlabname
        xlab = 'Normal'
        # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
        #rpy.r.pdf( outfname, h , w )
        #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
        #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
        #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
        #rpy.r.dev_off()
        y = ['%f' % x for x in yvec]
        R.append('yvec = c(%s)' % ','.join(y))
        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
        R.append("par(lab=c(10,10,10))")
        R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
        R.append('grid(col="lightgray",lty="dotted")')
        R.append('dev.off()')
        return R

    def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
        """ layout qq plots for pvalues within rows of increasing MAF and columns of
        increasing missingness like the GAIN qc tools
        y is data for a qq plot and ends up on the x axis go figure
        if sampling, oversample low values - all the top 1% ?
        """
        R = [] # R script lines accumulated here
        rangeunif = len(plotme)
        nunif = 1000
        fn = float(rangeunif)
        xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
        skip = max(int(rangeunif/fn),1) # force include last points
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        maxveclen = 2000
        nrows = len(plotme)
        data = copy.copy(plotme)
        data.sort() # low to high - oversample low values
        if nrows > maxveclen:
            # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
            # oversample part of the distribution
            always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
            skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
            samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)] # always oversample first sorted (here lowest) values
            yvec = [data[i] for i in samplei] # always get first and last
            xvec = [xvec[i] for i in samplei] # and sample xvec same way
            maint = 'Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
        else:
            yvec = data
            maint = 'Log QQ Plot(n=%d)' % (nrows)
        n = 1000
        mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
        ylab = '%s' % xlabname
        xlab = '-log10(Uniform 0-1)'
        R.append('mx = c(0,%f)' % (math.log10(fn)))
        x = ['%f' % x for x in xvec]
        R.append('xvec = c(%s)' % ','.join(x))
        y = ['%f' % x for x in yvec]
        R.append('yvec = c(%s)' % ','.join(y))
        R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
        R.append("par(lab=c(10,10,10))")
        R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
        R.append('points(mx,mx,type="l")')
        R.append('grid(col="lightgray",lty="dotted")')
        R.append('dev.off()')
        return R

    shead = subjects.pop(0) # get rid of head
    mhead = markers.pop(0)
    maf = mhead.index('maf')
    missfrac = mhead.index('missfrac')
    logphweall = mhead.index('logp_hwe_all')
    logphweunaff = mhead.index('logp_hwe_unaff') # check for at least some unaffected rml june 2009
    m_mendel = mhead.index('N_Mendel')
    fracmiss = shead.index('FracMiss')
    s_mendel = shead.index('Mendel_errors')
    s_het = shead.index('F_Stat')
    params = {}
    h = [float(x[logphweunaff]) for x in markers if len(x) > logphweunaff and x[logphweunaff].upper() != 'NA']
    if len(h) != 0:
        xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het] # plot for each of these cols
    else: # try hwe all instead - maybe no affection status available
        xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
    ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
    oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
    qqplotme = [1,0,0,0,0,0,0] #
    qnplotme = [0,0,0,0,0,0,1] #
    nplots = len(xs)
    xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
                 'Marker Mendel Error Count', 'Missing Rate: Subjects',
                 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
    plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
    ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
    ordplotnames = ['%s_cum' % x for x in plotnames]
    ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
    outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
    ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
    datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
    titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
              "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
    html = []
    pdflist = []
    R = []
    for n,column in enumerate(xs):
        dfn = '%d_%s.txt' % (n,titles[n])
        dfilepath = os.path.join(newfpath,dfn)
        dat = [float(x[column]) for x in datasources[n] if len(x) >= column and x[column][:2].upper() != 'NA'] # plink gives both!
        if sum(dat) != 0: # eg nada for mendel if case control?
            plotme = file(dfilepath,'w')
            plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
            plotme.close()
            tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
                       title=titles[n],basename=basename,nbreaks=nbreaks)
            R += tR
            row = [titles[n],ploturls[n],outfnames[n] ]
            html.append(row)
            pdflist.append(outfnames[n])
            if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
                otitle = 'Ranked %s' % titles[n]
                dat.sort()
                if oreverseme[n]:
                    dat.reverse()
                ndat = len(dat)
                xvec = range(ndat)
                xvec = [100.0*(ndat-x)/ndat for x in xvec] # convert to centile
                yvec = dat
                # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
                maxveclen = 1000.0 # for reasonable pdf sizes!
                if ndat > maxveclen: # oversample part of the distribution
                    always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
                    skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
                    samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
                    yvec = [yvec[i] for i in samplei] # always get first and last
                    xvec = [xvec[i] for i in samplei] # always get first and last
                plotme = file(dfilepath,'w')
                plotme.write('xvec\tyvec\n')
                plotme.write('\n'.join(['%f\t%f' % (x,y) for x,y in zip(xvec,yvec)])) # pass as a file - copout!
                plotme.close()
                tR = rCum(plotme=dfilepath,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
                          title=otitle,basename=basename,nbreaks=nbreaks)
                R += tR
                row = [otitle,ordploturls[n],ordoutfnames[n]]
                html.append(row)
                pdflist.append(ordoutfnames[n])
            if qqplotme[n]: #
                otitle = 'LogQQ plot %s' % titles[n]
                dat.sort()
                dat.reverse()
                ofn = os.path.split(ordoutfnames[n])
                ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
                ofu = os.path.split(ordploturls[n])
                ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
                tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
                         title=otitle,basename=basename)
                R += tR
                row = [otitle,ofu,ofn]
                html.append(row)
                pdflist.append(ofn)
            elif qnplotme[n]:
                otitle = 'F Statistic %s' % titles[n]
                dat.sort()
                dat.reverse()
                ofn = os.path.split(ordoutfnames[n])
                ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
                ofu = os.path.split(ordploturls[n])
                ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
                tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
                             title=otitle,basename=basename)
                R += tR
                row = [otitle,ofu,ofn]
                html.append(row)
                pdflist.append(ofn)
        else:
            print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
    rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
    if nup > 0:
        # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
        # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
        filestojoin = ' '.join(pdflist) # all the file names so far
        afname = '%s_All_Paged.pdf' % (basename)
        fullafname = os.path.join(newfpath,afname)
        expl = 'All %s QC Plots joined into a single pdf' % basename
        vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname) # make single page pdf
        x = subprocess.Popen(vcl,shell=True,cwd=newfpath)
        retval = x.wait()
        row = [expl,afname,fullafname]
        html.insert(0,row) # last rather than second
        nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
        fullnfname = os.path.join(newfpath,nfname)
        expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
        vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname) # make nup x nup pages
        x = subprocess.Popen(vcl,shell=True,cwd=newfpath)
        retval = x.wait()
        row = [expl,nfname,fullnfname]
        html.insert(1,row) # this goes second
    vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf')) # make thumbnail images
    x = subprocess.Popen(vcl,shell=True,cwd=newfpath)
    retval = x.wait()
    return html # elements for an ordered list of urls or whatever..


def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
    """ NO LONGER USED - historical interest
    count het loci for each subject to look for outliers = ? contamination
    assume ped file is linkage format
    need to make a ped file from the bed file we were passed
    """
    vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot]
    # write a recoded ped file from the real .bed file
    p = subprocess.Popen(' '.join(vcl),shell=True)
    retval = p.wait()
    f = open('%s_recode.recode.ped' % froot,'r')
    if not linkageped:
        head = f.next() # throw away header
    hets = [] # simple count of het loci per subject. Expect poisson?
    n = 1
    for l in f:
        n += 1
        ll = l.strip().split()
        if len(ll) > 6:
            iid = idjoiner.join(ll[:2]) # fam_iid
            gender = ll[4]
            alleles = ll[6:]
            nallele = len(alleles)
            nhet = 0
            for i in range(nallele/2):
                a1 = alleles[2*i]
                a2 = alleles[2*i+1]
                if alleles[2*i] != alleles[2*i+1]: # must be het
                    if not missvals.get(a1,None) and not missvals.get(a2,None):
                        nhet += 1
            hets.append((nhet,iid,gender)) # for sorting later
    f.close()
    hets.sort()
    hets.reverse() # biggest nhet now on top
    f = open(outfname ,'w')
    res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
    res.insert(0,'nhetloci\tfamid_iid\tgender')
    res.append('')
    f.write('\n'.join(res))
    f.close()


def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
    """by subject (missingness = .imiss, mendel = .imendel)
    assume replicates have an underscore in family id for hapmap testing
    For sorting, we need floats and integers
    """
    isexfile = '%s.sexcheck' % froot
    imissfile = '%s.imiss' % froot
    imendfile = '%s.imendel' % froot
    ihetfile = '%s.het' % froot
    logf.write('## subject reports starting at %s\n' % timenow())
    outfile = os.path.join(newfpath,outfname)
    idlist = []
    imissdict = {}
    imenddict = {}
    isexdict = {}
    ihetdict = {}
    Tops = {}
    Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
              'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
    Tsorts = [2,3,6,8]
    Treverse = [True,True,True,False] # so first values are worse
    #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
    ## FID IID MISS_PHENO N_MISS N_GENO F_MISS
    ## 1552042370_A 1552042370_A N 5480 549883 0.009966
    ## 1552042410_A 1552042410_A N 1638 549883 0.002979
    # ------------------missing--------------------
    # imiss has FID IID MISS_PHENO N_MISS F_MISS
    # we want F_MISS
    try:
        f = open(imissfile,'r')
    except:
        logf.write('# file %s is missing - talk about irony\n' % imissfile)
        f = None
    if f:
        for n,line in enumerate(f):
            ll = line.strip().split()
            if n == 0:
                head = [x.upper() for x in ll] # expect above
                fidpos = head.index('FID')
                iidpos = head.index('IID')
                fpos = head.index('F_MISS')
            elif len(ll) >= fpos: # full line
                fid = ll[fidpos]
                #if fid.find('_') == -1: # not replicate! - icondb ids have these...
                iid = ll[iidpos]
                fmiss = ll[fpos]
                id = '%s%s%s' % (fid,idjoiner,iid)
                imissdict[id] = fmiss
                idlist.append(id)
        f.close()
    logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
    # ------------------mend-------------------
    # *.imendel has FID IID N
    # we want N
    gotmend = True
    try:
        f = open(imendfile,'r')
    except:
        gotmend = False
        for id in idlist:
            imenddict[id] = '0'
    if gotmend:
        for n,line in enumerate(f):
            ll = line.strip().split()
            if n == 0:
                head = [x.upper() for x in ll] # expect above
                npos = head.index('N')
                fidpos = head.index('FID')
                iidpos = head.index('IID')
            elif len(ll) >= npos: # full line
                fid = ll[fidpos]
                iid = ll[iidpos]
                id = '%s%s%s' % (fid,idjoiner,iid)
                nmend = ll[npos]
                imenddict[id] = nmend
        f.close()
    else:
        logf.write('## error No %s file - assuming not family data\n' % imendfile)
    # ------------------sex check------------------
    #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
    # sexcheck has FID IID PEDSEX SNPSEX STATUS F
    ##
    ## FID     Family ID
    ## IID     Individual ID
    ## PEDSEX  Sex as determined in pedigree file (1=male, 2=female)
    ## SNPSEX  Sex as determined by X chromosome
    ## STATUS  Displays "PROBLEM" or "OK" for each individual
    ## F       The actual X chromosome inbreeding (homozygosity) estimate
    ##
    ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
    ## ambiguous with regard to sex.
    ## A male call is made if F is more than 0.8; a female call is made if F is less than 0.2.
    try:
        f = open(isexfile,'r')
        got_sexcheck = 1
    except:
        got_sexcheck = 0
    if got_sexcheck:
        for n,line in enumerate(f):
            ll = line.strip().split()
            if n == 0:
                head = [x.upper() for x in ll] # expect above
                fidpos = head.index('FID')
                iidpos = head.index('IID')
                pedsexpos = head.index('PEDSEX')
                snpsexpos = head.index('SNPSEX')
                statuspos = head.index('STATUS')
                fpos = head.index('F')
            elif len(ll) >= fpos: # full line
                fid = ll[fidpos]
                iid = ll[iidpos]
                fest = ll[fpos]
                pedsex = ll[pedsexpos]
                snpsex = ll[snpsexpos]
                stat = ll[statuspos]
                id = '%s%s%s' % (fid,idjoiner,iid)
                isexdict[id] = (pedsex,snpsex,stat,fest)
        f.close()
    else: # this only happens if there are no subjects!
        logf.write('No %s file - assuming no sex errors' % isexfile)
    ##
    ## FID IID O(HOM) E(HOM) N(NM) F
    ## 457 2 490665 4.928e+05 722154 -0.009096
    ## 457 3 464519 4.85e+05 710986 -0.0908
    ## 1037 2 461632 4.856e+05 712025 -0.106
    ## 1037 1 491845 4.906e+05 719353 0.005577
    try:
        f = open(ihetfile,'r')
    except:
        f = None
        logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
    if f:
        for i,line in enumerate(f):
            ll = line.strip().split()
            if i == 0:
                head = [x.upper() for x in ll] # expect above
                fidpos = head.index('FID')
                iidpos = head.index('IID')
                fpos = head.index('F')
                n = 0
            elif len(ll) >= fpos: # full line
                fid = ll[fidpos]
                iid = ll[iidpos]
                fhet = ll[fpos]
                id = '%s%s%s' % (fid,idjoiner,iid)
                ihetdict[id] = fhet
        f.close()
    # now assemble and output result list
    rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
    res = []
    fres = [] # floats for sorting
    for id in idlist: # for each subject in found order
        fid,iid = id.split(idjoiner) # recover keys
        f_missing = imissdict.get(id,'0.0')
        nmend = imenddict.get(id,'0')
        (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
        fhet = ihetdict.get(id,'0.0')
        res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
        try:
            ff_missing = float(f_missing)
        except:
            ff_missing = 0.0
        try:
            inmend = int(nmend)
        except:
            inmend = 0
        try:
            ffest = float(fest)
        except:
            ffest = 0.0
        try:
            ffhet = float(fhet)
        except:
            ffhet = 0.0
        fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
    ntokeep = max(20,len(res)/keepfrac)
    for i,col in enumerate(Tsorts):
        fres.sort(key=operator.itemgetter(col))
        if Treverse[i]:
            fres.reverse()
        repname = Tnames[i]
        Tops[repname] = fres[0:ntokeep]
        Tops[repname] = [map(str,x) for x in Tops[repname]]
        Tops[repname].insert(0,rhead)
    res.sort()
    res.insert(0,rhead)
    logf.write('### writing %s report with %s' % ( outfile,res[0]))
    f = open(outfile,'w')
    f.write('\n'.join(['\t'.join(x) for x in res]))
    f.write('\n')
    f.close()
    return res,Tops


def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
    """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
    keep a list of marker order but keep all stats in dicts
    write out a fake xls file for R or SAS etc
    kinda clunky, but..
    TODO: ensure stable if any file not found?
""" mapdict = {} if maplist <> None: rslist = [x[1] for x in maplist] offset = [(x[0],x[3]) for x in maplist] mapdict = dict(zip(rslist,offset)) hwefile = '%s.hwe' % froot lmissfile = '%s.lmiss' % froot freqfile = '%s.frq' % froot lmendfile = '%s.lmendel' % froot outfile = os.path.join(newfpath,outfname) markerlist = [] chromlist = [] hwedict = {} lmissdict = {} freqdict = {} lmenddict = {} Tops = {} Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel'] Tsorts = [3,6,10,11] Treverse = [False,True,True,True] # so first values are worse(r) #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend]) #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel'] # -------------------hwe-------------------------- # hwe has SNP TEST GENO O(HET) E(HET) P_HWD # we want all hwe where P_HWD <> NA # ah changed in 1.04 to ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644 ## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA ## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644 ## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975 ## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA ## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975 ## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067 ## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA ## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067 logf.write('## marker reports starting at %s\n' % timenow()) try: f = open(hwefile,'r') except: f = None logf.write('## error - no hwefile %s found\n' % hwefile) if f: for i,line in enumerate(f): ll = line.strip().split() if i == 0: # head head = [x.upper() for x in ll] # expect above try: testpos = head.index('TEST') except: testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST') try: ppos = head.index('P') except: ppos = 8 # patch - for head.index('P') snppos = head.index('SNP') chrpos = head.index('CHR') logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos)) elif len(ll) >= ppos: # full line ps = ll[ppos].upper() rs = ll[snppos] chrom = ll[chrpos] test = ll[testpos] if not hwedict.get(rs,None): hwedict[rs] = {} markerlist.append(rs) chromlist.append(chrom) # one place to find it? lpvals = 0 if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping lpvals = '0' if ps <> '1': try: pval = float(ps) lpvals = '%f' % -math.log10(pval) except: pass hwedict[rs][test] = (ps,lpvals) else: logf.write('short line #%d = %s\n' % (i,ll)) f.close() # ------------------missing-------------------- """lmiss has CHR SNP N_MISS N_GENO F_MISS 1 rs12354060 0 73 0 1 rs4345758 1 73 0.0137 1 rs2691310 73 73 1 1 rs2531266 73 73 1 # we want F_MISS""" try: f = open(lmissfile,'r') except: f = None if f: for i,line in enumerate(f): ll = line.strip().split() if i == 0: head = [x.upper() for x in ll] # expect above fracpos = head.index('F_MISS') npos = head.index('N_MISS') snppos = head.index('SNP') elif len(ll) >= fracpos: # full line rs = ll[snppos] fracval = ll[fracpos] lmissdict[rs] = fracval # for now, just that? 
            else:
                lmissdict[rs] = 'NA'
        f.close()
    # ------------------freq-------------------
    # frq has CHR SNP A1 A2 MAF NM
    # we want maf
    try:
        f = open(freqfile,'r')
    except:
        f = None
    if f:
        for i,line in enumerate(f):
            ll = line.strip().split()
            if i == 0:
                head = [x.upper() for x in ll] # expect above
                mafpos = head.index('MAF')
                a1pos = head.index('A1')
                a2pos = head.index('A2')
                snppos = head.index('SNP')
            elif len(ll) >= mafpos: # full line
                rs = ll[snppos]
                a1 = ll[a1pos]
                a2 = ll[a2pos]
                maf = ll[mafpos]
                freqdict[rs] = (maf,a1,a2)
        f.close()
    # ------------------mend-------------------
    # lmend has CHR SNP N
    # we want N
    gotmend = True
    try:
        f = open(lmendfile,'r')
    except:
        gotmend = False
        for rs in markerlist:
            lmenddict[rs] = '0'
    if gotmend:
        for i,line in enumerate(f):
            ll = line.strip().split()
            if i == 0:
                head = [x.upper() for x in ll] # expect above
                npos = head.index('N')
                snppos = head.index('SNP')
            elif len(ll) >= npos: # full line
                rs = ll[snppos]
                nmend = ll[npos]
                lmenddict[rs] = nmend
        f.close()
    else:
        logf.write('No %s file - assuming not family data\n' % lmendfile)
    # now assemble result list
    rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
    res = []
    fres = []
    for rs in markerlist: # for each snp in found order
        f_missing = lmissdict.get(rs,'NA')
        maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
        hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
        hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
        nmend = lmenddict.get(rs,'NA')
        (chrom,offset) = mapdict.get(rs,('?','0'))
        res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
    ntokeep = max(10,len(res)/keepfrac)

    def msortk(item=None):
        """ deal with non numeric sorting
        """
        try:
            return float(item)
        except:
            return item

    for i,col in enumerate(Tsorts):
        res.sort(key=lambda x: msortk(x[col]))
        if Treverse[i]:
            res.reverse()
        repname = Tnames[i]
        Tops[repname] = res[0:ntokeep]
        Tops[repname].insert(0,rhead)
    res.sort(key=lambda x: '%s_%10d' % (x[1].rjust(4,'0'),int(x[2]))) # in chrom offset order
    res.insert(0,rhead)
    f = open(outfile,'w')
    f.write('\n'.join(['\t'.join(x) for x in res]))
    f.close()
    return res,Tops


def getfSize(fpath,outpath):
    """ format a nice file size string
    """
    size = ''
    fp = os.path.join(outpath,fpath)
    if os.path.isfile(fp):
        n = float(os.path.getsize(fp))
        if n > 2**20:
            size = ' (%1.1f MB)' % (n/2**20)
        elif n > 2**10:
            size = ' (%1.1f KB)' % (n/2**10)
        elif n > 0:
            size = ' (%d B)' % (int(n))
    return size


if __name__ == "__main__":
    u = """ called in xml as
    <command interpreter="python">
        rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
        -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
        -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
    </command>
    Prepare a qc report - eg:
    print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)
    """
    progname = os.path.basename(sys.argv[0])
    if len(sys.argv) < 9:
        print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv)
        sys.exit(1)
    parser = OptionParser(usage=u, version="%prog 0.01")
    a = parser.add_option
    a("-i","--infile",dest="infile")
    a("-o","--oprefix",dest="opref")
    a("-l","--plinkexe",dest="plinke", default=plinke)
    a("-r","--rexe",dest="rexe", default=rexe)
    a("-s","--snps",dest="htmlf")
    #a("-m","--markerRaw",dest="markf")
    #a("-r","--rawsubject",dest="subjf")
    a("-p","--patho",dest="newfpath")
    (options,args) = parser.parse_args()
    basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
    killme = string.punctuation + string.whitespace
    trantab = string.maketrans(killme,'_'*len(killme))
    opref = options.opref.translate(trantab)
    alogh,alog = tempfile.mkstemp(suffix='.txt')
    plogh,plog = tempfile.mkstemp(suffix='.txt')
    alogf = open(alog,'w')
    plogf = open(plog,'w')
    ahtmlf = options.htmlf
    amarkf = 'MarkerDetails_%s.xls' % opref
    asubjf = 'SubjectDetails_%s.xls' % opref
    newfpath = options.newfpath
    newfpath = os.path.realpath(newfpath)
    try:
        os.makedirs(newfpath)
    except:
        pass
    ofn = basename
    bfn = options.infile
    try:
        mapf = '%s.bim' % bfn
        maplist = file(mapf,'r').readlines()
        maplist = [x.split() for x in maplist]
    except:
        maplist = None
        alogf.write('## error - cannot open %s to read map - no offsets will be available for output files' % mapf)
    #rerla@beast galaxy]$ head test-data/tinywga.bim
    #22 rs2283802 0 21784722 4 2
    #22 rs2267000 0 21785366 4 2
    rgbin = os.path.split(rexe)[0] # get our rg bin path
    #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
    # if we could, do all at once? Nope. Probably never.
    plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'], ['--mendel',],['--check-sex',]]
    vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
    runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
    plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
                  ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
                  ['--bfile ldp_%s --het --out %s' % (basename,basename)]]
    # subset of ld independent markers for eigenstrat and other requirements
    vclbase = [options.plinke,'--noweb']
    plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
    plogf.write('\n'.join(plogout))
    plogf.write('\n')
    repout = os.path.join(newfpath,basename)
    subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
                                      logf=alogf) # writes the subject_froot.xls file
    markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
                                   logf=alogf,maplist=maplist) # marker_froot.xls
    nbreaks = 100
    s = '## starting plotpage, newfpath=%s,m=%s,s=%s\n' % (newfpath,markers[:2],subjects[:2])
    alogf.write(s)
    print s
    plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
                               basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
    #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)
    # [titles[n],plotnames[n],outfnames[n] ]
    html = []
    html.append('<table cellpadding="5" border="0">')
    size = getfSize(amarkf,newfpath)
    html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
         (amarkf,'Click here to download the Marker QC Detail report file',size))
    size = getfSize(asubjf,newfpath)
    html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
         (asubjf,'Click here to download the Subject QC Detail report file',size))
    for (title,url,ofname) in plotpage:
        ttitle = 'Ranked %s' % title
        dat = subjectTops.get(ttitle,None)
        if not dat:
            dat = markerTops.get(ttitle,None)
        imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
        thumbnail = os.path.join(newfpath,imghref)
        if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
            imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
            thumbnail = os.path.join(newfpath,imghref)
        if not os.path.exists(thumbnail):
            html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title))
        else:
            html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \
                 % (url,imghref,title))
            if dat: # one or the other - write as an extra file and make a link here
                t = '%s.xls' % (ttitle.replace(' ','_'))
                fname = os.path.join(newfpath,t)
                f = file(fname,'w')
                f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
                f.close()
                size = getfSize(t,newfpath)
                html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size))
            else:
                html.append('</a></td><td>%s</td><td> </td></tr>' % (title))
    html.append('</table><hr><h3>All output files from the QC run are available below</h3>')
    html.append('<table cellpadding="5" border="0">\n')
    flist = os.listdir(newfpath) # we want to catch 'em all
    flist.sort()
    for f in flist:
        fname = os.path.split(f)[-1]
        size = getfSize(fname,newfpath)
        html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size))
    html.append('</table>')
    alogf.close()
    plogf.close()
    llog = open(alog,'r').readlines()
    plogfile = open(plog,'r').readlines()
    os.unlink(alog)
    os.unlink(plog)
    llog += plogfile # add lines from pruneld log
    lf = file(ahtmlf,'w') # galaxy will show this as the default view
    lf.write(galhtmlprefix % progname)
    s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow())
    lf.write('<h4>%s</h4>\n' % s)
    lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n')
    lf.write('\n'.join(html))
    lf.write('<h4>QC run log contents</h4>')
    lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs
    if len(cruft) > 0:
        lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup
    lf.write('%s\n<hr>\n' % galhtmlpostfix)
    lf.close()
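
# --- Illustrative usage (editor's addition, not part of the original tool) ---
# Outside Galaxy the script can be exercised against the small test data set
# mentioned in the header comments, assuming plink, R, pdfjoin/pdfnup and
# mogrify are installed and the (hypothetical) paths below exist:
#
#   python rgQC.py -i test-data/smallwga/smallwga -o smallwga \
#       -s smallwga.html -p smallwga_files
#
# -i is the plink binary fileset prefix (<prefix>.bed/.bim/.fam), -o the output
# name prefix, -s the HTML report file Galaxy displays, and -p the directory
# that receives the PDFs, thumbnails and tab-delimited detail files.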