diff tools/rgenetics/rgManQQ.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/rgenetics/rgManQQ.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,334 @@
+#!/usr/local/bin/python
+# This is a truly ghastly hack
+# all of the heavy data cleaning lifting is done in R which is a really dumb place IMHO
+# Making a new file seems a waste but it would be far easier to set everything up in python
+# seems to work so I'm leaving it alone
+# sigh. Should really move this gig to rpy - writing a robust R script is hard.
+# updated to compress pdf using gs since millions of points = horsechoker pdfs and pdfs are good
+# updated july 20 to fix sort order - R unique() sorts into strict collating order
+# so need to sort after unique to revert to lexicographic order for x axis on Manhattan
+# rgmanqq updated july 19 to deal with x,y and mt
+# lots of fixes
+# ross lazarus
+import sys,math,shutil,subprocess,os,time,tempfile,string
+from os.path import abspath
+from rgutils import timenow, RRun, galhtmlprefix, galhtmlpostfix, galhtmlattr
+progname = os.path.split(sys.argv[0])[1]
+myversion = 'V000.1 March 2010'
+verbose = False
+debug = False
+
+rcode="""
+# generalised so 3 core fields passed as parameters ross lazarus March 24 2010 for rgenetics
+# Originally created as qqman with the following 
+# attribution:
+#--------------
+# Stephen Turner
+# http://StephenTurner.us/
+# http://GettingGeneticsDone.blogspot.com/
+
+# Last updated: 19 July 2011 by Ross Lazarus
+# R code for making manhattan plots and QQ plots from plink output files. 
+# With GWAS data this can take a lot of memory. Recommended for use on 
+# 64bit machines only, for now. 
+
+#
+
+library(ggplot2)
+
+coloursTouse = c('firebrick','darkblue','goldenrod','darkgreen')
+# not too ugly but need a colour expert please...
+
+
+DrawManhattan = function(pvals=Null,chrom=Null,offset=Null,title=NULL, max.y="max",suggestiveline=0, genomewide=T, size.x.labels=9, 
+              size.y.labels=10, annotate=F, SNPlist=NULL,grey=0) {
+        if (annotate & is.null(SNPlist)) stop("You requested annotation but provided no SNPlist!")
+        genomewideline=NULL # was genomewideline=-log10(5e-8)
+        n = length(pvals)
+        if (genomewide) { # use bonferroni since might be only a small region?
+            genomewideline = -log10(0.05/n) }
+        offset = as.integer(offset)
+        if (n > 1000000) { offset = offset/10000 }
+        else if (n > 10000) { offset = offset/1000}
+        chro = as.integer(chrom) # already dealt with X and friends?
+        pvals = as.double(pvals)
+        d=data.frame(CHR=chro,BP=offset,P=pvals)
+        if ("CHR" %in% names(d) & "BP" %in% names(d) & "P" %in% names(d) ) {
+                d=d[!is.na(d$P), ]
+                d=d[!is.na(d$BP), ]
+                d=d[!is.na(d$CHR), ]
+                #limit to only chrs 1-22, x=23,y=24,Mt=25?
+                d=d[d$CHR %in% 1:25, ]
+                d=d[d$P>0 & d$P<=1, ]
+                d$logp = as.double(-log10(d$P))
+                dlen = length(d$P)
+                d$pos=NA
+                ticks=NULL
+                lastbase=0
+                chrlist = unique(d$CHR)
+                chrlist = as.integer(chrlist)
+                chrlist = sort(chrlist) # returns lexical ordering 
+                if (max.y=="max") { maxy = ceiling(max(d$logp)) } 
+                   else { maxy = max.y }
+                nchr = length(chrlist) # may be any number?
+                maxy = max(maxy,1.1*genomewideline)
+                if (nchr >= 2) {
+                    for (x in c(1:nchr)) {
+                        i = chrlist[x] # need the chrom number - may not == index
+                        if (x == 1) { # first time
+                            d[d$CHR==i, ]$pos = d[d$CHR==i, ]$BP # initialize to first BP of chr1
+                            dsub = subset(d,CHR==i)
+                            dlen = length(dsub$P)
+                            lastbase = max(dsub$pos) # last one
+                            tks = d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1]
+                            lastchr = i
+                        } else {
+                            d[d$CHR==i, ]$pos = d[d$CHR==i, ]$BP+lastbase # one humongous contig
+                            if (sum(is.na(lastchr),is.na(lastbase),is.na(d[d$CHR==i, ]$pos))) { 
+                                cat(paste('manhattan: For',title,'chrlistx=',i,'lastchr=',lastchr,'lastbase=',lastbase,'pos=',d[d$CHR==i,]$pos))
+                             }   
+                            tks=c(tks, d[d$CHR==i, ]$pos[floor(length(d[d$CHR==i, ]$pos)/2)+1])
+                            lastchr = i
+                            dsub = subset(d,CHR==i)
+                            lastbase = max(dsub$pos) # last one
+                        }
+                    ticklim=c(min(d$pos),max(d$pos))
+                    xlabs = chrlist
+                    }
+                } else { # nchr is 1
+                   nticks = 10
+                   last = max(d$BP)
+                   first = min(d$BP)
+                   tks = c(first)
+                   t = (last-first)/nticks # units per tick
+                   for (x in c(1:(nticks))) { 
+                        tks = c(tks,round(x*t)+first) }
+                   ticklim = c(first,last)
+                } # else
+                if (grey) {mycols=rep(c("gray10","gray60"),max(d$CHR))
+                           } else {
+                           mycols=rep(coloursTouse,max(d$CHR))
+                           }
+                dlen = length(d$P)
+                d$pranks = rank(d$P)/dlen
+                d$centiles = 100*d$pranks # small are interesting
+                d$sizes = ifelse((d$centile < 1),2,1)
+                if (annotate) d.annotate=d[as.numeric(substr(d$SNP,3,100)) %in% SNPlist, ]
+                if (nchr >= 2) {
+                        manplot=qplot(pos,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR),size=factor(sizes))
+                        manplot=manplot+scale_x_continuous(name="Chromosome", breaks=tks, labels=xlabs,limits=ticklim) 
+                        manplot=manplot+scale_size_manual(values = c(0.5,1.5)) # requires discreet scale - eg factor
+                        #manplot=manplot+scale_size(values=c(0.5,2)) # requires continuous 
+                        }
+                else {
+                        manplot=qplot(BP,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR))
+                        manplot=manplot+scale_x_continuous(name=paste("Chromosome",chrlist[1]), breaks=tks, labels=tks,limits=ticklim) 
+                     }                 
+                manplot=manplot+scale_y_continuous(limits=c(0,maxy), breaks=1:maxy, labels=1:maxy)
+                manplot=manplot+scale_colour_manual(value=mycols)
+                if (annotate) {  manplot=manplot + geom_point(data=d.annotate, colour=I("green3")) } 
+                manplot=manplot + opts(legend.position = "none") 
+                manplot=manplot + opts(title=title)
+                manplot=manplot+opts(
+                     panel.background=theme_blank(), 
+                     axis.text.x=theme_text(size=size.x.labels, colour="grey50"), 
+                     axis.text.y=theme_text(size=size.y.labels, colour="grey50"), 
+                     axis.ticks=theme_segment(colour=NA)
+                )
+                if (suggestiveline) manplot=manplot+geom_hline(yintercept=suggestiveline,colour="blue", alpha=I(1/3))
+                if (genomewideline) manplot=manplot+geom_hline(yintercept=genomewideline,colour="red")
+                manplot
+        }       else {
+                stop("Make sure your data frame contains columns CHR, BP, and P")
+        }
+}
+
+
+
+qq = function(pvector, title=NULL, spartan=F) {
+        # Thanks to Daniel Shriner at NHGRI for providing this code for creating expected and observed values
+        o = -log10(sort(pvector,decreasing=F))
+        e = -log10( 1:length(o)/length(o) )
+        # you could use base graphics
+        # plot(e,o,pch=19,cex=0.25, xlab=expression(Expected~~-log[10](italic(p))), 
+        # ylab=expression(Observed~~-log[10](italic(p))), xlim=c(0,max(e)), ylim=c(0,max(e)))
+        # lines(e,e,col="red")
+        #You'll need ggplot2 installed to do the rest
+        qq=qplot(e,o, xlim=c(0,max(e)), ylim=c(0,max(o))) + stat_abline(intercept=0,slope=1, col="red")
+        qq=qq+opts(title=title)
+        qq=qq+scale_x_continuous(name=expression(Expected~~-log[10](italic(p))))
+        qq=qq+scale_y_continuous(name=expression(Observed~~-log[10](italic(p))))
+        if (spartan) plot=plot+opts(panel.background=theme_rect(col="grey50"), panel.grid.minor=theme_blank())
+        qq
+}
+
+"""
+
+# we need another string to avoid confusion over string substitutions with %in%
+# instantiate rcode2 string with infile,chromcol,offsetcol,pvalscols,title before saving and running
+
+rcode2 = """rgqqMan = function(infile="%s",chromcolumn=%d, offsetcolumn=%d, pvalscolumns=c(%s), 
+title="%s",grey=%d) {
+rawd = read.table(infile,head=T,sep='\\t')
+dn = names(rawd)
+cc = dn[chromcolumn]
+oc = dn[offsetcolumn] 
+rawd[,cc] = sub('chr','',rawd[,cc],ignore.case = T) # just in case
+rawd[,cc] = sub(':','',rawd[,cc],ignore.case = T) # ugh
+rawd[,cc] = sub('X',23,rawd[,cc],ignore.case = T)
+rawd[,cc] = sub('Y',24,rawd[,cc],ignore.case = T)
+rawd[,cc] = sub('Mt',25,rawd[,cc], ignore.case = T)
+nams = c(cc,oc) # for sorting
+plen = length(rawd[,1])
+print(paste('###',plen,'values read from',infile,'read - now running plots',sep=' '))
+rawd = rawd[do.call(order,rawd[nams]),]
+# mmmf - suggested by http://onertipaday.blogspot.com/2007/08/sortingordering-dataframe-according.html
+# in case not yet ordered
+if (plen > 0) {
+  for (pvalscolumn in pvalscolumns) {
+  if (pvalscolumn > 0) 
+     {
+     cname = names(rawd)[pvalscolumn]
+     mytitle = paste('p=',cname,', ',title,sep='')
+     myfname = chartr(' ','_',cname)
+     myqqplot = qq(rawd[,pvalscolumn],title=mytitle)
+     ggsave(filename=paste(myfname,"qqplot.png",sep='_'),myqqplot,width=8,height=6,dpi=96)
+     ggsave(filename=paste(myfname,"qqplot.pdf",sep='_'),myqqplot,width=8,height=6,dpi=96)
+     print(paste('## qqplot on',cname,'done'))
+     if ((chromcolumn > 0) & (offsetcolumn > 0)) {
+         print(paste('## manhattan on',cname,'starting',chromcolumn,offsetcolumn,pvalscolumn))
+         mymanplot= DrawManhattan(chrom=rawd[,chromcolumn],offset=rawd[,offsetcolumn],pvals=rawd[,pvalscolumn],title=mytitle,grey=grey)
+         ggsave(filename=paste(myfname,"manhattan.png",sep='_'),mymanplot,width=8,height=6,dpi=96)
+         ggsave(filename=paste(myfname,"manhattan.pdf",sep='_'),mymanplot,width=8,height=6,dpi=96)
+         print(paste('## manhattan plot on',cname,'done'))
+         }
+         else {
+              print(paste('chrom column =',chromcolumn,'offset column = ',offsetcolumn,
+              'so no Manhattan plot - supply both chromosome and offset as numerics for Manhattan plots if required'))
+              } 
+     } 
+  else {
+        print(paste('pvalue column =',pvalscolumn,'Cannot parse it so no plots possible'))
+      }
+  } # for pvalscolumn
+ } else { print('## Problem - no values available to plot - was there really a chromosome and offset column?') }
+}
+
+rgqqMan() 
+# execute with defaults as substituted
+"""
+
+
+def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir,beTidy=False):
+    """ 
+    we may have an interval file or a tabular file - if interval, will have chr1... so need to adjust
+    to chrom numbers
+    draw a qq for pvals and a manhattan plot if chrom/offset <> 0
+    contains some R scripts as text strings - we substitute defaults into the calls
+    to make them do our bidding - and save the resulting code for posterity
+    this can be called externally, I guess...for QC eg?
+    """
+    if debug:
+        print 'doManQQ',input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir
+    rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
+    if debug:
+        print 'running\n%s\n' % rcmd
+    rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
+    rlog.append('## R script=')
+    rlog.append(rcmd)
+    return rlog,flist
+  
+def compressPDF(inpdf=None):
+    """need absolute path to pdf
+    """
+    assert os.path.isfile(inpdf), "## Input %s supplied to compressPDF not found" % inpdf
+    outpdf = '%s_compressed' % inpdf
+    cl = ["gs", "-sDEVICE=pdfwrite", "-dNOPAUSE", "-dBATCH", "-sOutputFile=%s" % outpdf,inpdf]
+    retval = subprocess.call(cl)
+    if retval == 0:    
+        os.unlink(inpdf) 
+        shutil.move(outpdf,inpdf)
+    return retval
+
+def main():
+    u = """<command interpreter="python">
+        rgManQQ.py '$input_file' "$name" '$out_html' '$out_html.files_path' '$chrom_col' '$offset_col' '$pval_col'
+    </command>
+    """
+    npar = 8
+    if len(sys.argv) < npar:
+            print >> sys.stdout, '## error - too few command line parameters - wanting %d' % npar
+            print >> sys.stdout, u
+            sys.exit(1)
+    input_fname = sys.argv[1]
+    title = sys.argv[2]
+    killme = string.punctuation + string.whitespace
+    trantab = string.maketrans(killme,'_'*len(killme))
+    ctitle = title.translate(trantab)
+    outhtml = sys.argv[3]
+    outdir = sys.argv[4]
+    try:
+         chrom_col = int(sys.argv[5])
+    except:
+         chrom_col = -1
+    try:
+        offset_col = int(sys.argv[6])
+    except:
+        offset_col = -1
+    p = sys.argv[7].strip().split(',')
+    try:
+        q = [int(x) for x in p]
+    except:
+        p = -1
+    if chrom_col == -1 or offset_col == -1: # was passed as zero - do not do manhattan plots
+        chrom_col = -1
+        offset_col = -1
+    grey = 0
+    if (sys.argv[8].lower() in ['1','true']):
+       grey = 1
+    if p == -1:
+        print >> sys.stderr,'## Cannot run rgManQQ - missing pval column'
+        sys.exit(1)
+    p = ['%d' % (int(x) + 1) for x in p]
+    rlog,flist = doManQQ(input_fname,chrom_col+1,offset_col+1,','.join(p),title,grey,ctitle,outdir)
+    flist.sort()
+    html = [galhtmlprefix % progname,]
+    html.append('<h1>%s</h1>' % title)
+    if len(flist) > 0:
+        html.append('<table>\n')
+        for row in flist:
+            fname,expl = row # RRun returns pairs of filenames fiddled for the log and R script
+            n,e = os.path.splitext(fname)
+            if e in ['.png','.jpg']:
+                pdf = '%s.pdf' % n
+                pdff = os.path.join(outdir,pdf)
+                if os.path.exists(pdff):
+                    rval = compressPDF(inpdf=pdff)
+                    if rval <> 0:
+                        pdf = '%s(not_compressed)' % pdf
+                else:
+                    pdf = '%s(not_found)' % pdf
+                s= '<tr><td><a href="%s"><img src="%s" title="%s" hspace="10" width="800"></a></td></tr>' \
+                 % (pdf,fname,expl)
+                html.append(s)
+            else:
+               html.append('<tr><td><a href="%s">%s</a></td></tr>' % (fname,expl))
+        html.append('</table>\n')
+    else:
+        html.append('<h2>### Error - R returned no files - please confirm that parameters are sane</h1>')    
+    html.append('<h3>R log follows below</h3><hr><pre>\n')
+    html += rlog
+    html.append('</pre>\n')   
+    html.append(galhtmlattr % (progname,timenow()))
+    html.append(galhtmlpostfix)
+    htmlf = file(outhtml,'w')
+    htmlf.write('\n'.join(html))
+    htmlf.write('\n')
+    htmlf.close()
+    
+  
+
+if __name__ == "__main__":
+    main()
+
+