annotate tools/rgenetics/rgQC.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 # oct 15 rpy replaced - temp fix until we get gnuplot working
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2 # rpy deprecated - replace with RRun
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 # fixes to run functional test! oct1 2009
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 # needed to expand our path with os.path.realpath to get newpath working with
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 # all the fancy pdfnup stuff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 # and a fix to pruneld to write output to where it should be
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 # smallish data in test-data/smallwga in various forms
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 # python ../tools/rgenetics/rgQC.py -i smallwga -o smallwga -s smallwga/smallwga.html -p smallwga
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 # child files are deprecated and broken as at july 19 2009
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 # need to move them to the html file extrafiles path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 # found lots of corner cases with some illumina data where cnv markers were
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 # included
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 # and where affection status was all missing !
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 # added links to tab files showing worst 1/keepfrac markers and subjects
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 # ross lazarus january 2008
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17 # added named parameters
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 # to ensure no silly slippages if non required parameter in the most general case
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 # some potentially useful things here reusable in complex scripts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 # with lots'o'html (TM)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 # aug 17 2007 rml
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 # added marker and subject and parenting april 14 rml
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 # took a while to get the absolute paths right for all the file munging
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 # as of april 16 seems to work..
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 # getting galaxy to serve images in html reports is a little tricky
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 # we don't want QC reports to be dozens of individual files, so need
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 # to use the url /static/rg/... since galaxy's web server will happily serve images
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 # from there
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 # galaxy passes output files as relative paths
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 # these have to be munged by rgQC.py before calling this
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 # galaxy will pass in 2 file names - one for the log
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 # and one for the final html report
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 # of the form './database/files/dataset_66.dat'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 # we need to be working in that directory so our plink output files are there
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 # so these have to be munged by rgQC.py before calling this
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 # note no ped file passed so had to remove the -l option
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 # for plinkParse.py that makes a heterozygosity report from the ped
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 # file - needs fixing...
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 # new: importing manhattan/qqplot plotter
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 # def doManQQ(input_fname,chrom_col,offset_col,pval_cols,title,grey,ctitle,outdir):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 # """ draw a qq for pvals and a manhattan plot if chrom/offset <> 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 # contains some R scripts as text strings - we substitute defaults into the calls
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 # to make them do our bidding - and save the resulting code for posterity
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 # this can be called externally, I guess...for QC eg?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 # """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 # rcmd = '%s%s' % (rcode,rcode2 % (input_fname,chrom_col,offset_col,pval_cols,title,grey))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 # rlog,flist = RRun(rcmd=rcmd,title=ctitle,outdir=outdir)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 # return rlog,flist
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 from optparse import OptionParser
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 import sys,os,shutil, glob, math, subprocess, time, operator, random, tempfile, copy, string
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 from os.path import abspath
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 from rgutils import galhtmlprefix, galhtmlpostfix, RRun, timenow, plinke, rexe, runPlink, pruneLD
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 import rgManQQ
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 prog = os.path.split(sys.argv[0])[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 vers = '0.4 april 2009 rml'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 idjoiner = '_~_~_' # need something improbable..
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 # many of these may need fixing for a new install
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 myversion = vers
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 keepfrac = 20 # fraction to keep after sorting by each interesting value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 missvals = {'0':'0','N':'N','-9':'-9','-':'-'} # fix me if these change!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 mogresize = "x300" # this controls the width for jpeg thumbnails
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 def makePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='20',nup=3,height=10,width=8,rgbin=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 def rHist(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=50):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 # generic histogram and vertical boxplot in a 3:1 layout
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 # returns the graphic file name for inclusion in the web page
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 # broken out here for reuse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 # ross lazarus march 2007
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 screenmat = (1,2,1,2) # create a 2x2 cabvas
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 rpy.r.pdf( outfname, height , width )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 #rpy.r.layout(rpy.r.matrix(rpy.r.c(1,1,1,2), 1, 4, byrow = True)) # 3 to 1 vertical plot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 m = rpy.r.matrix((1,1,1,2),nrow=1,ncol=4,byrow=True)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 # in R, m = matrix(c(1,2),nrow=1,ncol=2,byrow=T)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 rpy.r("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))") # 4 to 1 vertical plot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 maint = 'QC for %s - %s' % (basename,title)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 rpy.r.hist(plotme,main=maint, xlab=xlabname,breaks=nbreaks,col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 rpy.r.boxplot(plotme,main='',col="maroon",outline=False)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 def rCum(plotme=[],outfname='',xlabname='',title='',basename='',nbreaks=100):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 Useful to see what various cutoffs yield - plot percentiles
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 n = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 maxveclen = 1000.0 # for reasonable pdf sizes!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 yvec = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 # arrives already in decending order of importance missingness or mendel count by subj or marker
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 xvec = range(n)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 if n > maxveclen: # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 always = min(1000,n/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 skip = int(n/maxveclen) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 samplei = [i for i in range(n) if (i % skip == 0) or (i < always)] # always oversample first sorted values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 yvec = [yvec[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 xvec = [xvec[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 rpy.r.pdf( outfname, height , width )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 maint = 'QC for %s - %s' % (basename,title)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 rpy.r.plot(xvec,yvec,type='p',main=maint, ylab=xlabname, xlab='Sample Percentile',col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 def rQQ(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 y is data for a qq plot and ends up on the x axis go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 this version called with -log10 transformed hwe
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 nrows = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 fn = float(nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 maxveclen = 3000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 yvec = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139 if nrows > maxveclen:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 # always oversample first sorted (here lowest) values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146 yvec = [yvec[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 xvec = [xvec[i] for i in samplei] # and sample xvec same way
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
148 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
149 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
150 maint='Log QQ Plot(n=%d)' % (nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
151 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
152 ylab = '%s' % xlabname
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
153 xlab = '-log10(Uniform 0-1)'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
154 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
155 rpy.r.pdf( outfname, height , width )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
156 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
157 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
158 rpy.r.points(mx,mx,type='l')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
159 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
160 rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
161
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
162 def rMultiQQ(plotme = [],nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
163 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
164 data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
165 grid of qq plots to show departure from null at extremes of data quality
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
166 Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
167 and ends up on the x axis go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
168 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
169 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
170 data = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
171 nvals = len(data)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
172 stepsize = nvals/nsplits
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
173 logstep = math.log10(stepsize) # so is 3 for steps of 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
174 quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
175 data.sort(key=itertools.itemgetter(1)) # into x order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
176 rpy.r.pdf( outfname, height , width )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
177 rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
178 yvec = [-math.log10(random.random()) for x in range(stepsize)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
179 yvec.sort() # size of each step is expected range for xvec under null?!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
180 for rowstart in quints:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
181 rowend = rowstart + stepsize
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
182 if nvals - rowend < stepsize: # finish last split
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
183 rowend = nvals
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
184 row = data[rowstart:rowend]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
185 row.sort(key=itertools.itemgetter(2)) # into y order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
186 for colstart in quints:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
187 colend = colstart + stepsize
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
188 if nvals - colend < stepsize: # finish last split
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
189 colend = nvals
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
190 cell = row[colstart:colend]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
191 xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
192 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
193 rpy.r.points(c(0,logstep),c(0,logstep),type='l')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
194 rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
195
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
196
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
197 def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
198 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
199 y is data for a qqnorm plot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
200 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
201 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
202 rangeunif = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
203 nunif = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
204 maxveclen = 3000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
205 nrows = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
206 data = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
207 if nrows > maxveclen:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
208 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
209 # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
210 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
211 skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
212 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
213 # always oversample first sorted (here lowest) values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
214 yvec = [data[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
215 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
216 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
217 yvec = data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
218 maint='Log QQ Plot(n=%d)' % (nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
219 n = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
220 ylab = '%s' % xlabname
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
221 xlab = 'Normal'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
222 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
223 rpy.r.pdf( outfname, height , width )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
224 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
225 rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
226 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
227 rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
228
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
229 def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
230 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
231 layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
232 like the GAIN qc tools
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
233 y is data for a qq plot and ends up on the x axis go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
234 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
235 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
236 rangeunif = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
237 nunif = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
238 fn = float(rangeunif)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
239 xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
240 skip = max(int(rangeunif/fn),1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
241 # force include last points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
242 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
243 maxveclen = 2000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
244 nrows = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
245 data = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
246 data.sort() # low to high - oversample low values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
247 if nrows > maxveclen:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
248 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
249 # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
250 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
251 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
252 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
253 # always oversample first sorted (here lowest) values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
254 yvec = [data[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
255 xvec = [xvec[i] for i in samplei] # and sample xvec same way
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
256 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
257 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
258 yvec = data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
259 maint='Log QQ Plot(n=%d)' % (nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
260 n = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
261 mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
262 ylab = '%s' % xlabname
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
263 xlab = '-log10(Uniform 0-1)'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
264 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
265 rpy.r.pdf( outfname, height , width )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
266 rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
267 rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
268 rpy.r.points(mx,mx,type='l')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
269 rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
270 rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
271
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
272
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
273 fdsto,stofile = tempfile.mkstemp()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
274 sto = open(stofile,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
275 import rpy # delay to avoid rpy stdout chatter replacing galaxy file blurb
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
276 mog = 'mogrify'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
277 pdfnup = 'pdfnup'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
278 pdfjoin = 'pdfjoin'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
279 shead = subjects.pop(0) # get rid of head
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
280 mhead = markers.pop(0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
281 maf = mhead.index('maf')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
282 missfrac = mhead.index('missfrac')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
283 logphweall = mhead.index('logp_hwe_all')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
284 logphweunaff = mhead.index('logp_hwe_unaff')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
285 # check for at least some unaffected rml june 2009
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
286 m_mendel = mhead.index('N_Mendel')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
287 fracmiss = shead.index('FracMiss')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
288 s_mendel = shead.index('Mendel_errors')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
289 s_het = shead.index('F_Stat')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
290 params = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
291 hweres = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
292 and x[logphweunaff].upper() <> 'NA']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
293 if len(hweres) <> 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
294 xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
295 # plot for each of these cols
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
296 else: # try hwe all instead - maybe no affection status available
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
297 xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
298 ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
299 oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
300 qqplotme = [1,0,0,0,0,0,0] #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
301 qnplotme = [0,0,0,0,0,0,1] #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
302 nplots = len(xs)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
303 xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
304 'Marker Mendel Error Count', 'Missing Rate: Subjects',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
305 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
306 plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
307 ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
308 ordplotnames = ['%s_cum' % x for x in plotnames]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
309 ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
310 outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
311 ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
312 datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
313 titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
314 "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
315 html = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
316 pdflist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
317 for n,column in enumerate(xs):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
318 dat = [float(x[column]) for x in datasources[n] if len(x) >= column
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
319 and x[column][:2].upper() <> 'NA'] # plink gives both!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
320 if sum(dat) <> 0: # eg nada for mendel if case control?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
321 rHist(plotme=dat,outfname=outfnames[n],xlabname=xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
322 title=titles[n],basename=basename,nbreaks=nbreaks)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
323 row = [titles[n],ploturls[n],outfnames[n] ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
324 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
325 pdflist.append(outfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
326 if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
327 otitle = 'Ranked %s' % titles[n]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
328 dat.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
329 if oreverseme[n]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
330 dat.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
331 rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
332 title=otitle,basename=basename,nbreaks=1000)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
333 row = [otitle,ordploturls[n],ordoutfnames[n]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
334 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
335 pdflist.append(ordoutfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
336 if qqplotme[n]: #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
337 otitle = 'LogQQ plot %s' % titles[n]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
338 dat.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
339 dat.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
340 ofn = os.path.split(ordoutfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
341 ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
342 ofu = os.path.split(ordploturls[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
343 ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
344 rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
345 title=otitle,basename=basename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
346 row = [otitle,ofu,ofn]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
347 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
348 pdflist.append(ofn)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
349 elif qnplotme[n]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
350 otitle = 'F Statistic %s' % titles[n]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
351 dat.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
352 dat.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
353 ofn = os.path.split(ordoutfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
354 ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
355 ofu = os.path.split(ordploturls[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
356 ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
357 rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
358 title=otitle,basename=basename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
359 row = [otitle,ofu,ofn]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
360 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
361 pdflist.append(ofn)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
362 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
363 print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
364 if nup>0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
365 # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
366 # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
367 filestojoin = ' '.join(pdflist) # all the file names so far
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
368 afname = '%s_All_Paged.pdf' % (basename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
369 fullafname = os.path.join(newfpath,afname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
370 expl = 'All %s QC Plots joined into a single pdf' % basename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
371 vcl = '%s %s --outfile %s ' % (pdfjoin,filestojoin, fullafname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
372 # make single page pdf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
373 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
374 retval = x.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
375 row = [expl,afname,fullafname]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
376 html.insert(0,row) # last rather than second
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
377 nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
378 fullnfname = os.path.join(newfpath,nfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
379 expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
380 vcl = '%s %s --nup %dx%d --frame true --outfile %s' % (pdfnup,afname,nup,nup,fullnfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
381 # make thumbnail images
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
382 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
383 retval = x.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
384 row = [expl,nfname,fullnfname]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
385 html.insert(1,row) # this goes second
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
386 vcl = '%s -format jpg -resize %s %s' % (mog, mogresize, os.path.join(newfpath,'*.pdf'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
387 # make thumbnail images
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
388 x=subprocess.Popen(vcl,shell=True,cwd=newfpath,stderr=sto,stdout=sto)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
389 retval = x.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
390 sto.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
391 cruft = open(stofile,'r').readlines()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
392 return html,cruft # elements for an ordered list of urls or whatever..
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
393
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
394
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
395 def RmakePlots(markers=[],subjects=[],newfpath='.',basename='test',nbreaks='100',nup=3,height=8,width=10,rexe=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
396 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
397 nice try but the R scripts are huge and take forever to run if there's a lot of data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
398 marker rhead = ['snp','chrom','maf','a1','a2','missfrac',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
399 'p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
400 subject rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
401 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
402 colour = "maroon"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
403
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
404 def rHist(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=nbreaks):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
405 """ rHist <- function(plotme,froot,plotname,title,mfname,nbreaks=50)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
406 # generic histogram and vertical boxplot in a 3:1 layout
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
407 # returns the graphic file name for inclusion in the web page
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
408 # broken out here for reuse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
409 # ross lazarus march 2007
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
410 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
411 R = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
412 maint = 'QC for %s - %s' % (basename,title)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
413 screenmat = (1,2,1,2) # create a 2x2 canvas
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
414 widthlist = (80,20) # change to 4:1 ratio for histo and boxplot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
415 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
416 R.append("layout(matrix(c(1,1,1,2),nrow=1,ncol=4,byrow=T))")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
417 R.append("plotme = read.table(file='%s',head=F,sep='\t')" % plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
418 R.append('hist(plotme, main="%s",xlab="%s",breaks=%d,col="%s")' % (maint,xlabname,nbreaks,colour))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
419 R.append('boxplot(plotme,main="",col="%s",outline=F)' % (colour) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
420 R.append('dev.off()')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
421 return R
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
422
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
423 def rCum(plotme='',outfname='',xlabname='',title='',basename='',nbreaks=100):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
424 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
425 Useful to see what various cutoffs yield - plot percentiles
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
426 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
427 R = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
428 n = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
429 R.append("plotme = read.table(file='%s',head=T,sep='\t')" % plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
430 # arrives already in decending order of importance missingness or mendel count by subj or marker
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
431 maint = 'QC for %s - %s' % (basename,title)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
432 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
433 R.append("par(lab=c(10,10,10))")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
434 R.append('plot(plotme$xvec,plotme$yvec,type="p",main="%s",ylab="%s",xlab="Sample Percentile",col="%s")' % (maint,xlabname,colour))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
435 R.append('dev.off()')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
436 return R
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
437
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
438 def rQQ(plotme='', outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
439 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
440 y is data for a qq plot and ends up on the x axis go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
441 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
442 this version called with -log10 transformed hwe
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
443 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
444 R = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
445 nrows = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
446 fn = float(nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
447 xvec = [-math.log10(x/fn) for x in range(1,(nrows+1))]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
448 #mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
449 maxveclen = 3000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
450 yvec = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
451 if nrows > maxveclen:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
452 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
453 # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
454 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
455 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
456 if skip < 2:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
457 skip = 2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
458 samplei = [i for i in range(nrows) if (i < always) or (i % skip == 0)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
459 # always oversample first sorted (here lowest) values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
460 yvec = [yvec[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
461 xvec = [xvec[i] for i in samplei] # and sample xvec same way
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
462 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
463 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
464 maint='Log QQ Plot(n=%d)' % (nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
465 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
466 ylab = '%s' % xlabname
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
467 xlab = '-log10(Uniform 0-1)'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
468 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
469 x = ['%f' % x for x in xvec]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
470 R.append('xvec = c(%s)' % ','.join(x))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
471 y = ['%f' % x for x in yvec]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
472 R.append('yvec = c(%s)' % ','.join(y))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
473 R.append('mx = c(0,%f)' % (math.log10(fn)))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
474 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
475 R.append("par(lab=c(10,10,10))")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
476 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
477 R.append('points(mx,mx,type="l")')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
478 R.append('grid(col="lightgray",lty="dotted")')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
479 R.append('dev.off()')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
480 return R
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
481
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
482 def rMultiQQ(plotme = '',nsplits=5, outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
483 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
484 data must contain p,x,y as data for a qq plot, quantiles of x and y axis used to create a
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
485 grid of qq plots to show departure from null at extremes of data quality
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
486 Need to plot qqplot(p,unif) where the p's come from one x and one y quantile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
487 and ends up on the x axis go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
488 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
489 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
490 data = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
491 nvals = len(data)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
492 stepsize = nvals/nsplits
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
493 logstep = math.log10(stepsize) # so is 3 for steps of 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
494 R.append('mx = c(0,%f)' % logstep)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
495 quints = range(0,nvals,stepsize) # quintile cutpoints for each layer
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
496 data.sort(key=itertools.itemgetter(1)) # into x order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
497 #rpy.r.pdf( outfname, h , w )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
498 #rpy.r("par(mfrow = c(%d,%d))" % (nsplits,nsplits))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
499 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
500 yvec = [-math.log10(random.random()) for x in range(stepsize)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
501 yvec.sort() # size of each step is expected range for xvec under null?!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
502 y = ['%f' % x for x in yvec]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
503 R.append('yvec = c(%s)' % ','.join(y))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
504 for rowstart in quints:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
505 rowend = rowstart + stepsize
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
506 if nvals - rowend < stepsize: # finish last split
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
507 rowend = nvals
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
508 row = data[rowstart:rowend]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
509 row.sort(key=itertools.itemgetter(2)) # into y order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
510 for colstart in quints:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
511 colend = colstart + stepsize
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
512 if nvals - colend < stepsize: # finish last split
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
513 colend = nvals
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
514 cell = row[colstart:colend]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
515 xvec = [-math.log10(x[0]) for x in cell] # all the pvalues for this cell
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
516 x = ['%f' % x for x in xvec]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
517 R.append('xvec = c(%s)' % ','.join(x))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
518 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
519 R.append('points(mx,mx,type="l")')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
520 R.append('grid(col="lightgray",lty="dotted")')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
521 #rpy.r.qqplot(xvec,yvec,xlab=xlab,ylab=ylab,pch=19,col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
522 #rpy.r.points(c(0,logstep),c(0,logstep),type='l')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
523 R.append('dev.off()')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
524 #rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
525 return R
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
526
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
527
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
528 def rQQNorm(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
529 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
530 y is data for a qqnorm plot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
531 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
532 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
533 rangeunif = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
534 nunif = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
535 maxveclen = 3000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
536 nrows = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
537 data = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
538 if nrows > maxveclen:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
539 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
540 # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
541 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
542 skip = int((nrows-always)/float(maxveclen)) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
543 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
544 # always oversample first sorted (here lowest) values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
545 yvec = [data[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
546 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
547 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
548 yvec = data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
549 maint='Log QQ Plot(n=%d)' % (nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
550 n = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
551 ylab = '%s' % xlabname
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
552 xlab = 'Normal'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
553 # need to supply the x axis or else rpy prints the freaking vector on the pdf - go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
554 #rpy.r.pdf( outfname, h , w )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
555 #rpy.r("par(lab=c(10,10,10))") # so our grid is denser than the default 5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
556 #rpy.r.qqnorm(yvec,xlab=xlab,ylab=ylab,main=maint,sub=title,pch=19,col="maroon",cex=0.8)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
557 #rpy.r.grid(nx = None, ny = None, col = "lightgray", lty = "dotted")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
558 #rpy.r.dev_off()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
559 y = ['%f' % x for x in yvec]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
560 R.append('yvec = c(%s)' % ','.join(y))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
561 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
562 R.append("par(lab=c(10,10,10))")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
563 R.append('qqnorm(yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
564 R.append('grid(col="lightgray",lty="dotted")')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
565 R.append('dev.off()')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
566 return R
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
567
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
568 def rMAFMissqq(plotme=[], outfname='fname',title='title',xlabname='Sample',basename=''):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
569 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
570 layout qq plots for pvalues within rows of increasing MAF and columns of increasing missingness
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
571 like the GAIN qc tools
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
572 y is data for a qq plot and ends up on the x axis go figure
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
573 if sampling, oversample low values - all the top 1% ?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
574 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
575 rangeunif = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
576 nunif = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
577 fn = float(rangeunif)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
578 xvec = [-math.log10(x/fn) for x in range(1,(rangeunif+1))]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
579 skip = max(int(rangeunif/fn),1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
580 # force include last points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
581 mx = [0,math.log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
582 maxveclen = 2000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
583 nrows = len(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
584 data = copy.copy(plotme)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
585 data.sort() # low to high - oversample low values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
586 if nrows > maxveclen:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
587 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
588 # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
589 always = min(1000,nrows/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
590 skip = int(nrows/float(maxveclen)) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
591 samplei = [i for i in range(nrows) if (i % skip == 0) or (i < always)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
592 # always oversample first sorted (here lowest) values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
593 yvec = [data[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
594 xvec = [xvec[i] for i in samplei] # and sample xvec same way
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
595 maint='Log QQ Plot (random %d of %d)' % (len(yvec),nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
596 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
597 yvec = data
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
598 maint='Log QQ Plot(n=%d)' % (nrows)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
599 n = 1000
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
600 mx = [0,log10(fn)] # if 1000, becomes 3 for the null line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
601 ylab = '%s' % xlabname
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
602 xlab = '-log10(Uniform 0-1)'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
603 R.append('mx = c(0,%f)' % (math.log10(fn)))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
604 x = ['%f' % x for x in xvec]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
605 R.append('xvec = c(%s)' % ','.join(x))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
606 y = ['%f' % x for x in yvec]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
607 R.append('yvec = c(%s)' % ','.join(y))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
608 R.append('pdf("%s",h=%d,w=%d)' % (outfname,height,width))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
609 R.append("par(lab=c(10,10,10))")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
610 R.append('qqplot(xvec,yvec,xlab="%s",ylab="%s",main="%s",sub="%s",pch=19,col="%s",cex=0.8)' % (xlab,ylab,maint,title,colour))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
611 R.append('points(mx,mx,type="l")')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
612 R.append('grid(col="lightgray",lty="dotted")')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
613 R.append('dev.off()')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
614
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
615
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
616 shead = subjects.pop(0) # get rid of head
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
617 mhead = markers.pop(0)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
618 maf = mhead.index('maf')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
619 missfrac = mhead.index('missfrac')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
620 logphweall = mhead.index('logp_hwe_all')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
621 logphweunaff = mhead.index('logp_hwe_unaff')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
622 # check for at least some unaffected rml june 2009
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
623 m_mendel = mhead.index('N_Mendel')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
624 fracmiss = shead.index('FracMiss')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
625 s_mendel = shead.index('Mendel_errors')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
626 s_het = shead.index('F_Stat')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
627 params = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
628 h = [float(x[logphweunaff]) for x in markers if len(x[logphweunaff]) >= logphweunaff
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
629 and x[logphweunaff].upper() <> 'NA']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
630 if len(h) <> 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
631 xs = [logphweunaff, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
632 # plot for each of these cols
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
633 else: # try hwe all instead - maybe no affection status available
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
634 xs = [logphweall, missfrac, maf, m_mendel, fracmiss, s_mendel, s_het]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
635 ordplotme = [1,1,1,1,1,1,1] # ordered plots for everything!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
636 oreverseme = [1,1,0,1,1,1,0] # so larger values are oversampled
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
637 qqplotme = [1,0,0,0,0,0,0] #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
638 qnplotme = [0,0,0,0,0,0,1] #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
639 nplots = len(xs)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
640 xlabnames = ['log(p) HWE (unaff)', 'Missing Rate: Markers', 'Minor Allele Frequency',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
641 'Marker Mendel Error Count', 'Missing Rate: Subjects',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
642 'Subject Mendel Error Count','Subject Inbreeding (het) F Statistic']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
643 plotnames = ['logphweunaff', 'missfrac', 'maf', 'm_mendel', 'fracmiss', 's_mendel','s_het']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
644 ploturls = ['%s_%s.pdf' % (basename,x) for x in plotnames] # real plotnames
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
645 ordplotnames = ['%s_cum' % x for x in plotnames]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
646 ordploturls = ['%s_%s.pdf' % (basename,x) for x in ordplotnames] # real plotnames
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
647 outfnames = [os.path.join(newfpath,ploturls[x]) for x in range(nplots)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
648 ordoutfnames = [os.path.join(newfpath,ordploturls[x]) for x in range(nplots)]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
649 datasources = [markers,markers,markers,markers,subjects,subjects,subjects] # use this table
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
650 titles = ["Marker HWE","Marker Missing Genotype", "Marker MAF","Marker Mendel",
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
651 "Subject Missing Genotype","Subject Mendel",'Subject F Statistic']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
652 html = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
653 pdflist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
654 R = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
655 for n,column in enumerate(xs):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
656 dfn = '%d_%s.txt' % (n,titles[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
657 dfilepath = os.path.join(newfpath,dfn)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
658 dat = [float(x[column]) for x in datasources[n] if len(x) >= column
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
659 and x[column][:2].upper() <> 'NA'] # plink gives both!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
660 if sum(dat) <> 0: # eg nada for mendel if case control?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
661 plotme = file(dfilepath,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
662 plotme.write('\n'.join(['%f' % x for x in dat])) # pass as a file - copout!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
663 tR = rHist(plotme=dfilepath,outfname=outfnames[n],xlabname=xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
664 title=titles[n],basename=basename,nbreaks=nbreaks)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
665 R += tR
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
666 row = [titles[n],ploturls[n],outfnames[n] ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
667 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
668 pdflist.append(outfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
669 if ordplotme[n]: # for missingness, hwe - plots to see where cutoffs will end up
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
670 otitle = 'Ranked %s' % titles[n]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
671 dat.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
672 if oreverseme[n]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
673 dat.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
674 ndat = len(dat)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
675 xvec = range(ndat)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
676 xvec = [100.0*(n-x)/n for x in xvec] # convert to centile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
677 # now have half a million markers eg - too many to plot all for a pdf - sample to get 10k or so points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
678 maxveclen = 1000.0 # for reasonable pdf sizes!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
679 if ndat > maxveclen: # oversample part of the distribution
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
680 always = min(1000,ndat/20) # oversample smaller of lowest few hundred items or 5%
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
681 skip = int(ndat/maxveclen) # take 1 in skip to get about maxveclen points
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
682 samplei = [i for i in range(ndat) if (i % skip == 0) or (i < always)] # always oversample first sorted values
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
683 yvec = [yvec[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
684 xvec = [xvec[i] for i in samplei] # always get first and last
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
685 plotme = file(dfilepath,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
686 plotme.write('xvec\tyvec\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
687 plotme.write('\n'.join(['%f\t%f' % (xvec[i],y) for y in yvec])) # pass as a file - copout!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
688 tR = rCum(plotme=dat,outfname=ordoutfnames[n],xlabname='Ordered %s' % xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
689 title=otitle,basename=basename,nbreaks=nbreaks)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
690 R += tR
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
691 row = [otitle,ordploturls[n],ordoutfnames[n]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
692 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
693 pdflist.append(ordoutfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
694 if qqplotme[n]: #
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
695 otitle = 'LogQQ plot %s' % titles[n]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
696 dat.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
697 dat.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
698 ofn = os.path.split(ordoutfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
699 ofn = os.path.join(ofn[0],'QQ%s' % ofn[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
700 ofu = os.path.split(ordploturls[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
701 ofu = os.path.join(ofu[0],'QQ%s' % ofu[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
702 tR = rQQ(plotme=dat,outfname=ofn,xlabname='QQ %s' % xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
703 title=otitle,basename=basename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
704 R += tR
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
705 row = [otitle,ofu,ofn]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
706 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
707 pdflist.append(ofn)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
708 elif qnplotme[n]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
709 otitle = 'F Statistic %s' % titles[n]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
710 dat.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
711 dat.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
712 ofn = os.path.split(ordoutfnames[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
713 ofn = os.path.join(ofn[0],'FQNorm%s' % ofn[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
714 ofu = os.path.split(ordploturls[n])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
715 ofu = os.path.join(ofu[0],'FQNorm%s' % ofu[1])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
716 tR = rQQNorm(plotme=dat,outfname=ofn,xlabname='F QNorm %s' % xlabnames[n],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
717 title=otitle,basename=basename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
718 R += tR
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
719 row = [otitle,ofu,ofn]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
720 html.append(row)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
721 pdflist.append(ofn)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
722 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
723 print '#$# no data for # %d - %s, data[:10]=%s' % (n,titles[n],dat[:10])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
724 rlog,flist = RRun(rcmd=R,title='makeQCplots',outdir=newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
725 if nup>0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
726 # pdfjoin --outfile chr1test.pdf `ls database/files/dataset_396_files/*.pdf`
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
727 # pdfnup chr1test.pdf --nup 3x3 --frame true --outfile chr1test3.pdf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
728 filestojoin = ' '.join(pdflist) # all the file names so far
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
729 afname = '%s_All_Paged.pdf' % (basename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
730 fullafname = os.path.join(newfpath,afname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
731 expl = 'All %s QC Plots joined into a single pdf' % basename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
732 vcl = 'pdfjoin %s --outfile %s ' % (filestojoin, fullafname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
733 # make single page pdf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
734 x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
735 retval = x.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
736 row = [expl,afname,fullafname]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
737 html.insert(0,row) # last rather than second
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
738 nfname = '%s_All_%dx%d.pdf' % (basename,nup,nup)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
739 fullnfname = os.path.join(newfpath,nfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
740 expl = 'All %s QC Plots %d by %d to a page' % (basename,nup,nup)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
741 vcl = 'pdfnup %s --nup %dx%d --frame true --outfile %s' % (afname,nup,nup,fullnfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
742 # make thumbnail images
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
743 x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
744 retval = x.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
745 row = [expl,nfname,fullnfname]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
746 html.insert(1,row) # this goes second
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
747 vcl = 'mogrify -format jpg -resize %s %s' % (mogresize, os.path.join(newfpath,'*.pdf'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
748 # make thumbnail images
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
749 x=subprocess.Popen(vcl,shell=True,cwd=newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
750 retval = x.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
751 return html # elements for an ordered list of urls or whatever..
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
752
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
753 def countHet(bedf='fakeped_500000',linkageped=True,froot='fake500k',outfname="ahetf",logf='rgQC.log'):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
754 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
755 NO LONGER USED - historical interest
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
756 count het loci for each subject to look for outliers = ? contamination
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
757 assume ped file is linkage format
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
758 need to make a ped file from the bed file we were passed
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
759 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
760 vcl = [plinke,'--bfile',bedf,'--recode','--out','%s_recode' % froot] # write a recoded ped file from the real .bed file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
761 p=subprocess.Popen(' '.join(vcl),shell=True)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
762 retval = p.wait()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
763 f = open('%s_recode.recode.ped' % froot,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
764 if not linkageped:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
765 head = f.next() # throw away header
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
766 hets = [] # simple count of het loci per subject. Expect poisson?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
767 n = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
768 for l in f:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
769 n += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
770 ll = l.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
771 if len(ll) > 6:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
772 iid = idjoiner.join(ll[:2]) # fam_iid
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
773 gender = ll[4]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
774 alleles = ll[6:]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
775 nallele = len(alleles)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
776 nhet = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
777 for i in range(nallele/2):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
778 a1=alleles[2*i]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
779 a2=alleles[2*i+1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
780 if alleles[2*i] <> alleles[2*i+1]: # must be het
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
781 if not missvals.get(a1,None) and not missvals.get(a2,None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
782 nhet += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
783 hets.append((nhet,iid,gender)) # for sorting later
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
784 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
785 hets.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
786 hets.reverse() # biggest nhet now on top
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
787 f = open(outfname ,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
788 res = ['%d\t%s\t%s' % (x) for x in hets] # I love list comprehensions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
789 res.insert(0,'nhetloci\tfamid_iid\tgender')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
790 res.append('')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
791 f.write('\n'.join(res))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
792 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
793
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
794
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
795
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
796 def subjectRep(froot='cleantest',outfname="srep",newfpath='.',logf = None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
797 """by subject (missingness = .imiss, mendel = .imendel)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
798 assume replicates have an underscore in family id for
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
799 hapmap testing
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
800 For sorting, we need floats and integers
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
801 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
802 isexfile = '%s.sexcheck' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
803 imissfile = '%s.imiss' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
804 imendfile = '%s.imendel' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
805 ihetfile = '%s.het' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
806 logf.write('## subject reports starting at %s\n' % timenow())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
807 outfile = os.path.join(newfpath,outfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
808 idlist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
809 imissdict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
810 imenddict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
811 isexdict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
812 ihetdict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
813 Tops = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
814 Tnames = ['Ranked Subject Missing Genotype', 'Ranked Subject Mendel',
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
815 'Ranked Sex check', 'Ranked Inbreeding (het) F statistic']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
816 Tsorts = [2,3,6,8]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
817 Treverse = [True,True,True,False] # so first values are worser
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
818 #rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','Fest']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
819 ## FID IID MISS_PHENO N_MISS N_GENO F_MISS
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
820 ## 1552042370_A 1552042370_A N 5480 549883 0.009966
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
821 ## 1552042410_A 1552042410_A N 1638 549883 0.002979
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
822
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
823 # ------------------missing--------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
824 # imiss has FID IID MISS_PHENO N_MISS F_MISS
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
825 # we want F_MISS
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
826 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
827 f = open(imissfile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
828 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
829 logf.write('# file %s is missing - talk about irony\n' % imissfile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
830 f = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
831 if f:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
832 for n,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
833 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
834 if n == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
835 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
836 fidpos = head.index('FID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
837 iidpos = head.index('IID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
838 fpos = head.index('F_MISS')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
839 elif len(ll) >= fpos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
840 fid = ll[fidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
841 #if fid.find('_') == -1: # not replicate! - icondb ids have these...
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
842 iid = ll[iidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
843 fmiss = ll[fpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
844 id = '%s%s%s' % (fid,idjoiner,iid)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
845 imissdict[id] = fmiss
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
846 idlist.append(id)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
847 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
848 logf.write('## imissfile %s contained %d ids\n' % (imissfile,len(idlist)))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
849 # ------------------mend-------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
850 # *.imendel has FID IID N
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
851 # we want N
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
852 gotmend = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
853 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
854 f = open(imendfile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
855 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
856 gotmend = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
857 for id in idlist:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
858 imenddict[id] = '0'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
859 if gotmend:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
860 for n,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
861 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
862 if n == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
863 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
864 npos = head.index('N')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
865 fidpos = head.index('FID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
866 iidpos = head.index('IID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
867 elif len(ll) >= npos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
868 fid = ll[fidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
869 iid = ll[iidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
870 id = '%s%s%s' % (fid,idjoiner,iid)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
871 nmend = ll[npos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
872 imenddict[id] = nmend
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
873 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
874 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
875 logf.write('## error No %s file - assuming not family data\n' % imendfile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
876 # ------------------sex check------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
877 #[rerla@hg fresh]$ head /home/rerla/fresh/database/files/dataset_978_files/CAMP2007Dirty.sexcheck
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
878 # sexcheck has FID IID PEDSEX SNPSEX STATUS F
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
879 ##
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
880 ## FID Family ID
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
881 ## IID Individual ID
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
882 ## PEDSEX Sex as determined in pedigree file (1=male, 2=female)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
883 ## SNPSEX Sex as determined by X chromosome
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
884 ## STATUS Displays "PROBLEM" or "OK" for each individual
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
885 ## F The actual X chromosome inbreeding (homozygosity) estimate
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
886 ##
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
887 ## A PROBLEM arises if the two sexes do not match, or if the SNP data or pedigree data are
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
888 ## ambiguous with regard to sex.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
889 ## A male call is made if F is more than 0.8; a femle call is made if F is less than 0.2.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
890 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
891 f = open(isexfile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
892 got_sexcheck = 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
893 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
894 got_sexcheck = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
895 if got_sexcheck:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
896 for n,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
897 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
898 if n == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
899 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
900 fidpos = head.index('FID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
901 iidpos = head.index('IID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
902 pedsexpos = head.index('PEDSEX')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
903 snpsexpos = head.index('SNPSEX')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
904 statuspos = head.index('STATUS')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
905 fpos = head.index('F')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
906 elif len(ll) >= fpos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
907 fid = ll[fidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
908 iid = ll[iidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
909 fest = ll[fpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
910 pedsex = ll[pedsexpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
911 snpsex = ll[snpsexpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
912 stat = ll[statuspos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
913 id = '%s%s%s' % (fid,idjoiner,iid)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
914 isexdict[id] = (pedsex,snpsex,stat,fest)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
915 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
916 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
917 # this only happens if there are no subjects!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
918 logf.write('No %s file - assuming no sex errors' % isexfile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
919 ##
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
920 ## FID IID O(HOM) E(HOM) N(NM) F
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
921 ## 457 2 490665 4.928e+05 722154 -0.009096
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
922 ## 457 3 464519 4.85e+05 710986 -0.0908
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
923 ## 1037 2 461632 4.856e+05 712025 -0.106
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
924 ## 1037 1 491845 4.906e+05 719353 0.005577
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
925 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
926 f = open(ihetfile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
927 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
928 f = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
929 logf.write('## No %s file - did we run --het in plink?\n' % ihetfile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
930 if f:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
931 for i,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
932 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
933 if i == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
934 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
935 fidpos = head.index('FID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
936 iidpos = head.index('IID')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
937 fpos = head.index('F')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
938 n = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
939 elif len(ll) >= fpos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
940 fid = ll[fidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
941 iid = ll[iidpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
942 fhet = ll[fpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
943 id = '%s%s%s' % (fid,idjoiner,iid)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
944 ihetdict[id] = fhet
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
945 f.close() # now assemble and output result list
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
946 rhead = ['famId','iId','FracMiss','Mendel_errors','Ped_sex','SNP_sex','Status','XHomEst','F_Stat']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
947 res = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
948 fres = [] # floats for sorting
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
949 for id in idlist: # for each snp in found order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
950 fid,iid = id.split(idjoiner) # recover keys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
951 f_missing = imissdict.get(id,'0.0')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
952 nmend = imenddict.get(id,'0')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
953 (pedsex,snpsex,status,fest) = isexdict.get(id,('0','0','0','0.0'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
954 fhet = ihetdict.get(id,'0.0')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
955 res.append([fid,iid,f_missing,nmend,pedsex,snpsex,status,fest,fhet])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
956 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
957 ff_missing = float(f_missing)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
958 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
959 ff_missing = 0.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
960 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
961 inmend = int(nmend)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
962 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
963 inmend = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
964 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
965 ffest = float(fest)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
966 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
967 fest = 0.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
968 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
969 ffhet = float(fhet)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
970 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
971 ffhet = 0.0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
972 fres.append([fid,iid,ff_missing,inmend,pedsex,snpsex,status,ffest,ffhet])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
973 ntokeep = max(20,len(res)/keepfrac)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
974 for i,col in enumerate(Tsorts):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
975 fres.sort(key=operator.itemgetter(col))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
976 if Treverse[i]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
977 fres.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
978 repname = Tnames[i]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
979 Tops[repname] = fres[0:ntokeep]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
980 Tops[repname] = [map(str,x) for x in Tops[repname]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
981 Tops[repname].insert(0,rhead)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
982 res.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
983 res.insert(0,rhead)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
984 logf.write('### writing %s report with %s' % ( outfile,res[0]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
985 f = open(outfile,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
986 f.write('\n'.join(['\t'.join(x) for x in res]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
987 f.write('\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
988 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
989 return res,Tops
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
990
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
991 def markerRep(froot='cleantest',outfname="mrep",newfpath='.',logf=None,maplist=None ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
992 """by marker (hwe = .hwe, missingness=.lmiss, freq = .frq)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
993 keep a list of marker order but keep all stats in dicts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
994 write out a fake xls file for R or SAS etc
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
995 kinda clunky, but..
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
996 TODO: ensure stable if any file not found?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
997 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
998 mapdict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
999 if maplist <> None:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1000 rslist = [x[1] for x in maplist]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1001 offset = [(x[0],x[3]) for x in maplist]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1002 mapdict = dict(zip(rslist,offset))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1003 hwefile = '%s.hwe' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1004 lmissfile = '%s.lmiss' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1005 freqfile = '%s.frq' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1006 lmendfile = '%s.lmendel' % froot
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1007 outfile = os.path.join(newfpath,outfname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1008 markerlist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1009 chromlist = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1010 hwedict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1011 lmissdict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1012 freqdict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1013 lmenddict = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1014 Tops = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1015 Tnames = ['Ranked Marker MAF', 'Ranked Marker Missing Genotype', 'Ranked Marker HWE', 'Ranked Marker Mendel']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1016 Tsorts = [3,6,10,11]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1017 Treverse = [False,True,True,True] # so first values are worse(r)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1018 #res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1019 #rhead = ['snp','chrom','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1020 # -------------------hwe--------------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1021 # hwe has SNP TEST GENO O(HET) E(HET) P_HWD
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1022 # we want all hwe where P_HWD <> NA
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1023 # ah changed in 1.04 to
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1024 ## CHR SNP TEST A1 A2 GENO O(HET) E(HET) P
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1025 ## 1 rs6671164 ALL 2 3 34/276/613 0.299 0.3032 0.6644
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1026 ## 1 rs6671164 AFF 2 3 0/0/0 nan nan NA
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1027 ## 1 rs6671164 UNAFF 2 3 34/276/613 0.299 0.3032 0.6644
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1028 ## 1 rs4448553 ALL 2 3 8/176/748 0.1888 0.1848 0.5975
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1029 ## 1 rs4448553 AFF 2 3 0/0/0 nan nan NA
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1030 ## 1 rs4448553 UNAFF 2 3 8/176/748 0.1888 0.1848 0.5975
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1031 ## 1 rs1990150 ALL 1 3 54/303/569 0.3272 0.3453 0.1067
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1032 ## 1 rs1990150 AFF 1 3 0/0/0 nan nan NA
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1033 ## 1 rs1990150 UNAFF 1 3 54/303/569 0.3272 0.3453 0.1067
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1034 logf.write('## marker reports starting at %s\n' % timenow())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1035 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1036 f = open(hwefile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1037 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1038 f = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1039 logf.write('## error - no hwefile %s found\n' % hwefile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1040 if f:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1041 for i,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1042 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1043 if i == 0: # head
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1044 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1045 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1046 testpos = head.index('TEST')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1047 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1048 testpos = 2 # patch for 1.04 which has b0rken headers - otherwise use head.index('TEST')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1049 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1050 ppos = head.index('P')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1051 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1052 ppos = 8 # patch - for head.index('P')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1053 snppos = head.index('SNP')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1054 chrpos = head.index('CHR')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1055 logf.write('hwe header testpos=%d,ppos=%d,snppos=%d\n' % (testpos,ppos,snppos))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1056 elif len(ll) >= ppos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1057 ps = ll[ppos].upper()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1058 rs = ll[snppos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1059 chrom = ll[chrpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1060 test = ll[testpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1061 if not hwedict.get(rs,None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1062 hwedict[rs] = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1063 markerlist.append(rs)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1064 chromlist.append(chrom) # one place to find it?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1065 lpvals = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1066 if ps.upper() <> 'NA' and ps.upper() <> 'NAN': # worth keeping
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1067 lpvals = '0'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1068 if ps <> '1':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1069 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1070 pval = float(ps)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1071 lpvals = '%f' % -math.log10(pval)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1072 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1073 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1074 hwedict[rs][test] = (ps,lpvals)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1075 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1076 logf.write('short line #%d = %s\n' % (i,ll))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1077 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1078 # ------------------missing--------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1079 """lmiss has
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1080 CHR SNP N_MISS N_GENO F_MISS
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1081 1 rs12354060 0 73 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1082 1 rs4345758 1 73 0.0137
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1083 1 rs2691310 73 73 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1084 1 rs2531266 73 73 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1085 # we want F_MISS"""
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1086 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1087 f = open(lmissfile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1088 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1089 f = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1090 if f:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1091 for i,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1092 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1093 if i == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1094 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1095 fracpos = head.index('F_MISS')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1096 npos = head.index('N_MISS')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1097 snppos = head.index('SNP')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1098 elif len(ll) >= fracpos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1099 rs = ll[snppos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1100 fracval = ll[fracpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1101 lmissdict[rs] = fracval # for now, just that?
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1102 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1103 lmissdict[rs] = 'NA'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1104 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1105 # ------------------freq-------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1106 # frq has CHR SNP A1 A2 MAF NM
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1107 # we want maf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1108 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1109 f = open(freqfile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1110 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1111 f = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1112 if f:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1113 for i,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1114 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1115 if i == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1116 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1117 mafpos = head.index('MAF')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1118 a1pos = head.index('A1')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1119 a2pos = head.index('A2')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1120 snppos = head.index('SNP')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1121 elif len(ll) >= mafpos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1122 rs = ll[snppos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1123 a1 = ll[a1pos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1124 a2 = ll[a2pos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1125 maf = ll[mafpos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1126 freqdict[rs] = (maf,a1,a2)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1127 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1128 # ------------------mend-------------------
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1129 # lmend has CHR SNP N
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1130 # we want N
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1131 gotmend = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1132 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1133 f = open(lmendfile,'r')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1134 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1135 gotmend = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1136 for rs in markerlist:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1137 lmenddict[rs] = '0'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1138 if gotmend:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1139 for i,line in enumerate(f):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1140 ll = line.strip().split()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1141 if i == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1142 head = [x.upper() for x in ll] # expect above
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1143 npos = head.index('N')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1144 snppos = head.index('SNP')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1145 elif len(ll) >= npos: # full line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1146 rs = ll[snppos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1147 nmend = ll[npos]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1148 lmenddict[rs] = nmend
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1149 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1150 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1151 logf.write('No %s file - assuming not family data\n' % lmendfile)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1152 # now assemble result list
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1153 rhead = ['snp','chromosome','offset','maf','a1','a2','missfrac','p_hwe_all','logp_hwe_all','p_hwe_unaff','logp_hwe_unaff','N_Mendel']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1154 res = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1155 fres = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1156 for rs in markerlist: # for each snp in found order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1157 f_missing = lmissdict.get(rs,'NA')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1158 maf,a1,a2 = freqdict.get(rs,('NA','NA','NA'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1159 hwe_all = hwedict[rs].get('ALL',('NA','NA')) # hope this doesn't change...
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1160 hwe_unaff = hwedict[rs].get('UNAFF',('NA','NA'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1161 nmend = lmenddict.get(rs,'NA')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1162 (chrom,offset)=mapdict.get(rs,('?','0'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1163 res.append([rs,chrom,offset,maf,a1,a2,f_missing,hwe_all[0],hwe_all[1],hwe_unaff[0],hwe_unaff[1],nmend])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1164 ntokeep = max(10,len(res)/keepfrac)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1165
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1166 def msortk(item=None):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1167 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1168 deal with non numeric sorting
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1169 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1170 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1171 return float(item)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1172 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1173 return item
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1174
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1175 for i,col in enumerate(Tsorts):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1176 res.sort(key=msortk(lambda x:x[col]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1177 if Treverse[i]:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1178 res.reverse()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1179 repname = Tnames[i]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1180 Tops[repname] = res[0:ntokeep]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1181 Tops[repname].insert(0,rhead)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1182 res.sort(key=lambda x: '%s_%10d' % (x[1].ljust(4,'0'),int(x[2]))) # in chrom offset order
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1183 res.insert(0,rhead)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1184 f = open(outfile,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1185 f.write('\n'.join(['\t'.join(x) for x in res]))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1186 f.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1187 return res,Tops
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1188
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1189
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1190
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1191
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1192 def getfSize(fpath,outpath):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1193 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1194 format a nice file size string
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1195 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1196 size = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1197 fp = os.path.join(outpath,fpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1198 if os.path.isfile(fp):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1199 n = float(os.path.getsize(fp))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1200 if n > 2**20:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1201 size = ' (%1.1f MB)' % (n/2**20)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1202 elif n > 2**10:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1203 size = ' (%1.1f KB)' % (n/2**10)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1204 elif n > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1205 size = ' (%d B)' % (int(n))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1206 return size
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1207
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1208
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1209 if __name__ == "__main__":
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1210 u = """ called in xml as
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1211 <command interpreter="python">
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1212 rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix"
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1213 -s '$html_file' -p '$html_file.files_path' -l '${GALAXY_DATA_INDEX_DIR}/rg/bin/plink'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1214 -r '${GALAXY_DATA_INDEX_DIR}/rg/bin/R'
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1215 </command>
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1216
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1217 Prepare a qc report - eg:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1218 print "%s %s -i birdlped -o birdlped -l qc.log -s htmlf -m marker.xls -s sub.xls -p ./" % (sys.executable,prog)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1219
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1220 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1221 progname = os.path.basename(sys.argv[0])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1222 if len(sys.argv) < 9:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1223 print '%s requires 6 parameters - got %d = %s' % (progname,len(sys.argv),sys.argv)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1224 sys.exit(1)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1225 parser = OptionParser(usage=u, version="%prog 0.01")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1226 a = parser.add_option
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1227 a("-i","--infile",dest="infile")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1228 a("-o","--oprefix",dest="opref")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1229 a("-l","--plinkexe",dest="plinke", default=plinke)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1230 a("-r","--rexe",dest="rexe", default=rexe)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1231 a("-s","--snps",dest="htmlf")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1232 #a("-m","--markerRaw",dest="markf")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1233 #a("-r","--rawsubject",dest="subjf")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1234 a("-p","--patho",dest="newfpath")
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1235 (options,args) = parser.parse_args()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1236 basename = os.path.split(options.infile)[-1] # just want the file prefix to find the .xls files below
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1237 killme = string.punctuation + string.whitespace
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1238 trantab = string.maketrans(killme,'_'*len(killme))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1239 opref = options.opref.translate(trantab)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1240 alogh,alog = tempfile.mkstemp(suffix='.txt')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1241 plogh,plog = tempfile.mkstemp(suffix='.txt')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1242 alogf = open(alog,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1243 plogf = open(plog,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1244 ahtmlf = options.htmlf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1245 amarkf = 'MarkerDetails_%s.xls' % opref
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1246 asubjf = 'SubjectDetails_%s.xls' % opref
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1247 newfpath = options.newfpath
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1248 newfpath = os.path.realpath(newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1249 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1250 os.makedirs(newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1251 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1252 pass
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1253 ofn = basename
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1254 bfn = options.infile
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1255 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1256 mapf = '%s.bim' % bfn
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1257 maplist = file(mapf,'r').readlines()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1258 maplist = [x.split() for x in maplist]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1259 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1260 maplist = None
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1261 alogf.write('## error - cannot open %s to read map - no offsets will be available for output files')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1262 #rerla@beast galaxy]$ head test-data/tinywga.bim
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1263 #22 rs2283802 0 21784722 4 2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1264 #22 rs2267000 0 21785366 4 2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1265 rgbin = os.path.split(rexe)[0] # get our rg bin path
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1266 #plinktasks = [' --freq',' --missing',' --mendel',' --hardy',' --check-sex'] # plink v1 fixes that bug!
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1267 # if we could, do all at once? Nope. Probably never.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1268 plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1269 ['--mendel',],['--check-sex',]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1270 vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1271 runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1272 plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1273 ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)],
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1274 ['--bfile ldp_%s --het --out %s' % (basename,basename)]]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1275 # subset of ld independent markers for eigenstrat and other requirements
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1276 vclbase = [options.plinke,'--noweb']
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1277 plogout = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1278 plogf.write('\n'.join(plogout))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1279 plogf.write('\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1280 repout = os.path.join(newfpath,basename)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1281 subjects,subjectTops = subjectRep(froot=repout,outfname=asubjf,newfpath=newfpath,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1282 logf=alogf) # writes the subject_froot.xls file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1283 markers,markerTops = markerRep(froot=repout,outfname=amarkf,newfpath=newfpath,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1284 logf=alogf,maplist=maplist) # marker_froot.xls
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1285 nbreaks = 100
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1286 s = '## starting plotpage, newfpath=%s,m=%s,s=%s/n' % (newfpath,markers[:2],subjects[:2])
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1287 alogf.write(s)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1288 print s
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1289 plotpage,cruft = makePlots(markers=markers,subjects=subjects,newfpath=newfpath,
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1290 basename=basename,nbreaks=nbreaks,height=10,width=8,rgbin=rgbin)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1291 #plotpage = RmakePlots(markers=markers,subjects=subjects,newfpath=newfpath,basename=basename,nbreaks=nbreaks,rexe=rexe)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1292
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1293 # [titles[n],plotnames[n],outfnames[n] ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1294 html = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1295 html.append('<table cellpadding="5" border="0">')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1296 size = getfSize(amarkf,newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1297 html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1298 (amarkf,'Click here to download the Marker QC Detail report file',size))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1299 size = getfSize(asubjf,newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1300 html.append('<tr><td colspan="3"><a href="%s" type="application/vnd.ms-excel">%s</a>%s tab delimited</td></tr>' % \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1301 (asubjf,'Click here to download the Subject QC Detail report file',size))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1302 for (title,url,ofname) in plotpage:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1303 ttitle = 'Ranked %s' % title
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1304 dat = subjectTops.get(ttitle,None)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1305 if not dat:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1306 dat = markerTops.get(ttitle,None)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1307 imghref = '%s.jpg' % os.path.splitext(url)[0] # removes .pdf
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1308 thumbnail = os.path.join(newfpath,imghref)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1309 if not os.path.exists(thumbnail): # for multipage pdfs, mogrify makes multiple jpgs - fugly hack
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1310 imghref = '%s-0.jpg' % os.path.splitext(url)[0] # try the first jpg
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1311 thumbnail = os.path.join(newfpath,imghref)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1312 if not os.path.exists(thumbnail):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1313 html.append('<tr><td colspan="3"><a href="%s">%s</a></td></tr>' % (url,title))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1314 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1315 html.append('<tr><td><a href="%s"><img src="%s" alt="%s" hspace="10" align="middle">' \
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1316 % (url,imghref,title))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1317 if dat: # one or the other - write as an extra file and make a link here
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1318 t = '%s.xls' % (ttitle.replace(' ','_'))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1319 fname = os.path.join(newfpath,t)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1320 f = file(fname,'w')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1321 f.write('\n'.join(['\t'.join(x) for x in dat])) # the report
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1322 size = getfSize(t,newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1323 html.append('</a></td><td>%s</td><td><a href="%s">Worst data</a>%s</td></tr>' % (title,t,size))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1324 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1325 html.append('</a></td><td>%s</td><td>&nbsp;</td></tr>' % (title))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1326 html.append('</table><hr><h3>All output files from the QC run are available below</h3>')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1327 html.append('<table cellpadding="5" border="0">\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1328 flist = os.listdir(newfpath) # we want to catch 'em all
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1329 flist.sort()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1330 for f in flist:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1331 fname = os.path.split(f)[-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1332 size = getfSize(fname,newfpath)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1333 html.append('<tr><td><a href="%s">%s</a>%s</td></tr>' % (fname,fname,size))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1334 html.append('</table>')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1335 alogf.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1336 plogf.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1337 llog = open(alog,'r').readlines()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1338 plogfile = open(plog,'r').readlines()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1339 os.unlink(alog)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1340 os.unlink(plog)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1341 llog += plogfile # add lines from pruneld log
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1342 lf = file(ahtmlf,'w') # galaxy will show this as the default view
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1343 lf.write(galhtmlprefix % progname)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1344 s = '\n<div>Output from Rgenetics QC report tool run at %s<br>\n' % (timenow())
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1345 lf.write('<h4>%s</h4>\n' % s)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1346 lf.write('</div><div><h4>(Click any preview image to download a full sized PDF version)</h4><br><ol>\n')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1347 lf.write('\n'.join(html))
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1348 lf.write('<h4>QC run log contents</h4>')
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1349 lf.write('<pre>%s</pre>' % (''.join(llog))) # plink logs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1350 if len(cruft) > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1351 lf.write('<h2>Blather from pdfnup follows:</h2><pre>%s</pre>' % (''.join(cruft))) # pdfnup
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1352 lf.write('%s\n<hr>\n' % galhtmlpostfix)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1353 lf.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1354