comparison tools/rgenetics/rgLDIndep.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 """
2 # oct 2009 - must make a map file in case later usage requires it...
3 # galaxy tool xml files can define a galaxy supplied output filename
4 # that must be passed to the tool and used to return output
5 # here, the plink log file is copied to that file and removed
6 # took a while to figure this out!
7 # use exec_before_job to give files sensible names
8 #
9 # ross april 14 2007
10 # plink cleanup script
11 # ross lazarus March 2007 for camp illumina whole genome data
12 # note problems with multiple commands being ignored - eg --freq --missing --mendel
13 # only the first seems to get done...
14 #
15 ##Summary statistics versus inclusion criteria
16 ##
17 ##Feature As summary statistic As inclusion criteria
18 ##Missingness per individual --missing --mind N
19 ##Missingness per marker --missing --geno N
20 ##Allele frequency --freq --maf N
21 ##Hardy-Weinberg equilibrium --hardy --hwe N
22 ##Mendel error rates --mendel --me N M
23 #
24 # this is rgLDIndep.py - main task is to decrease LD by filtering high LD pairs
25 # remove that function from rgClean.py as it may not be needed.
26
27 """
28 import sys,shutil,os,subprocess, glob, string, tempfile, time
29 from rgutils import plinke, timenow, galhtmlprefix
30
31 prog = os.path.split(sys.argv[0])[-1]
32 myversion = 'January 4 2010'
33
34
35 def pruneld(plinktasks=[] ,cd='./',vclbase = []):
36 """
37 plink blathers when doing pruning - ignore
38 Linkage disequilibrium based SNP pruning
39 if a million snps in 3 billion base pairs, have mean 3k spacing
40 assume 40-60k of ld in ceu, a window of 120k width is about 40 snps
41 so lots more is perhaps less efficient - each window computational cost is
42 ON^2 unless the code is smart enough to avoid unecessary computation where
43 allele frequencies make it impossible to see ld > the r^2 cutoff threshold
44 So, do a window and move forward 20?
45 from the plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune
46
47 Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is similar, except it is based only on pairwise genotypic correlation.
48
49 Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or --exclude option is necessary to actually perform the pruning.
50
51 The VIF pruning routine is performed:
52 plink --file data --indep 50 5 2
53
54 will create files
55
56 plink.prune.in
57 plink.prune.out
58
59 Each is a simlpe list of SNP IDs; both these files can subsequently be specified as the argument for
60 a --extract or --exclude command.
61
62 The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the
63 window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window size is too large, too many SNPs may be removed.
64
65 The second procedure is performed:
66 plink --file data --indep-pairwise 50 5 0.5
67
68 This generates the same output files as the first version; the only difference is that a
69 simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic correlation, i.e. it does not involve phasing.
70
71 To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a
72 window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 SNPs forward and repeat the procedure.
73
74 To make a new, pruned file, then use something like (in this example, we also convert the
75 standard PED fileset to a binary one):
76 plink --file data --extract plink.prune.in --make-bed --out pruneddata
77 """
78 logres = ['## Rgenetics %s: http://rgenetics.org Galaxy Tools rgLDIndep.py Plink pruneLD runner\n' % myversion,]
79 for task in plinktasks: # each is a list
80 fplog,plog = tempfile.mkstemp()
81 sto = open(plog,'w') # to catch the blather
82 vcl = vclbase + task
83 s = '## ldindep now executing %s\n' % ' '.join(vcl)
84 print s
85 logres.append(s)
86 x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
87 retval = x.wait()
88 sto.close()
89 sto = open(plog,'r') # read
90 try:
91 lplog = sto.readlines()
92 lplog = [x for x in lplog if x.find('Pruning SNP') == -1]
93 logres += lplog
94 logres.append('\n')
95 except:
96 logres.append('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl)))
97 sto.close()
98 os.unlink(plog) # no longer needed
99 return logres
100
101
102
103 def clean():
104 """
105 """
106 if len(sys.argv) < 14:
107 print >> sys.stdout, '## %s expected 14 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv)
108 print >> sys.stdout, """this script will filter a linkage format ped
109 and map file containing genotypes. It takes 14 parameters - the plink --f parameter and"
110 a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei"
111 documented in the plink docs plus the file to be returned to Galaxy
112 Called as:
113 <command interpreter="python">
114 rgLDIndep.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
115 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1'
116 '$out_file1.extra_files_path' '$window' '$step' '$r2'
117 </command>
118 """
119 sys.exit(1)
120 plog = ['## Rgenetics: http://rgenetics.org Galaxy Tools rgLDIndep.py started %s\n' % timenow()]
121 inpath = sys.argv[1]
122 inbase = sys.argv[2]
123 killme = string.punctuation + string.whitespace
124 trantab = string.maketrans(killme,'_'*len(killme))
125 title = sys.argv[3].translate(trantab)
126 mind = sys.argv[4]
127 geno = sys.argv[5]
128 hwe = sys.argv[6]
129 maf = sys.argv[7]
130 me1 = sys.argv[8]
131 me2 = sys.argv[9]
132 outfname = sys.argv[10]
133 outfpath = sys.argv[11]
134 winsize = sys.argv[12]
135 step = sys.argv[13]
136 r2 = sys.argv[14]
137 output = os.path.join(outfpath,outfname)
138 outpath = os.path.join(outfpath,title)
139 outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
140 try:
141 os.makedirs(outfpath)
142 except:
143 pass
144 bfile = os.path.join(inpath,inbase)
145 filterout = os.path.join(outpath,'filtered_%s' % inbase)
146 outf = file(outfname,'w')
147 outf.write(galhtmlprefix % prog)
148 ldin = bfile
149 plinktasks = [['--bfile',ldin,'--indep-pairwise %s %s %s' % (winsize,step,r2),'--out',outpath,
150 '--mind',mind,'--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2,],
151 ['--bfile',ldin,'--extract %s.prune.in --make-bed --out %s' % (outpath,outpath)],
152 ['--bfile',outpath,'--recode --out',outpath]] # make map file - don't really need ped but...
153 # subset of ld independent markers for eigenstrat and other requirements
154 vclbase = [plinke,'--noweb']
155 prunelog = pruneld(plinktasks=plinktasks,cd=outfpath,vclbase = vclbase)
156 """This generates the same output files as the first version;
157 the only difference is that a simple pairwise threshold is used.
158 The first two parameters (50 and 5) are the same as above (window size and step);
159 the third parameter represents the r^2 threshold.
160 Note: this represents the pairwise SNP-SNP metric now, not the
161 multiple correlation coefficient; also note, this is based on the
162 genotypic correlation, i.e. it does not involve phasing.
163 """
164 plog += prunelog
165 flog = '%s.log' % outpath
166 flogf = open(flog,'w')
167 flogf.write(''.join(plog))
168 flogf.write('\n')
169 flogf.close()
170 globme = os.path.join(outfpath,'*')
171 flist = glob.glob(globme)
172 flist.sort()
173 for i, data in enumerate( flist ):
174 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
175 outf.write('</ol></div>\n')
176 outf.write("</div></body></html>")
177 outf.close()
178
179
180 if __name__ == "__main__":
181 clean()
182