Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgClean.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 """ | |
2 # galaxy tool xml files can define a galaxy supplied output filename | |
3 # that must be passed to the tool and used to return output | |
4 # here, the plink log file is copied to that file and removed | |
5 # took a while to figure this out! | |
6 # use exec_before_job to give files sensible names | |
7 # | |
8 # ross april 14 2007 | |
9 # plink cleanup script | |
10 # ross lazarus March 2007 for camp illumina whole genome data | |
11 # note problems with multiple commands being ignored - eg --freq --missing --mendel | |
12 # only the first seems to get done... | |
13 # | |
14 ##Summary statistics versus inclusion criteria | |
15 ## | |
16 ##Feature As summary statistic As inclusion criteria | |
17 ##Missingness per individual --missing --mind N | |
18 ##Missingness per marker --missing --geno N | |
19 ##Allele frequency --freq --maf N | |
20 ##Hardy-Weinberg equilibrium --hardy --hwe N | |
21 ##Mendel error rates --mendel --me N M | |
22 # | |
23 # call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile | |
24 # note plinkClean_code.py does some renaming before the job starts | |
25 | |
26 | |
27 <command interpreter="python2.4"> | |
28 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf' | |
29 '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId' | |
30 | |
31 | |
32 """ | |
33 import sys,shutil,os,subprocess, glob, string, tempfile, time | |
34 from rgutils import galhtmlprefix, timenow, plinke | |
35 prog = os.path.split(sys.argv[0])[-1] | |
36 myversion = 'January 4 2010' | |
37 verbose=False | |
38 | |
39 | |
40 def fixoutaff(outpath='',newaff='1'): | |
41 """ quick way to create test data sets - set all aff to 1 or 2 for | |
42 some hapmap data and then merge | |
43 [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam | |
44 1341 14 0 0 2 1 | |
45 1341 2 13 14 2 1 | |
46 1341 13 0 0 1 1 | |
47 1340 9 0 0 1 1 | |
48 1340 10 0 0 2 1 | |
49 """ | |
50 nchanged = 0 | |
51 fam = '%s.fam' % outpath | |
52 famf = open(fam,'r') | |
53 fl = famf.readlines() | |
54 famf.close() | |
55 for i,row in enumerate(fl): | |
56 lrow = row.split() | |
57 if lrow[-1] <> newaff: | |
58 lrow[-1] = newaff | |
59 fl[i] = ' '.join(lrow) | |
60 fl[i] += '\n' | |
61 nchanged += 1 | |
62 fo = open(fam,'w') | |
63 fo.write(''.join(fl)) | |
64 fo.close() | |
65 return nchanged | |
66 | |
67 | |
68 | |
69 def clean(): | |
70 """ | |
71 """ | |
72 if len(sys.argv) < 16: | |
73 print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv) | |
74 print >> sys.stdout, """this script will filter a linkage format ped | |
75 and map file containing genotypes. It takes 14 parameters - the plink --f parameter and" | |
76 a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei" | |
77 documented in the plink docs plus the file to be returned to Galaxy | |
78 called as: | |
79 <command interpreter="python"> | |
80 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' | |
81 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path' | |
82 '$relfilter' '$afffilter' '$sexfilter' '$fixaff' | |
83 </command> | |
84 | |
85 """ | |
86 sys.exit(1) | |
87 plog = [] | |
88 inpath = sys.argv[1] | |
89 inbase = sys.argv[2] | |
90 killme = string.punctuation + string.whitespace | |
91 trantab = string.maketrans(killme,'_'*len(killme)) | |
92 title = sys.argv[3].translate(trantab) | |
93 mind = sys.argv[4] | |
94 geno = sys.argv[5] | |
95 hwe = sys.argv[6] | |
96 maf = sys.argv[7] | |
97 me1 = sys.argv[8] | |
98 me2 = sys.argv[9] | |
99 outfname = sys.argv[10] | |
100 outfpath = sys.argv[11] | |
101 relf = sys.argv[12] | |
102 afff = sys.argv[13] | |
103 sexf = sys.argv[14] | |
104 fixaff = sys.argv[15] | |
105 output = os.path.join(outfpath,outfname) | |
106 outpath = os.path.join(outfpath,title) | |
107 outprunepath = os.path.join(outfpath,'ldprune_%s' % title) | |
108 try: | |
109 os.makedirs(outfpath) | |
110 except: | |
111 pass | |
112 bfile = os.path.join(inpath,inbase) | |
113 outf = file(outfname,'w') | |
114 vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out', | |
115 outpath,'--set-hh-missing','--mind',mind, | |
116 '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2] | |
117 # yes - the --me parameter takes 2 values - mendels per snp and per family | |
118 if relf == 'oo': # plink filters are what they leave... | |
119 vcl.append('--filter-nonfounders') # leave only offspring | |
120 elif relf == 'fo': | |
121 vcl.append('--filter-founders') | |
122 if afff == 'affonly': | |
123 vcl.append('--filter-controls') | |
124 elif relf == 'unaffonly': | |
125 vcl.append('--filter-cases') | |
126 if sexf == 'fsex': | |
127 vcl.append('--filter-females') | |
128 elif relf == 'msex': | |
129 vcl.append('--filter-males') | |
130 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath) | |
131 retval = p.wait() | |
132 plog.append('%s started, called as %s' % (prog,' '.join(sys.argv))) | |
133 outf.write(galhtmlprefix % prog) | |
134 outf.write('<ul>\n') | |
135 plogf = '%s.log' % os.path.join(outfpath,title) | |
136 try: | |
137 plogl = file(plogf,'r').readlines() | |
138 plog += [x.strip() for x in plogl] | |
139 except: | |
140 plog += ['###Cannot open plink log file %s' % plogf,] | |
141 # if fixaff, want to 'fix' the fam file | |
142 if fixaff <> '0': | |
143 nchanged = fixoutaff(outpath=outpath,newaff=fixaff) | |
144 plog += ['## fixaff was requested %d subjects affection status changed to %s' % (nchanged,fixaff)] | |
145 pf = file(plogf,'w') | |
146 pf.write('\n'.join(plog)) | |
147 pf.close() | |
148 globme = os.path.join(outfpath,'*') | |
149 flist = glob.glob(globme) | |
150 flist.sort() | |
151 for i, data in enumerate( flist ): | |
152 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) | |
153 outf.write('</ul>\n') | |
154 outf.write("</ul></br></div></body></html>") | |
155 outf.close() | |
156 | |
157 | |
158 if __name__ == "__main__": | |
159 clean() | |
160 |