comparison tools/rgenetics/rgClean.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 """
2 # galaxy tool xml files can define a galaxy supplied output filename
3 # that must be passed to the tool and used to return output
4 # here, the plink log file is copied to that file and removed
5 # took a while to figure this out!
6 # use exec_before_job to give files sensible names
7 #
8 # ross april 14 2007
9 # plink cleanup script
10 # ross lazarus March 2007 for camp illumina whole genome data
11 # note problems with multiple commands being ignored - eg --freq --missing --mendel
12 # only the first seems to get done...
13 #
14 ##Summary statistics versus inclusion criteria
15 ##
16 ##Feature As summary statistic As inclusion criteria
17 ##Missingness per individual --missing --mind N
18 ##Missingness per marker --missing --geno N
19 ##Allele frequency --freq --maf N
20 ##Hardy-Weinberg equilibrium --hardy --hwe N
21 ##Mendel error rates --mendel --me N M
22 #
23 # call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile
24 # note plinkClean_code.py does some renaming before the job starts
25
26
27 <command interpreter="python2.4">
28 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf'
29 '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId'
30
31
32 """
33 import sys,shutil,os,subprocess, glob, string, tempfile, time
34 from rgutils import galhtmlprefix, timenow, plinke
35 prog = os.path.split(sys.argv[0])[-1]
36 myversion = 'January 4 2010'
37 verbose=False
38
39
40 def fixoutaff(outpath='',newaff='1'):
41 """ quick way to create test data sets - set all aff to 1 or 2 for
42 some hapmap data and then merge
43 [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam
44 1341 14 0 0 2 1
45 1341 2 13 14 2 1
46 1341 13 0 0 1 1
47 1340 9 0 0 1 1
48 1340 10 0 0 2 1
49 """
50 nchanged = 0
51 fam = '%s.fam' % outpath
52 famf = open(fam,'r')
53 fl = famf.readlines()
54 famf.close()
55 for i,row in enumerate(fl):
56 lrow = row.split()
57 if lrow[-1] <> newaff:
58 lrow[-1] = newaff
59 fl[i] = ' '.join(lrow)
60 fl[i] += '\n'
61 nchanged += 1
62 fo = open(fam,'w')
63 fo.write(''.join(fl))
64 fo.close()
65 return nchanged
66
67
68
69 def clean():
70 """
71 """
72 if len(sys.argv) < 16:
73 print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv)
74 print >> sys.stdout, """this script will filter a linkage format ped
75 and map file containing genotypes. It takes 14 parameters - the plink --f parameter and"
76 a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei"
77 documented in the plink docs plus the file to be returned to Galaxy
78 called as:
79 <command interpreter="python">
80 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
81 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
82 '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
83 </command>
84
85 """
86 sys.exit(1)
87 plog = []
88 inpath = sys.argv[1]
89 inbase = sys.argv[2]
90 killme = string.punctuation + string.whitespace
91 trantab = string.maketrans(killme,'_'*len(killme))
92 title = sys.argv[3].translate(trantab)
93 mind = sys.argv[4]
94 geno = sys.argv[5]
95 hwe = sys.argv[6]
96 maf = sys.argv[7]
97 me1 = sys.argv[8]
98 me2 = sys.argv[9]
99 outfname = sys.argv[10]
100 outfpath = sys.argv[11]
101 relf = sys.argv[12]
102 afff = sys.argv[13]
103 sexf = sys.argv[14]
104 fixaff = sys.argv[15]
105 output = os.path.join(outfpath,outfname)
106 outpath = os.path.join(outfpath,title)
107 outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
108 try:
109 os.makedirs(outfpath)
110 except:
111 pass
112 bfile = os.path.join(inpath,inbase)
113 outf = file(outfname,'w')
114 vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out',
115 outpath,'--set-hh-missing','--mind',mind,
116 '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2]
117 # yes - the --me parameter takes 2 values - mendels per snp and per family
118 if relf == 'oo': # plink filters are what they leave...
119 vcl.append('--filter-nonfounders') # leave only offspring
120 elif relf == 'fo':
121 vcl.append('--filter-founders')
122 if afff == 'affonly':
123 vcl.append('--filter-controls')
124 elif relf == 'unaffonly':
125 vcl.append('--filter-cases')
126 if sexf == 'fsex':
127 vcl.append('--filter-females')
128 elif relf == 'msex':
129 vcl.append('--filter-males')
130 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
131 retval = p.wait()
132 plog.append('%s started, called as %s' % (prog,' '.join(sys.argv)))
133 outf.write(galhtmlprefix % prog)
134 outf.write('<ul>\n')
135 plogf = '%s.log' % os.path.join(outfpath,title)
136 try:
137 plogl = file(plogf,'r').readlines()
138 plog += [x.strip() for x in plogl]
139 except:
140 plog += ['###Cannot open plink log file %s' % plogf,]
141 # if fixaff, want to 'fix' the fam file
142 if fixaff <> '0':
143 nchanged = fixoutaff(outpath=outpath,newaff=fixaff)
144 plog += ['## fixaff was requested %d subjects affection status changed to %s' % (nchanged,fixaff)]
145 pf = file(plogf,'w')
146 pf.write('\n'.join(plog))
147 pf.close()
148 globme = os.path.join(outfpath,'*')
149 flist = glob.glob(globme)
150 flist.sort()
151 for i, data in enumerate( flist ):
152 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
153 outf.write('</ul>\n')
154 outf.write("</ul></br></div></body></html>")
155 outf.close()
156
157
158 if __name__ == "__main__":
159 clean()
160