0
|
1 """
|
|
2 # galaxy tool xml files can define a galaxy supplied output filename
|
|
3 # that must be passed to the tool and used to return output
|
|
4 # here, the plink log file is copied to that file and removed
|
|
5 # took a while to figure this out!
|
|
6 # use exec_before_job to give files sensible names
|
|
7 #
|
|
8 # ross april 14 2007
|
|
9 # plink cleanup script
|
|
10 # ross lazarus March 2007 for camp illumina whole genome data
|
|
11 # note problems with multiple commands being ignored - eg --freq --missing --mendel
|
|
12 # only the first seems to get done...
|
|
13 #
|
|
14 ##Summary statistics versus inclusion criteria
|
|
15 ##
|
|
16 ##Feature As summary statistic As inclusion criteria
|
|
17 ##Missingness per individual --missing --mind N
|
|
18 ##Missingness per marker --missing --geno N
|
|
19 ##Allele frequency --freq --maf N
|
|
20 ##Hardy-Weinberg equilibrium --hardy --hwe N
|
|
21 ##Mendel error rates --mendel --me N M
|
|
22 #
|
|
23 # call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile
|
|
24 # note plinkClean_code.py does some renaming before the job starts
|
|
25
|
|
26
|
|
27 <command interpreter="python2.4">
|
|
28 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf'
|
|
29 '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId'
|
|
30
|
|
31
|
|
32 """
|
|
33 import sys,shutil,os,subprocess, glob, string, tempfile, time
|
|
34 from rgutils import galhtmlprefix, timenow, plinke
|
|
35 prog = os.path.split(sys.argv[0])[-1]
|
|
36 myversion = 'January 4 2010'
|
|
37 verbose=False
|
|
38
|
|
39
|
|
40 def fixoutaff(outpath='',newaff='1'):
|
|
41 """ quick way to create test data sets - set all aff to 1 or 2 for
|
|
42 some hapmap data and then merge
|
|
43 [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam
|
|
44 1341 14 0 0 2 1
|
|
45 1341 2 13 14 2 1
|
|
46 1341 13 0 0 1 1
|
|
47 1340 9 0 0 1 1
|
|
48 1340 10 0 0 2 1
|
|
49 """
|
|
50 nchanged = 0
|
|
51 fam = '%s.fam' % outpath
|
|
52 famf = open(fam,'r')
|
|
53 fl = famf.readlines()
|
|
54 famf.close()
|
|
55 for i,row in enumerate(fl):
|
|
56 lrow = row.split()
|
|
57 if lrow[-1] <> newaff:
|
|
58 lrow[-1] = newaff
|
|
59 fl[i] = ' '.join(lrow)
|
|
60 fl[i] += '\n'
|
|
61 nchanged += 1
|
|
62 fo = open(fam,'w')
|
|
63 fo.write(''.join(fl))
|
|
64 fo.close()
|
|
65 return nchanged
|
|
66
|
|
67
|
|
68
|
|
69 def clean():
|
|
70 """
|
|
71 """
|
|
72 if len(sys.argv) < 16:
|
|
73 print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv)
|
|
74 print >> sys.stdout, """this script will filter a linkage format ped
|
|
75 and map file containing genotypes. It takes 14 parameters - the plink --f parameter and"
|
|
76 a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei"
|
|
77 documented in the plink docs plus the file to be returned to Galaxy
|
|
78 called as:
|
|
79 <command interpreter="python">
|
|
80 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
|
|
81 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
|
|
82 '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
|
|
83 </command>
|
|
84
|
|
85 """
|
|
86 sys.exit(1)
|
|
87 plog = []
|
|
88 inpath = sys.argv[1]
|
|
89 inbase = sys.argv[2]
|
|
90 killme = string.punctuation + string.whitespace
|
|
91 trantab = string.maketrans(killme,'_'*len(killme))
|
|
92 title = sys.argv[3].translate(trantab)
|
|
93 mind = sys.argv[4]
|
|
94 geno = sys.argv[5]
|
|
95 hwe = sys.argv[6]
|
|
96 maf = sys.argv[7]
|
|
97 me1 = sys.argv[8]
|
|
98 me2 = sys.argv[9]
|
|
99 outfname = sys.argv[10]
|
|
100 outfpath = sys.argv[11]
|
|
101 relf = sys.argv[12]
|
|
102 afff = sys.argv[13]
|
|
103 sexf = sys.argv[14]
|
|
104 fixaff = sys.argv[15]
|
|
105 output = os.path.join(outfpath,outfname)
|
|
106 outpath = os.path.join(outfpath,title)
|
|
107 outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
|
|
108 try:
|
|
109 os.makedirs(outfpath)
|
|
110 except:
|
|
111 pass
|
|
112 bfile = os.path.join(inpath,inbase)
|
|
113 outf = file(outfname,'w')
|
|
114 vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out',
|
|
115 outpath,'--set-hh-missing','--mind',mind,
|
|
116 '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2]
|
|
117 # yes - the --me parameter takes 2 values - mendels per snp and per family
|
|
118 if relf == 'oo': # plink filters are what they leave...
|
|
119 vcl.append('--filter-nonfounders') # leave only offspring
|
|
120 elif relf == 'fo':
|
|
121 vcl.append('--filter-founders')
|
|
122 if afff == 'affonly':
|
|
123 vcl.append('--filter-controls')
|
|
124 elif relf == 'unaffonly':
|
|
125 vcl.append('--filter-cases')
|
|
126 if sexf == 'fsex':
|
|
127 vcl.append('--filter-females')
|
|
128 elif relf == 'msex':
|
|
129 vcl.append('--filter-males')
|
|
130 p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
|
|
131 retval = p.wait()
|
|
132 plog.append('%s started, called as %s' % (prog,' '.join(sys.argv)))
|
|
133 outf.write(galhtmlprefix % prog)
|
|
134 outf.write('<ul>\n')
|
|
135 plogf = '%s.log' % os.path.join(outfpath,title)
|
|
136 try:
|
|
137 plogl = file(plogf,'r').readlines()
|
|
138 plog += [x.strip() for x in plogl]
|
|
139 except:
|
|
140 plog += ['###Cannot open plink log file %s' % plogf,]
|
|
141 # if fixaff, want to 'fix' the fam file
|
|
142 if fixaff <> '0':
|
|
143 nchanged = fixoutaff(outpath=outpath,newaff=fixaff)
|
|
144 plog += ['## fixaff was requested %d subjects affection status changed to %s' % (nchanged,fixaff)]
|
|
145 pf = file(plogf,'w')
|
|
146 pf.write('\n'.join(plog))
|
|
147 pf.close()
|
|
148 globme = os.path.join(outfpath,'*')
|
|
149 flist = glob.glob(globme)
|
|
150 flist.sort()
|
|
151 for i, data in enumerate( flist ):
|
|
152 outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
|
|
153 outf.write('</ul>\n')
|
|
154 outf.write("</ul></br></div></body></html>")
|
|
155 outf.close()
|
|
156
|
|
157
|
|
158 if __name__ == "__main__":
|
|
159 clean()
|
|
160
|