Mercurial > repos > gbcs-embl-heidelberg > jemultiplexer
comparison jemultiplexer.py @ 2:1b79b43626ef draft
Uploaded
author | gbcs-embl-heidelberg |
---|---|
date | Wed, 03 Sep 2014 04:12:06 -0400 |
parents | |
children | 861cbe4eca25 |
comparison
equal
deleted
inserted
replaced
1:9764802ffae8 | 2:1b79b43626ef |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import os, sys, string, shutil, subprocess, tempfile, re | |
4 | |
5 def cleanup(tmpdir): | |
6 # cleanup | |
7 try: | |
8 if os.path.exists( tmpdir ): | |
9 shutil.rmtree( tmpdir ) | |
10 except Exception, e: | |
11 stop_err( 'Error cleaning. ' + str( e ) ) | |
12 | |
13 # In the unlikely event of a fire, please use the nearest emergency exit | |
14 def stop_err( msg ): | |
15 sys.stderr.write( '%s\n' % msg ) | |
16 sys.exit() | |
17 | |
18 def __main__(): | |
19 tooldir = os.path.dirname(sys.argv[0]) | |
20 executable = tooldir + "/jemultiplexer_embase_1.0.4_bundle.jar" | |
21 if not os.path.exists(executable): | |
22 stop_err( "The file \"%s\" was not found. " % ( executable ) ) | |
23 mpxdata = sys.argv[1] | |
24 output1 = sys.argv[2] | |
25 output1id = sys.argv[3] | |
26 barcodes = sys.argv[4] | |
27 barcode_list = sys.argv[5] | |
28 newfilepath = sys.argv[6] | |
29 extension = sys.argv[7] | |
30 barlen = sys.argv[8] | |
31 qualityFormat = sys.argv[9] | |
32 maxMismatches = sys.argv[10] | |
33 minBaseQuality = sys.argv[11] | |
34 minMismatchingDelta = sys.argv[12] | |
35 xTrimLen = sys.argv[13] | |
36 zTrimLen = sys.argv[14] | |
37 clipBarcode = sys.argv[15] | |
38 addBarcodeToHeader = sys.argv[16] | |
39 gzipOutput = sys.argv[17] | |
40 barcodeDiagFile = sys.argv[18] | |
41 rChar = sys.argv[19] | |
42 barcodeReadPos = sys.argv[20] | |
43 barcodeForSampleMatching = sys.argv[21] | |
44 redundantBarcode = sys.argv[22] | |
45 strict = sys.argv[23] | |
46 MpxData2 = sys.argv[24] | |
47 | |
48 ## create the tmpdir & co | |
49 tmpdir = newfilepath + "/demultiplex_" + output1id | |
50 oldnames=[] | |
51 stat = tmpdir+"/jemultiplexer_out_stats.txt" #the default output stat file name | |
52 try: | |
53 if not os.path.isdir(tmpdir): | |
54 os.mkdir(tmpdir) | |
55 except Exception, e: | |
56 stop_err( "Error creating directory \"%s\". %s" % ( tmpdir, str( e ) ) ) | |
57 | |
58 #output file extension | |
59 if gzipOutput == "true": | |
60 outputExtension=".gz" | |
61 else: | |
62 outputExtension="" | |
63 | |
64 # Reconstructing the output file names as jemultiplexer writes them | |
65 if MpxData2 != "single": | |
66 oldnames.append('unassigned_1.txt' + outputExtension) | |
67 oldnames.append('unassigned_2.txt' + outputExtension) | |
68 else: | |
69 oldnames.append('unassigned_1.txt' + outputExtension) | |
70 if barcodeDiagFile=="true": | |
71 oldnames.append('barcode_match_report.txt') | |
72 if barcode_list == "none": # If a .bs file was given | |
73 bc = open(barcodes, "r") | |
74 bcw = open( tmpdir + "/barcodes.txt", "w" ) | |
75 for line in bc.readlines(): | |
76 l = line.split() | |
77 if l[0] != "": | |
78 if MpxData2 != "single": | |
79 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) | |
80 oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension) | |
81 bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n") | |
82 else: | |
83 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) | |
84 bcw.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n") | |
85 bc.close() | |
86 bcw.close() | |
87 elif barcodes == "none": # If the text area was used | |
88 lines = barcode_list.split("__cr____cn__") | |
89 #bc = csv.writer( open( tmpdir + "/barcodes.txt", "w" ), delimiter = "\t" ) | |
90 bc = open( tmpdir + "/barcodes.txt", "w" ) | |
91 for l in lines: | |
92 l = l.replace("__tc__", " ").split() | |
93 if len(l) < 2: | |
94 continue | |
95 if l[0] != "": | |
96 if MpxData2 != "single": | |
97 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) | |
98 oldnames.append(l[0]+'_'+l[1]+'_2.txt' + outputExtension) | |
99 bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\t" + l[0]+'_'+l[1]+'_2.txt' + outputExtension + "\n") | |
100 else: | |
101 oldnames.append(l[0]+'_'+l[1]+'_1.txt' + outputExtension) | |
102 bc.write(l[0] + "\t" + l[1] + "\t" + l[0]+'_'+l[1]+'_1.txt' + outputExtension + "\n") | |
103 bc.close() | |
104 | |
105 ## Building the command line | |
106 cmd = "java -Xmx8g -jar " + executable | |
107 cmd+= " F1=" | |
108 cmd+= mpxdata | |
109 ## if we have PE data | |
110 if MpxData2 != "single": | |
111 cmd+= " F2=" | |
112 cmd+= MpxData2 | |
113 cmd+= " BPOS=" | |
114 cmd+= barcodeReadPos | |
115 cmd+= " BRED=" | |
116 cmd+= redundantBarcode | |
117 cmd+= " BM=" | |
118 cmd+= barcodeForSampleMatching | |
119 cmd+= " S=" | |
120 cmd+= strict | |
121 cmd+= " BF=" | |
122 cmd+= tmpdir + "/barcodes.txt" | |
123 cmd+= " OUTPUT_DIR=\"" | |
124 cmd+= tmpdir | |
125 cmd+= "\"" | |
126 cmd+= " BCLEN=" | |
127 cmd+= barlen | |
128 cmd+= " QUALITY_FORMAT=" | |
129 cmd+= qualityFormat | |
130 cmd+= " MAX_MISMATCHES=" | |
131 cmd+= maxMismatches | |
132 cmd+= " MIN_BASE_QUALITY=" | |
133 cmd+= minBaseQuality | |
134 cmd+= " MMD=" | |
135 cmd+= minMismatchingDelta | |
136 cmd+= " XT=" | |
137 cmd+= xTrimLen | |
138 cmd+= " ZT=" | |
139 cmd+= zTrimLen | |
140 cmd+= " C=" | |
141 cmd+= clipBarcode | |
142 cmd+= " ADD=" | |
143 cmd+= addBarcodeToHeader | |
144 cmd+= " GZ=" | |
145 cmd+= gzipOutput | |
146 if rChar=="2": | |
147 cmd+= " RCHAR=" | |
148 cmd+= ":" | |
149 elif rChar=="3": | |
150 cmd+= " RCHAR=" | |
151 cmd+= "_" | |
152 elif rChar=="4": | |
153 cmd+= " RCHAR=" | |
154 cmd+= "-" | |
155 if barcodeDiagFile=="true": | |
156 cmd+= " BARCODE_DIAG_FILE=" | |
157 cmd+= 'barcode_match_report.txt' | |
158 | |
159 | |
160 # Executing jemultiplexer | |
161 # status = os.system(cmd) | |
162 try: | |
163 tmperr = tempfile.NamedTemporaryFile( dir=tmpdir ).name | |
164 tmpout = tempfile.NamedTemporaryFile( dir=tmpdir ).name | |
165 tmp_stderr = open( tmperr, 'wb' ) | |
166 tmp_stdout = open( tmpout, 'wb' ) | |
167 proc = subprocess.Popen( args=cmd, | |
168 shell=True, | |
169 cwd=tmpdir, | |
170 stdout=tmp_stdout.fileno(), | |
171 stderr=tmp_stderr.fileno() ) | |
172 returncode = proc.wait() | |
173 tmp_stderr.close() | |
174 tmp_stdout.close() | |
175 # get stderr, allowing for case where it's very large | |
176 tmp_stderr = open( tmperr, 'rb' ) | |
177 stderr = '' | |
178 buffsize = 1048576 | |
179 try: | |
180 while True: | |
181 stderr += tmp_stderr.read( buffsize ) | |
182 if not stderr or len( stderr ) % buffsize != 0: | |
183 break | |
184 except OverflowError: | |
185 pass | |
186 tmp_stderr.close() | |
187 if returncode != 0: | |
188 raise Exception, stderr | |
189 except Exception, e: | |
190 #cleanup(tmpdir) | |
191 stop_err( 'Error demultiplexing sequence. ' + str( e ) ) | |
192 | |
193 # Creating the required paths for multiple outputs | |
194 newnames=[] | |
195 if os.path.isdir(tmpdir): | |
196 for f in oldnames: | |
197 tmpf = tmpdir+"/"+f | |
198 if os.path.isfile(tmpf): | |
199 # check the size | |
200 if os.path.getsize(tmpf) == 0: | |
201 stop_err( 'The output file: ' + f + ' is empty, there may be an error with your barcode file or settings.') | |
202 name = f | |
203 s = "primary_" | |
204 s+= output1id | |
205 s+= "_" | |
206 s+= string.replace(name, "_", "-") | |
207 s+= "_visible_" | |
208 if extension == "fastqillumina": | |
209 if gzipOutput == "true": | |
210 s+="gz" | |
211 else: | |
212 s+= extension | |
213 else: | |
214 if f=="barcode_match_report.txt": | |
215 s+="text" | |
216 else: | |
217 if gzipOutput == "true": | |
218 s+="gz" | |
219 else: | |
220 s+="fastqsanger" | |
221 newnames.append(newfilepath+"/"+s) | |
222 # Adding the appropriate prefixes to the old filenames | |
223 for i in range(len(oldnames)): | |
224 oldnames[i] = tmpdir+"/"+oldnames[i] | |
225 | |
226 ### NUMSTAT rewriting ### | |
227 htmlout = open(output1, "w") | |
228 numstat = open(stat, "r") | |
229 htmlout.write("<html><head><title>numStat</title></head><body><!--\n") | |
230 for l in numstat.readlines(): | |
231 htmlout.write(l) | |
232 numstat.seek(0) | |
233 htmlout.write("-->\n<h2>Please refresh your history to display the new datasets</h2>\n<h3>numStat</h3><table border=\"1\">\n") | |
234 htmlout.write("<tr><td>%s</td></tr>\n" % (cmd )) | |
235 for l in numstat.readlines(): | |
236 l = l.split() | |
237 htmlout.write("<tr><td>%s</td><td>%s</td></tr>\n" % (l[0], l[1]) ) | |
238 htmlout.write("</table></body></html>") | |
239 numstat.close() | |
240 htmlout.close() | |
241 | |
242 #~ # Moving the first file (the mandatory output file defined in the xml (the txt stats)) | |
243 #~ shutil.move(stat,output1) | |
244 #~ | |
245 #~ # add a warning to the numStat | |
246 #~ statfh = open(stat,'a') | |
247 #~ statfh.write("\nRefresh you library to see the new DataSets.\n") | |
248 #~ statfh.close() | |
249 | |
250 # Moving everything where it will be seen properly by Galaxy | |
251 for i in range(len(oldnames)): | |
252 shutil.move(oldnames[i],newnames[i]) | |
253 | |
254 # cleanup | |
255 cleanup(tmpdir) | |
256 | |
257 # Ta-da! | |
258 if __name__=="__main__": __main__() |