Mercurial > repos > pfrommolt > ngsrich
comparison NGSrich_0.5.5/src/_main/Enrichment.java @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:89ad0a9cca52 |
---|---|
1 package _main; | |
2 | |
3 import java.io.File; | |
4 import java.io.FileWriter; | |
5 import java.io.IOException; | |
6 import java.sql.Time; | |
7 import java.util.Scanner; | |
8 | |
9 import middlewares.Misc; | |
10 import converters.Read2Wig; | |
11 import converters.ReadOnTarget2Wig; | |
12 import exceptions.ChromosomeFormatException; | |
13 import exceptions.ChromosomeMismatchException; | |
14 import exceptions.ChromosomeNotFoundException; | |
15 import exceptions.FileFormatException; | |
16 import exceptions.GenomeAnnotationException; | |
17 import exceptions.NullOrNegativeRangeException; | |
18 import exceptions.RangeFormatException; | |
19 import exceptions.RangeLimitNotFoundException; | |
20 import filters.Filter; | |
21 import filters.GenomeFilter; | |
22 import filters.ReadFilter; | |
23 import filters.TargetFilter; | |
24 | |
25 /** | |
26 * This is a scheduler for the phases of the evaluation. | |
27 * | |
28 * @author Ali Abdallah | |
29 * @version 19.03.2011 | |
30 * @since jdk 1.6 | |
31 */ | |
32 | |
33 public class Enrichment { | |
34 | |
35 String | |
36 /** | |
37 * The name of the read alignment file. | |
38 */ | |
39 readFileName, | |
40 /** | |
41 * the name of the genome annotation file. | |
42 */ | |
43 genomeFName, | |
44 /** | |
45 * the ucsc-name of the genome (e.g. hg19). | |
46 */ | |
47 genomeName, | |
48 /** | |
49 * the name of the file of the targeted regions. | |
50 */ | |
51 targetFName, | |
52 /** | |
53 * the temporary directory. | |
54 */ | |
55 tmpDir, | |
56 /** | |
57 * the output directory. | |
58 */ | |
59 outDir, | |
60 /** | |
61 * the xml summary file containing various overall statistics. | |
62 */ | |
63 xmlSummaryFile, | |
64 /** | |
65 * the name of the detailed coverage data file. | |
66 */ | |
67 detailsFileName, | |
68 /** | |
69 * the .bed file containing target-level coverage statistics data. | |
70 */ | |
71 beddetailsFileName, | |
72 /** | |
73 * the process number used for unique naming. | |
74 */ | |
75 proc, | |
76 /** | |
77 * the name of the read alignment file without extension. | |
78 */ | |
79 prefix, | |
80 /** | |
81 * the extension of the read alignment file. | |
82 */ | |
83 suffix, | |
84 /** | |
85 * The name of the read alignment file after the reduction. | |
86 */ | |
87 areadFileName, | |
88 /** | |
89 * The name of the genome annotation file after the reduction. | |
90 */ | |
91 aGenomeFName, | |
92 /** | |
93 * the name of the file of the targeted regions after the reduction. | |
94 */ | |
95 aTargetFName; | |
96 int | |
97 /** | |
98 * Cutoff for poor coverage [default: 2]. | |
99 * | |
100 */ | |
101 poor, | |
102 /** | |
103 * Cutoff for high coverage [default: 200]. | |
104 */ | |
105 high, | |
106 details; | |
107 | |
108 /** | |
109 * Construcs an enrichment object and initializes the evaluation parameters. | |
110 * @param args the list of all evaluation parameters. | |
111 */ | |
112 public Enrichment(String... args) { | |
113 readFileName = args[0]; | |
114 genomeName = args[8]; | |
115 targetFName = args[2]; | |
116 if (!targetFName.endsWith("bed")) { | |
117 System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format."); | |
118 } | |
119 if (args.length > 3) { | |
120 Time time = new Time(System.currentTimeMillis()); | |
121 String nano = ""+System.nanoTime(); | |
122 | |
123 tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+ | |
124 time.getHours()+"_"+ | |
125 time.getMinutes()+"_"+ | |
126 time.getSeconds()+"_"+nano+ "/"; | |
127 new File(tmpDir).mkdir(); | |
128 new File(tmpDir).setReadable(true); | |
129 new File(tmpDir).setWritable(true); | |
130 outDir = args[4] + Misc.slash(args[4]); | |
131 | |
132 if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) { | |
133 System.out.println("File " + tmpDir | |
134 + " is not a directory.\nProgram stopped."); | |
135 System.exit(0); | |
136 } else if (!(new File(tmpDir).exists())) { | |
137 System.out.println("File " + tmpDir | |
138 + " not found.\nProgram stopped."); | |
139 System.exit(0); | |
140 } | |
141 } | |
142 if (args.length > 5) { | |
143 try { | |
144 poor = Integer.parseInt(args[5]); | |
145 high = Integer.parseInt(args[6]); | |
146 } catch (NumberFormatException nfe) { | |
147 System.out | |
148 .println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values."); | |
149 poor = 2; | |
150 high = 200; | |
151 } | |
152 } else { | |
153 poor = 2; | |
154 high = 200; | |
155 } | |
156 | |
157 genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1]; | |
158 | |
159 prefix = Misc.prefix(readFileName); | |
160 if (args[7] != "none") { | |
161 prefix = args[7]; | |
162 } | |
163 details = Integer.parseInt(args[9]); | |
164 | |
165 suffix = Misc.suffix(readFileName); | |
166 proc = Misc.getHostName(); | |
167 areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt"; | |
168 aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt"; | |
169 aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed"; | |
170 detailsFileName = tmpDir + "coverage_" + proc + ".txt"; | |
171 xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml"; | |
172 beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed"; | |
173 } | |
174 | |
175 public static final String[] GENOME_NAMES = { | |
176 "anoGam1", "bosTau4", "cb3", "ce4", "ce6", | |
177 "danRer5", "danRer6", "danRer7", "dm2", "dm3", | |
178 "galGal2", "galGal3", "hg19", "hg18", "mm8", | |
179 "mm9", "panTro3", "rn3", "rn4", "susScr2" | |
180 }; | |
181 | |
182 public int genomeID(String gname){ | |
183 for(int i = 0; i < GENOME_NAMES.length; i++) | |
184 if(GENOME_NAMES[i].equals(gname)) | |
185 return i; | |
186 return -1; | |
187 } | |
188 | |
189 /** | |
190 * Downloads the specified genome annotation, if internet connection | |
191 * exists. | |
192 * | |
193 * @param genome the ucsc-name of the genome. (e.g. hg18, hg19) | |
194 * @return the very same name of the genome. | |
195 * @throws GenomeAnnotationException | |
196 */ | |
197 | |
198 public String getGenomeAnnotation() throws GenomeAnnotationException{ | |
199 try{ | |
200 return downloadGenomeAnnotation(); | |
201 }catch(Exception e){ | |
202 if(genomeID(genomeName)!=-1) | |
203 return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt"; | |
204 else if(new File(genomeName).exists()) | |
205 return genomeName; | |
206 else | |
207 throw new GenomeAnnotationException(); | |
208 } | |
209 } | |
210 | |
211 public String getChromInfo(){ | |
212 //genomeName | |
213 return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt"; | |
214 } | |
215 | |
216 public String downloadGenomeAnnotation() { | |
217 if (genomeName.indexOf("/") == -1) { | |
218 String tmpGenomeDir = tmpDir + genomeName; | |
219 try { | |
220 if (!new File(tmpGenomeDir).exists()) { | |
221 new File(tmpGenomeDir).mkdir(); | |
222 } | |
223 File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh"); | |
224 if (!getGenomeScript.exists()) { | |
225 getGenomeScript.createNewFile(); | |
226 } | |
227 FileWriter fw = new FileWriter(getGenomeScript); | |
228 | |
229 getGenomeScript.setReadable(true); | |
230 getGenomeScript.setWritable(true); | |
231 getGenomeScript.setExecutable(true); | |
232 getGenomeScript.setExecutable(true); | |
233 String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir) | |
234 + "refGene.txt.gz"; | |
235 fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n" | |
236 + "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName | |
237 + "/database/refGene.txt.gz " + "-O " + tmpGenomeFile | |
238 + "\n" + "gunzip " + tmpGenomeFile + "\n"); | |
239 fw.close(); | |
240 Runtime.getRuntime().exec("sh " + getGenomeScript); | |
241 try { | |
242 Thread.sleep(10000); | |
243 } catch (InterruptedException e) { | |
244 e.printStackTrace(); | |
245 } | |
246 | |
247 } catch (IOException e) { | |
248 e.printStackTrace(); | |
249 } | |
250 return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt"; | |
251 } else { | |
252 return genomeName; | |
253 } | |
254 } | |
255 | |
256 /** | |
257 * A bridge method which converts a bam file to a sam file for further | |
258 * processing. This method assures therefore the compatibility of the | |
259 * software with the bam format. | |
260 * | |
261 * @return the name of the generated sam file. | |
262 * @throws IOException | |
263 */ | |
264 public String bam2sam() throws IOException { | |
265 | |
266 String[] bampath = readFileName.split("/"); | |
267 String bamfile = bampath[bampath.length - 1]; | |
268 String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam"; | |
269 | |
270 File getBam2SamScript = new File(tmpDir + "bam2sam.sh"); | |
271 if (!getBam2SamScript.exists()) { | |
272 getBam2SamScript.createNewFile(); | |
273 } | |
274 FileWriter fw = new FileWriter(getBam2SamScript); | |
275 fw.write("#!/bin/bash\n" + Misc.binDir() | |
276 + "/../thirdparty/samtools/0.1.16/samtools view " | |
277 + readFileName + " > " + samfile); | |
278 fw.close(); | |
279 | |
280 Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript); | |
281 // try { | |
282 // @SuppressWarnings("unused") | |
283 // "exitstate" variable unused | |
284 //int exitstate = p.waitFor(); | |
285 // } catch (InterruptedException e) { | |
286 //e.printStackTrace(); | |
287 // } | |
288 | |
289 Scanner stderr=new Scanner(p.getErrorStream()); | |
290 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} | |
291 stderr.close(); | |
292 | |
293 return (samfile); | |
294 | |
295 } | |
296 | |
297 /** | |
298 * First phase of the evaluation. In this phase input files are being | |
299 * simplified for further processing. | |
300 * | |
301 * @throws FileFormatException | |
302 * @throws ChromosomeMismatchException | |
303 */ | |
304 public void reduceFiles() throws FileFormatException, ChromosomeMismatchException { | |
305 String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt"; | |
306 String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt"; | |
307 String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt"; | |
308 | |
309 | |
310 Filter[] filters = {new ReadFilter(readFileName, aReadFileName), | |
311 new GenomeFilter(genomeFName, aGenomeFName), | |
312 new TargetFilter(targetFName, aTargetFName)}; | |
313 | |
314 for(Filter f: filters) f.filter(); | |
315 | |
316 if(!Misc.areChromosomeCompatible(new File(aReadFileName), | |
317 new File(aTargetFName))) | |
318 throw new ChromosomeMismatchException(); | |
319 | |
320 System.out.println("\nSTEP 1 successfully completed"); | |
321 } | |
322 | |
323 /** | |
324 * Second phase of the evaluation. An EnrichmentStatsComputer object is | |
325 * called with the right parameters and computation is checked for formal | |
326 * correctness. | |
327 */ | |
328 public void computeTargetCoverageFiles() { | |
329 | |
330 File data = new File(outDir + "/data"); | |
331 data.mkdir(); | |
332 File plots = new File(outDir + "/plots"); | |
333 plots.mkdir(); | |
334 | |
335 EnrichmentStatsComputer esc = | |
336 new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir); | |
337 | |
338 try { | |
339 esc.compute(); | |
340 } catch (RangeFormatException e) { | |
341 e.printStackTrace(); | |
342 } catch (ChromosomeFormatException e) { | |
343 e.printStackTrace(); | |
344 } catch (ChromosomeNotFoundException e) { | |
345 e.printStackTrace(); | |
346 } catch (RangeLimitNotFoundException e) { | |
347 e.printStackTrace(); | |
348 } catch (NullOrNegativeRangeException e) { | |
349 e.printStackTrace(); | |
350 } catch (IOException e) { | |
351 e.printStackTrace(); | |
352 } | |
353 | |
354 if (new File(detailsFileName).exists()) { | |
355 System.out.println("\nSTEP 2 successfully completed"); | |
356 } else { | |
357 System.out.println("STEP 2 incomplete."); | |
358 } | |
359 | |
360 } | |
361 | |
362 /** | |
363 * Third phase of the evaluation. Based on the results from phase 2, the | |
364 * evaluation is further developed and evaluation data are visualized by | |
365 * calling "eval_enrichment.R" Script. | |
366 */ | |
367 public void evaluate() { | |
368 Runtime rt = Runtime.getRuntime(); | |
369 try { | |
370 String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R"; | |
371 int lastSlash = outDir.lastIndexOf("/"); | |
372 String outDirR = (lastSlash == outDir.length() - 1) ? outDir | |
373 .substring(0, lastSlash) : outDir; | |
374 | |
375 String call = "Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " " | |
376 + outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " " | |
377 + targetFName+" "+details; | |
378 | |
379 Process p=rt.exec(call); | |
380 | |
381 Scanner stdout = new Scanner(p.getInputStream()); | |
382 Scanner stderr=new Scanner(p.getErrorStream()); | |
383 while (stdout.hasNextLine()){System.out.println(stdout.nextLine());} | |
384 while (stderr.hasNextLine()){System.out.println(stderr.nextLine());} | |
385 stdout.close(); stderr.close(); | |
386 | |
387 if(details == 1){ | |
388 String chrInfoPath = getChromInfo(); | |
389 | |
390 //System.out.println("chrInfoPath\t"+chrInfoPath); | |
391 script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R"; | |
392 outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir; | |
393 | |
394 String call2 = "Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName | |
395 + " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " " | |
396 + genomeName; | |
397 Process p2 = rt.exec(call2); | |
398 stdout=new Scanner(p2.getInputStream()); | |
399 stderr=new Scanner(p2.getErrorStream()); | |
400 | |
401 while(stdout.hasNextLine()){System.out.println(stdout.nextLine());} | |
402 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} | |
403 stdout.close(); stderr.close(); | |
404 } | |
405 | |
406 String path = outDir+prefix+"_enrichment.html"; | |
407 System.out.println("Created plots and HTML summary report"); | |
408 if (new File(path).exists()) { | |
409 System.out.println("\nSTEP 3 successfully completed"); | |
410 } else { | |
411 System.out.println("HTML FILE " + path + " not found"); | |
412 System.out.println("\nSTEP 3 unsuccessful"); | |
413 } | |
414 } | |
415 catch (IOException e) { | |
416 e.printStackTrace(); | |
417 } | |
418 } | |
419 | |
420 /** | |
421 * This is the fourth phase. In this phase the detailed datta from phase 2 | |
422 * are converted to the wiggle format. The method uses a ReadOnTarget2Wig | |
423 * object to accomplish this task. | |
424 * | |
425 */ | |
426 public void computeWiggleFile() { | |
427 try { | |
428 System.out.println("Computing wiggle file for on-target reads"); | |
429 if (!new File(detailsFileName).exists()) | |
430 detailsFileName = tmpDir + new File(detailsFileName).getName(); | |
431 | |
432 new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data", | |
433 Misc.prefix(readFileName)+ "\nonTarget"); | |
434 System.out.println("Wiggle file for on-target reads created"); | |
435 System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig"); | |
436 File[] outputFiles = new File(outDir + "data/") | |
437 .listFiles(); | |
438 for (int i = 0; i < outputFiles.length; i++) { | |
439 if (outputFiles[i].getName().endsWith("wig") | |
440 && outputFiles[i].getName().startsWith( | |
441 detailsFileName.substring( | |
442 detailsFileName.lastIndexOf("/") + 1, | |
443 detailsFileName.lastIndexOf(".")))) { | |
444 File f = new File(outDir + "data" | |
445 + "/" + Misc.prefix(readFileName) + "_onTarget.wig"); | |
446 outputFiles[i].renameTo(f); | |
447 break; | |
448 } | |
449 } | |
450 if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){ | |
451 System.out.println("\nSTEP 4 successfully completed."); | |
452 } else { | |
453 System.out.println(outDir+"data/"+ prefix | |
454 + "_onTarget.wig not found!\n\nSTEP 4 unsuccessful"); | |
455 } | |
456 } catch (IOException e) { | |
457 System.err.println("Target conversion in Wig unsuccessful"); | |
458 e.printStackTrace(); | |
459 } | |
460 } | |
461 | |
462 /** | |
463 * This is the fifth and last phase of the evaluation process. In contrast | |
464 * to the fourth phase, this method uses a Read2Wig object to convert the | |
465 * detailed data for all reads (and not only for reads on targets) to the | |
466 * wiggle format. | |
467 * | |
468 */ | |
469 void computeOverallWiggleFile() { | |
470 try { | |
471 System.out.println("Computing wiggle file for all reads"); | |
472 new Read2Wig(areadFileName, prefix, outDir + "data", | |
473 genomeFName, tmpDir); | |
474 String overallWiggleFileName = prefix + ".wig"; | |
475 System.out.println("Wiggle file for all reads created"); | |
476 System.out.println("Output written to " + outDir + "data/" + | |
477 overallWiggleFileName); | |
478 if (new File(outDir + "data/" + overallWiggleFileName).exists()) { | |
479 System.out.println("\nSTEP 5 successfully completed."); | |
480 } else { | |
481 System.out.println(outDir + "data/" + overallWiggleFileName | |
482 + "not found\n\nSTEP 5 unsuccessful"); | |
483 } | |
484 } catch (IOException e) { | |
485 System.err | |
486 .println("Creating wiggle file for all reads was unsuccessful"); | |
487 e.printStackTrace(); | |
488 } | |
489 } | |
490 } |