0
|
1 package _main;
|
|
2
|
|
3 import java.io.File;
|
|
4 import java.io.FileWriter;
|
|
5 import java.io.IOException;
|
|
6 import java.sql.Time;
|
|
7 import java.util.Scanner;
|
|
8
|
|
9 import middlewares.Misc;
|
|
10 import converters.Read2Wig;
|
|
11 import converters.ReadOnTarget2Wig;
|
|
12 import exceptions.ChromosomeFormatException;
|
|
13 import exceptions.ChromosomeMismatchException;
|
|
14 import exceptions.ChromosomeNotFoundException;
|
|
15 import exceptions.FileFormatException;
|
|
16 import exceptions.GenomeAnnotationException;
|
|
17 import exceptions.NullOrNegativeRangeException;
|
|
18 import exceptions.RangeFormatException;
|
|
19 import exceptions.RangeLimitNotFoundException;
|
|
20 import filters.Filter;
|
|
21 import filters.GenomeFilter;
|
|
22 import filters.ReadFilter;
|
|
23 import filters.TargetFilter;
|
|
24
|
|
25 /**
|
|
26 * This is a scheduler for the phases of the evaluation.
|
|
27 *
|
|
28 * @author Ali Abdallah
|
|
29 * @version 19.03.2011
|
|
30 * @since jdk 1.6
|
|
31 */
|
|
32
|
|
33 public class Enrichment {
|
|
34
|
|
35 String
|
|
36 /**
|
|
37 * The name of the read alignment file.
|
|
38 */
|
|
39 readFileName,
|
|
40 /**
|
|
41 * the name of the genome annotation file.
|
|
42 */
|
|
43 genomeFName,
|
|
44 /**
|
|
45 * the ucsc-name of the genome (e.g. hg19).
|
|
46 */
|
|
47 genomeName,
|
|
48 /**
|
|
49 * the name of the file of the targeted regions.
|
|
50 */
|
|
51 targetFName,
|
|
52 /**
|
|
53 * the temporary directory.
|
|
54 */
|
|
55 tmpDir,
|
|
56 /**
|
|
57 * the output directory.
|
|
58 */
|
|
59 outDir,
|
|
60 /**
|
|
61 * the xml summary file containing various overall statistics.
|
|
62 */
|
|
63 xmlSummaryFile,
|
|
64 /**
|
|
65 * the name of the detailed coverage data file.
|
|
66 */
|
|
67 detailsFileName,
|
|
68 /**
|
|
69 * the .bed file containing target-level coverage statistics data.
|
|
70 */
|
|
71 beddetailsFileName,
|
|
72 /**
|
|
73 * the process number used for unique naming.
|
|
74 */
|
|
75 proc,
|
|
76 /**
|
|
77 * the name of the read alignment file without extension.
|
|
78 */
|
|
79 prefix,
|
|
80 /**
|
|
81 * the extension of the read alignment file.
|
|
82 */
|
|
83 suffix,
|
|
84 /**
|
|
85 * The name of the read alignment file after the reduction.
|
|
86 */
|
|
87 areadFileName,
|
|
88 /**
|
|
89 * The name of the genome annotation file after the reduction.
|
|
90 */
|
|
91 aGenomeFName,
|
|
92 /**
|
|
93 * the name of the file of the targeted regions after the reduction.
|
|
94 */
|
|
95 aTargetFName;
|
|
96 int
|
|
97 /**
|
|
98 * Cutoff for poor coverage [default: 2].
|
|
99 *
|
|
100 */
|
|
101 poor,
|
|
102 /**
|
|
103 * Cutoff for high coverage [default: 200].
|
|
104 */
|
|
105 high,
|
|
106 details;
|
|
107
|
|
108 /**
|
|
109 * Construcs an enrichment object and initializes the evaluation parameters.
|
|
110 * @param args the list of all evaluation parameters.
|
|
111 */
|
|
112 public Enrichment(String... args) {
|
|
113 readFileName = args[0];
|
|
114 genomeName = args[8];
|
|
115 targetFName = args[2];
|
|
116 if (!targetFName.endsWith("bed")) {
|
|
117 System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format.");
|
|
118 }
|
|
119 if (args.length > 3) {
|
|
120 Time time = new Time(System.currentTimeMillis());
|
|
121 String nano = ""+System.nanoTime();
|
|
122
|
|
123 tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+
|
|
124 time.getHours()+"_"+
|
|
125 time.getMinutes()+"_"+
|
|
126 time.getSeconds()+"_"+nano+ "/";
|
|
127 new File(tmpDir).mkdir();
|
|
128 new File(tmpDir).setReadable(true);
|
|
129 new File(tmpDir).setWritable(true);
|
|
130 outDir = args[4] + Misc.slash(args[4]);
|
|
131
|
|
132 if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) {
|
|
133 System.out.println("File " + tmpDir
|
|
134 + " is not a directory.\nProgram stopped.");
|
|
135 System.exit(0);
|
|
136 } else if (!(new File(tmpDir).exists())) {
|
|
137 System.out.println("File " + tmpDir
|
|
138 + " not found.\nProgram stopped.");
|
|
139 System.exit(0);
|
|
140 }
|
|
141 }
|
|
142 if (args.length > 5) {
|
|
143 try {
|
|
144 poor = Integer.parseInt(args[5]);
|
|
145 high = Integer.parseInt(args[6]);
|
|
146 } catch (NumberFormatException nfe) {
|
|
147 System.out
|
|
148 .println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values.");
|
|
149 poor = 2;
|
|
150 high = 200;
|
|
151 }
|
|
152 } else {
|
|
153 poor = 2;
|
|
154 high = 200;
|
|
155 }
|
|
156
|
|
157 genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1];
|
|
158
|
|
159 prefix = Misc.prefix(readFileName);
|
|
160 if (args[7] != "none") {
|
|
161 prefix = args[7];
|
|
162 }
|
|
163 details = Integer.parseInt(args[9]);
|
|
164
|
|
165 suffix = Misc.suffix(readFileName);
|
|
166 proc = Misc.getHostName();
|
|
167 areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt";
|
|
168 aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt";
|
|
169 aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed";
|
|
170 detailsFileName = tmpDir + "coverage_" + proc + ".txt";
|
|
171 xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml";
|
|
172 beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed";
|
|
173 }
|
|
174
|
|
175 public static final String[] GENOME_NAMES = {
|
|
176 "anoGam1", "bosTau4", "cb3", "ce4", "ce6",
|
|
177 "danRer5", "danRer6", "danRer7", "dm2", "dm3",
|
|
178 "galGal2", "galGal3", "hg19", "hg18", "mm8",
|
|
179 "mm9", "panTro3", "rn3", "rn4", "susScr2"
|
|
180 };
|
|
181
|
|
182 public int genomeID(String gname){
|
|
183 for(int i = 0; i < GENOME_NAMES.length; i++)
|
|
184 if(GENOME_NAMES[i].equals(gname))
|
|
185 return i;
|
|
186 return -1;
|
|
187 }
|
|
188
|
|
189 /**
|
|
190 * Downloads the specified genome annotation, if internet connection
|
|
191 * exists.
|
|
192 *
|
|
193 * @param genome the ucsc-name of the genome. (e.g. hg18, hg19)
|
|
194 * @return the very same name of the genome.
|
|
195 * @throws GenomeAnnotationException
|
|
196 */
|
|
197
|
|
198 public String getGenomeAnnotation() throws GenomeAnnotationException{
|
|
199 try{
|
|
200 return downloadGenomeAnnotation();
|
|
201 }catch(Exception e){
|
|
202 if(genomeID(genomeName)!=-1)
|
|
203 return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt";
|
|
204 else if(new File(genomeName).exists())
|
|
205 return genomeName;
|
|
206 else
|
|
207 throw new GenomeAnnotationException();
|
|
208 }
|
|
209 }
|
|
210
|
|
211 public String getChromInfo(){
|
|
212 //genomeName
|
|
213 return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt";
|
|
214 }
|
|
215
|
|
216 public String downloadGenomeAnnotation() {
|
|
217 if (genomeName.indexOf("/") == -1) {
|
|
218 String tmpGenomeDir = tmpDir + genomeName;
|
|
219 try {
|
|
220 if (!new File(tmpGenomeDir).exists()) {
|
|
221 new File(tmpGenomeDir).mkdir();
|
|
222 }
|
|
223 File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh");
|
|
224 if (!getGenomeScript.exists()) {
|
|
225 getGenomeScript.createNewFile();
|
|
226 }
|
|
227 FileWriter fw = new FileWriter(getGenomeScript);
|
|
228
|
|
229 getGenomeScript.setReadable(true);
|
|
230 getGenomeScript.setWritable(true);
|
|
231 getGenomeScript.setExecutable(true);
|
|
232 getGenomeScript.setExecutable(true);
|
|
233 String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir)
|
|
234 + "refGene.txt.gz";
|
|
235 fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n"
|
|
236 + "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName
|
|
237 + "/database/refGene.txt.gz " + "-O " + tmpGenomeFile
|
|
238 + "\n" + "gunzip " + tmpGenomeFile + "\n");
|
|
239 fw.close();
|
|
240 Runtime.getRuntime().exec("sh " + getGenomeScript);
|
|
241 try {
|
|
242 Thread.sleep(10000);
|
|
243 } catch (InterruptedException e) {
|
|
244 e.printStackTrace();
|
|
245 }
|
|
246
|
|
247 } catch (IOException e) {
|
|
248 e.printStackTrace();
|
|
249 }
|
|
250 return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt";
|
|
251 } else {
|
|
252 return genomeName;
|
|
253 }
|
|
254 }
|
|
255
|
|
256 /**
|
|
257 * A bridge method which converts a bam file to a sam file for further
|
|
258 * processing. This method assures therefore the compatibility of the
|
|
259 * software with the bam format.
|
|
260 *
|
|
261 * @return the name of the generated sam file.
|
|
262 * @throws IOException
|
|
263 */
|
|
264 public String bam2sam() throws IOException {
|
|
265
|
|
266 String[] bampath = readFileName.split("/");
|
|
267 String bamfile = bampath[bampath.length - 1];
|
|
268 String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam";
|
|
269
|
|
270 File getBam2SamScript = new File(tmpDir + "bam2sam.sh");
|
|
271 if (!getBam2SamScript.exists()) {
|
|
272 getBam2SamScript.createNewFile();
|
|
273 }
|
|
274 FileWriter fw = new FileWriter(getBam2SamScript);
|
|
275 fw.write("#!/bin/bash\n" + Misc.binDir()
|
|
276 + "/../thirdparty/samtools/0.1.16/samtools view "
|
|
277 + readFileName + " > " + samfile);
|
|
278 fw.close();
|
|
279
|
|
280 Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript);
|
|
281 // try {
|
|
282 // @SuppressWarnings("unused")
|
|
283 // "exitstate" variable unused
|
|
284 //int exitstate = p.waitFor();
|
|
285 // } catch (InterruptedException e) {
|
|
286 //e.printStackTrace();
|
|
287 // }
|
|
288
|
|
289 Scanner stderr=new Scanner(p.getErrorStream());
|
|
290 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
|
|
291 stderr.close();
|
|
292
|
|
293 return (samfile);
|
|
294
|
|
295 }
|
|
296
|
|
297 /**
|
|
298 * First phase of the evaluation. In this phase input files are being
|
|
299 * simplified for further processing.
|
|
300 *
|
|
301 * @throws FileFormatException
|
|
302 * @throws ChromosomeMismatchException
|
|
303 */
|
|
304 public void reduceFiles() throws FileFormatException, ChromosomeMismatchException {
|
|
305 String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt";
|
|
306 String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt";
|
|
307 String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt";
|
|
308
|
|
309
|
|
310 Filter[] filters = {new ReadFilter(readFileName, aReadFileName),
|
|
311 new GenomeFilter(genomeFName, aGenomeFName),
|
|
312 new TargetFilter(targetFName, aTargetFName)};
|
|
313
|
|
314 for(Filter f: filters) f.filter();
|
|
315
|
|
316 if(!Misc.areChromosomeCompatible(new File(aReadFileName),
|
|
317 new File(aTargetFName)))
|
|
318 throw new ChromosomeMismatchException();
|
|
319
|
|
320 System.out.println("\nSTEP 1 successfully completed");
|
|
321 }
|
|
322
|
|
323 /**
|
|
324 * Second phase of the evaluation. An EnrichmentStatsComputer object is
|
|
325 * called with the right parameters and computation is checked for formal
|
|
326 * correctness.
|
|
327 */
|
|
328 public void computeTargetCoverageFiles() {
|
|
329
|
|
330 File data = new File(outDir + "/data");
|
|
331 data.mkdir();
|
|
332 File plots = new File(outDir + "/plots");
|
|
333 plots.mkdir();
|
|
334
|
|
335 EnrichmentStatsComputer esc =
|
|
336 new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir);
|
|
337
|
|
338 try {
|
|
339 esc.compute();
|
|
340 } catch (RangeFormatException e) {
|
|
341 e.printStackTrace();
|
|
342 } catch (ChromosomeFormatException e) {
|
|
343 e.printStackTrace();
|
|
344 } catch (ChromosomeNotFoundException e) {
|
|
345 e.printStackTrace();
|
|
346 } catch (RangeLimitNotFoundException e) {
|
|
347 e.printStackTrace();
|
|
348 } catch (NullOrNegativeRangeException e) {
|
|
349 e.printStackTrace();
|
|
350 } catch (IOException e) {
|
|
351 e.printStackTrace();
|
|
352 }
|
|
353
|
|
354 if (new File(detailsFileName).exists()) {
|
|
355 System.out.println("\nSTEP 2 successfully completed");
|
|
356 } else {
|
|
357 System.out.println("STEP 2 incomplete.");
|
|
358 }
|
|
359
|
|
360 }
|
|
361
|
|
362 /**
|
|
363 * Third phase of the evaluation. Based on the results from phase 2, the
|
|
364 * evaluation is further developed and evaluation data are visualized by
|
|
365 * calling "eval_enrichment.R" Script.
|
|
366 */
|
|
367 public void evaluate() {
|
|
368 Runtime rt = Runtime.getRuntime();
|
|
369 try {
|
|
370 String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R";
|
|
371 int lastSlash = outDir.lastIndexOf("/");
|
|
372 String outDirR = (lastSlash == outDir.length() - 1) ? outDir
|
|
373 .substring(0, lastSlash) : outDir;
|
|
374
|
|
375 String call = "Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " "
|
|
376 + outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " "
|
|
377 + targetFName+" "+details;
|
|
378
|
|
379 Process p=rt.exec(call);
|
|
380
|
|
381 Scanner stdout = new Scanner(p.getInputStream());
|
|
382 Scanner stderr=new Scanner(p.getErrorStream());
|
|
383 while (stdout.hasNextLine()){System.out.println(stdout.nextLine());}
|
|
384 while (stderr.hasNextLine()){System.out.println(stderr.nextLine());}
|
|
385 stdout.close(); stderr.close();
|
|
386
|
|
387 if(details == 1){
|
|
388 String chrInfoPath = getChromInfo();
|
|
389
|
|
390 //System.out.println("chrInfoPath\t"+chrInfoPath);
|
|
391 script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R";
|
|
392 outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir;
|
|
393
|
|
394 String call2 = "Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName
|
|
395 + " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " "
|
|
396 + genomeName;
|
|
397 Process p2 = rt.exec(call2);
|
|
398 stdout=new Scanner(p2.getInputStream());
|
|
399 stderr=new Scanner(p2.getErrorStream());
|
|
400
|
|
401 while(stdout.hasNextLine()){System.out.println(stdout.nextLine());}
|
|
402 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
|
|
403 stdout.close(); stderr.close();
|
|
404 }
|
|
405
|
|
406 String path = outDir+prefix+"_enrichment.html";
|
|
407 System.out.println("Created plots and HTML summary report");
|
|
408 if (new File(path).exists()) {
|
|
409 System.out.println("\nSTEP 3 successfully completed");
|
|
410 } else {
|
|
411 System.out.println("HTML FILE " + path + " not found");
|
|
412 System.out.println("\nSTEP 3 unsuccessful");
|
|
413 }
|
|
414 }
|
|
415 catch (IOException e) {
|
|
416 e.printStackTrace();
|
|
417 }
|
|
418 }
|
|
419
|
|
420 /**
|
|
421 * This is the fourth phase. In this phase the detailed datta from phase 2
|
|
422 * are converted to the wiggle format. The method uses a ReadOnTarget2Wig
|
|
423 * object to accomplish this task.
|
|
424 *
|
|
425 */
|
|
426 public void computeWiggleFile() {
|
|
427 try {
|
|
428 System.out.println("Computing wiggle file for on-target reads");
|
|
429 if (!new File(detailsFileName).exists())
|
|
430 detailsFileName = tmpDir + new File(detailsFileName).getName();
|
|
431
|
|
432 new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data",
|
|
433 Misc.prefix(readFileName)+ "\nonTarget");
|
|
434 System.out.println("Wiggle file for on-target reads created");
|
|
435 System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig");
|
|
436 File[] outputFiles = new File(outDir + "data/")
|
|
437 .listFiles();
|
|
438 for (int i = 0; i < outputFiles.length; i++) {
|
|
439 if (outputFiles[i].getName().endsWith("wig")
|
|
440 && outputFiles[i].getName().startsWith(
|
|
441 detailsFileName.substring(
|
|
442 detailsFileName.lastIndexOf("/") + 1,
|
|
443 detailsFileName.lastIndexOf(".")))) {
|
|
444 File f = new File(outDir + "data"
|
|
445 + "/" + Misc.prefix(readFileName) + "_onTarget.wig");
|
|
446 outputFiles[i].renameTo(f);
|
|
447 break;
|
|
448 }
|
|
449 }
|
|
450 if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){
|
|
451 System.out.println("\nSTEP 4 successfully completed.");
|
|
452 } else {
|
|
453 System.out.println(outDir+"data/"+ prefix
|
|
454 + "_onTarget.wig not found!\n\nSTEP 4 unsuccessful");
|
|
455 }
|
|
456 } catch (IOException e) {
|
|
457 System.err.println("Target conversion in Wig unsuccessful");
|
|
458 e.printStackTrace();
|
|
459 }
|
|
460 }
|
|
461
|
|
462 /**
|
|
463 * This is the fifth and last phase of the evaluation process. In contrast
|
|
464 * to the fourth phase, this method uses a Read2Wig object to convert the
|
|
465 * detailed data for all reads (and not only for reads on targets) to the
|
|
466 * wiggle format.
|
|
467 *
|
|
468 */
|
|
469 void computeOverallWiggleFile() {
|
|
470 try {
|
|
471 System.out.println("Computing wiggle file for all reads");
|
|
472 new Read2Wig(areadFileName, prefix, outDir + "data",
|
|
473 genomeFName, tmpDir);
|
|
474 String overallWiggleFileName = prefix + ".wig";
|
|
475 System.out.println("Wiggle file for all reads created");
|
|
476 System.out.println("Output written to " + outDir + "data/" +
|
|
477 overallWiggleFileName);
|
|
478 if (new File(outDir + "data/" + overallWiggleFileName).exists()) {
|
|
479 System.out.println("\nSTEP 5 successfully completed.");
|
|
480 } else {
|
|
481 System.out.println(outDir + "data/" + overallWiggleFileName
|
|
482 + "not found\n\nSTEP 5 unsuccessful");
|
|
483 }
|
|
484 } catch (IOException e) {
|
|
485 System.err
|
|
486 .println("Creating wiggle file for all reads was unsuccessful");
|
|
487 e.printStackTrace();
|
|
488 }
|
|
489 }
|
|
490 } |