comparison NGSrich_0.5.5/src/_main/Enrichment.java @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:89ad0a9cca52
1 package _main;
2
3 import java.io.File;
4 import java.io.FileWriter;
5 import java.io.IOException;
6 import java.sql.Time;
7 import java.util.Scanner;
8
9 import middlewares.Misc;
10 import converters.Read2Wig;
11 import converters.ReadOnTarget2Wig;
12 import exceptions.ChromosomeFormatException;
13 import exceptions.ChromosomeMismatchException;
14 import exceptions.ChromosomeNotFoundException;
15 import exceptions.FileFormatException;
16 import exceptions.GenomeAnnotationException;
17 import exceptions.NullOrNegativeRangeException;
18 import exceptions.RangeFormatException;
19 import exceptions.RangeLimitNotFoundException;
20 import filters.Filter;
21 import filters.GenomeFilter;
22 import filters.ReadFilter;
23 import filters.TargetFilter;
24
25 /**
26 * This is a scheduler for the phases of the evaluation.
27 *
28 * @author Ali Abdallah
29 * @version 19.03.2011
30 * @since jdk 1.6
31 */
32
33 public class Enrichment {
34
35 String
36 /**
37 * The name of the read alignment file.
38 */
39 readFileName,
40 /**
41 * the name of the genome annotation file.
42 */
43 genomeFName,
44 /**
45 * the ucsc-name of the genome (e.g. hg19).
46 */
47 genomeName,
48 /**
49 * the name of the file of the targeted regions.
50 */
51 targetFName,
52 /**
53 * the temporary directory.
54 */
55 tmpDir,
56 /**
57 * the output directory.
58 */
59 outDir,
60 /**
61 * the xml summary file containing various overall statistics.
62 */
63 xmlSummaryFile,
64 /**
65 * the name of the detailed coverage data file.
66 */
67 detailsFileName,
68 /**
69 * the .bed file containing target-level coverage statistics data.
70 */
71 beddetailsFileName,
72 /**
73 * the process number used for unique naming.
74 */
75 proc,
76 /**
77 * the name of the read alignment file without extension.
78 */
79 prefix,
80 /**
81 * the extension of the read alignment file.
82 */
83 suffix,
84 /**
85 * The name of the read alignment file after the reduction.
86 */
87 areadFileName,
88 /**
89 * The name of the genome annotation file after the reduction.
90 */
91 aGenomeFName,
92 /**
93 * the name of the file of the targeted regions after the reduction.
94 */
95 aTargetFName;
96 int
97 /**
98 * Cutoff for poor coverage [default: 2].
99 *
100 */
101 poor,
102 /**
103 * Cutoff for high coverage [default: 200].
104 */
105 high,
106 details;
107
108 /**
109 * Construcs an enrichment object and initializes the evaluation parameters.
110 * @param args the list of all evaluation parameters.
111 */
112 public Enrichment(String... args) {
113 readFileName = args[0];
114 genomeName = args[8];
115 targetFName = args[2];
116 if (!targetFName.endsWith("bed")) {
117 System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format.");
118 }
119 if (args.length > 3) {
120 Time time = new Time(System.currentTimeMillis());
121 String nano = ""+System.nanoTime();
122
123 tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+
124 time.getHours()+"_"+
125 time.getMinutes()+"_"+
126 time.getSeconds()+"_"+nano+ "/";
127 new File(tmpDir).mkdir();
128 new File(tmpDir).setReadable(true);
129 new File(tmpDir).setWritable(true);
130 outDir = args[4] + Misc.slash(args[4]);
131
132 if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) {
133 System.out.println("File " + tmpDir
134 + " is not a directory.\nProgram stopped.");
135 System.exit(0);
136 } else if (!(new File(tmpDir).exists())) {
137 System.out.println("File " + tmpDir
138 + " not found.\nProgram stopped.");
139 System.exit(0);
140 }
141 }
142 if (args.length > 5) {
143 try {
144 poor = Integer.parseInt(args[5]);
145 high = Integer.parseInt(args[6]);
146 } catch (NumberFormatException nfe) {
147 System.out
148 .println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values.");
149 poor = 2;
150 high = 200;
151 }
152 } else {
153 poor = 2;
154 high = 200;
155 }
156
157 genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1];
158
159 prefix = Misc.prefix(readFileName);
160 if (args[7] != "none") {
161 prefix = args[7];
162 }
163 details = Integer.parseInt(args[9]);
164
165 suffix = Misc.suffix(readFileName);
166 proc = Misc.getHostName();
167 areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt";
168 aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt";
169 aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed";
170 detailsFileName = tmpDir + "coverage_" + proc + ".txt";
171 xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml";
172 beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed";
173 }
174
175 public static final String[] GENOME_NAMES = {
176 "anoGam1", "bosTau4", "cb3", "ce4", "ce6",
177 "danRer5", "danRer6", "danRer7", "dm2", "dm3",
178 "galGal2", "galGal3", "hg19", "hg18", "mm8",
179 "mm9", "panTro3", "rn3", "rn4", "susScr2"
180 };
181
182 public int genomeID(String gname){
183 for(int i = 0; i < GENOME_NAMES.length; i++)
184 if(GENOME_NAMES[i].equals(gname))
185 return i;
186 return -1;
187 }
188
189 /**
190 * Downloads the specified genome annotation, if internet connection
191 * exists.
192 *
193 * @param genome the ucsc-name of the genome. (e.g. hg18, hg19)
194 * @return the very same name of the genome.
195 * @throws GenomeAnnotationException
196 */
197
198 public String getGenomeAnnotation() throws GenomeAnnotationException{
199 try{
200 return downloadGenomeAnnotation();
201 }catch(Exception e){
202 if(genomeID(genomeName)!=-1)
203 return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt";
204 else if(new File(genomeName).exists())
205 return genomeName;
206 else
207 throw new GenomeAnnotationException();
208 }
209 }
210
211 public String getChromInfo(){
212 //genomeName
213 return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt";
214 }
215
216 public String downloadGenomeAnnotation() {
217 if (genomeName.indexOf("/") == -1) {
218 String tmpGenomeDir = tmpDir + genomeName;
219 try {
220 if (!new File(tmpGenomeDir).exists()) {
221 new File(tmpGenomeDir).mkdir();
222 }
223 File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh");
224 if (!getGenomeScript.exists()) {
225 getGenomeScript.createNewFile();
226 }
227 FileWriter fw = new FileWriter(getGenomeScript);
228
229 getGenomeScript.setReadable(true);
230 getGenomeScript.setWritable(true);
231 getGenomeScript.setExecutable(true);
232 getGenomeScript.setExecutable(true);
233 String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir)
234 + "refGene.txt.gz";
235 fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n"
236 + "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName
237 + "/database/refGene.txt.gz " + "-O " + tmpGenomeFile
238 + "\n" + "gunzip " + tmpGenomeFile + "\n");
239 fw.close();
240 Runtime.getRuntime().exec("sh " + getGenomeScript);
241 try {
242 Thread.sleep(10000);
243 } catch (InterruptedException e) {
244 e.printStackTrace();
245 }
246
247 } catch (IOException e) {
248 e.printStackTrace();
249 }
250 return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt";
251 } else {
252 return genomeName;
253 }
254 }
255
256 /**
257 * A bridge method which converts a bam file to a sam file for further
258 * processing. This method assures therefore the compatibility of the
259 * software with the bam format.
260 *
261 * @return the name of the generated sam file.
262 * @throws IOException
263 */
264 public String bam2sam() throws IOException {
265
266 String[] bampath = readFileName.split("/");
267 String bamfile = bampath[bampath.length - 1];
268 String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam";
269
270 File getBam2SamScript = new File(tmpDir + "bam2sam.sh");
271 if (!getBam2SamScript.exists()) {
272 getBam2SamScript.createNewFile();
273 }
274 FileWriter fw = new FileWriter(getBam2SamScript);
275 fw.write("#!/bin/bash\n" + Misc.binDir()
276 + "/../thirdparty/samtools/0.1.16/samtools view "
277 + readFileName + " > " + samfile);
278 fw.close();
279
280 Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript);
281 // try {
282 // @SuppressWarnings("unused")
283 // "exitstate" variable unused
284 //int exitstate = p.waitFor();
285 // } catch (InterruptedException e) {
286 //e.printStackTrace();
287 // }
288
289 Scanner stderr=new Scanner(p.getErrorStream());
290 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
291 stderr.close();
292
293 return (samfile);
294
295 }
296
297 /**
298 * First phase of the evaluation. In this phase input files are being
299 * simplified for further processing.
300 *
301 * @throws FileFormatException
302 * @throws ChromosomeMismatchException
303 */
304 public void reduceFiles() throws FileFormatException, ChromosomeMismatchException {
305 String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt";
306 String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt";
307 String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt";
308
309
310 Filter[] filters = {new ReadFilter(readFileName, aReadFileName),
311 new GenomeFilter(genomeFName, aGenomeFName),
312 new TargetFilter(targetFName, aTargetFName)};
313
314 for(Filter f: filters) f.filter();
315
316 if(!Misc.areChromosomeCompatible(new File(aReadFileName),
317 new File(aTargetFName)))
318 throw new ChromosomeMismatchException();
319
320 System.out.println("\nSTEP 1 successfully completed");
321 }
322
323 /**
324 * Second phase of the evaluation. An EnrichmentStatsComputer object is
325 * called with the right parameters and computation is checked for formal
326 * correctness.
327 */
328 public void computeTargetCoverageFiles() {
329
330 File data = new File(outDir + "/data");
331 data.mkdir();
332 File plots = new File(outDir + "/plots");
333 plots.mkdir();
334
335 EnrichmentStatsComputer esc =
336 new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir);
337
338 try {
339 esc.compute();
340 } catch (RangeFormatException e) {
341 e.printStackTrace();
342 } catch (ChromosomeFormatException e) {
343 e.printStackTrace();
344 } catch (ChromosomeNotFoundException e) {
345 e.printStackTrace();
346 } catch (RangeLimitNotFoundException e) {
347 e.printStackTrace();
348 } catch (NullOrNegativeRangeException e) {
349 e.printStackTrace();
350 } catch (IOException e) {
351 e.printStackTrace();
352 }
353
354 if (new File(detailsFileName).exists()) {
355 System.out.println("\nSTEP 2 successfully completed");
356 } else {
357 System.out.println("STEP 2 incomplete.");
358 }
359
360 }
361
362 /**
363 * Third phase of the evaluation. Based on the results from phase 2, the
364 * evaluation is further developed and evaluation data are visualized by
365 * calling "eval_enrichment.R" Script.
366 */
367 public void evaluate() {
368 Runtime rt = Runtime.getRuntime();
369 try {
370 String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R";
371 int lastSlash = outDir.lastIndexOf("/");
372 String outDirR = (lastSlash == outDir.length() - 1) ? outDir
373 .substring(0, lastSlash) : outDir;
374
375 String call = "Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " "
376 + outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " "
377 + targetFName+" "+details;
378
379 Process p=rt.exec(call);
380
381 Scanner stdout = new Scanner(p.getInputStream());
382 Scanner stderr=new Scanner(p.getErrorStream());
383 while (stdout.hasNextLine()){System.out.println(stdout.nextLine());}
384 while (stderr.hasNextLine()){System.out.println(stderr.nextLine());}
385 stdout.close(); stderr.close();
386
387 if(details == 1){
388 String chrInfoPath = getChromInfo();
389
390 //System.out.println("chrInfoPath\t"+chrInfoPath);
391 script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R";
392 outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir;
393
394 String call2 = "Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName
395 + " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " "
396 + genomeName;
397 Process p2 = rt.exec(call2);
398 stdout=new Scanner(p2.getInputStream());
399 stderr=new Scanner(p2.getErrorStream());
400
401 while(stdout.hasNextLine()){System.out.println(stdout.nextLine());}
402 while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
403 stdout.close(); stderr.close();
404 }
405
406 String path = outDir+prefix+"_enrichment.html";
407 System.out.println("Created plots and HTML summary report");
408 if (new File(path).exists()) {
409 System.out.println("\nSTEP 3 successfully completed");
410 } else {
411 System.out.println("HTML FILE " + path + " not found");
412 System.out.println("\nSTEP 3 unsuccessful");
413 }
414 }
415 catch (IOException e) {
416 e.printStackTrace();
417 }
418 }
419
420 /**
421 * This is the fourth phase. In this phase the detailed datta from phase 2
422 * are converted to the wiggle format. The method uses a ReadOnTarget2Wig
423 * object to accomplish this task.
424 *
425 */
426 public void computeWiggleFile() {
427 try {
428 System.out.println("Computing wiggle file for on-target reads");
429 if (!new File(detailsFileName).exists())
430 detailsFileName = tmpDir + new File(detailsFileName).getName();
431
432 new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data",
433 Misc.prefix(readFileName)+ "\nonTarget");
434 System.out.println("Wiggle file for on-target reads created");
435 System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig");
436 File[] outputFiles = new File(outDir + "data/")
437 .listFiles();
438 for (int i = 0; i < outputFiles.length; i++) {
439 if (outputFiles[i].getName().endsWith("wig")
440 && outputFiles[i].getName().startsWith(
441 detailsFileName.substring(
442 detailsFileName.lastIndexOf("/") + 1,
443 detailsFileName.lastIndexOf(".")))) {
444 File f = new File(outDir + "data"
445 + "/" + Misc.prefix(readFileName) + "_onTarget.wig");
446 outputFiles[i].renameTo(f);
447 break;
448 }
449 }
450 if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){
451 System.out.println("\nSTEP 4 successfully completed.");
452 } else {
453 System.out.println(outDir+"data/"+ prefix
454 + "_onTarget.wig not found!\n\nSTEP 4 unsuccessful");
455 }
456 } catch (IOException e) {
457 System.err.println("Target conversion in Wig unsuccessful");
458 e.printStackTrace();
459 }
460 }
461
462 /**
463 * This is the fifth and last phase of the evaluation process. In contrast
464 * to the fourth phase, this method uses a Read2Wig object to convert the
465 * detailed data for all reads (and not only for reads on targets) to the
466 * wiggle format.
467 *
468 */
469 void computeOverallWiggleFile() {
470 try {
471 System.out.println("Computing wiggle file for all reads");
472 new Read2Wig(areadFileName, prefix, outDir + "data",
473 genomeFName, tmpDir);
474 String overallWiggleFileName = prefix + ".wig";
475 System.out.println("Wiggle file for all reads created");
476 System.out.println("Output written to " + outDir + "data/" +
477 overallWiggleFileName);
478 if (new File(outDir + "data/" + overallWiggleFileName).exists()) {
479 System.out.println("\nSTEP 5 successfully completed.");
480 } else {
481 System.out.println(outDir + "data/" + overallWiggleFileName
482 + "not found\n\nSTEP 5 unsuccessful");
483 }
484 } catch (IOException e) {
485 System.err
486 .println("Creating wiggle file for all reads was unsuccessful");
487 e.printStackTrace();
488 }
489 }
490 }