comparison NGSrich_0.5.5/src/_main/EnrichmentStatsComputer.java @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:89ad0a9cca52
1 package _main;
2
3 import java.io.File;
4 import java.io.FileNotFoundException;
5 import java.io.FileWriter;
6 import java.io.IOException;
7 import java.util.Scanner;
8
9 import org.jdom.Element;
10
11 import datastructures.ReducedReadLine;
12 import datastructures.TargetLine;
13 import exceptions.ChromosomeFormatException;
14 import exceptions.ChromosomeNotFoundException;
15 import exceptions.NullOrNegativeRangeException;
16 import exceptions.RangeFormatException;
17 import exceptions.RangeLimitNotFoundException;
18
19 import middlewares.GeneExtractor;
20 import middlewares.HitsCounter;
21 import middlewares.Misc;
22 import middlewares.ReadCounter;
23 import middlewares.XMLSummaryFileBuilder;
24
25 /**
26 * This class is for computation of the coverage, BED coverage and statistics
27 * output files.
28 *
29 * @author Ali Abdallah
30 * @version 02.02.2011
31 * @since jdk 1.6.0
32 */
33
34 public class EnrichmentStatsComputer {
35
36 /* ***********************************************************************
37 * FIELDS OF THE CLASS
38 * ***********************************************************************/
39 private String
40 /**
41 * 4-column alignment file sorted by alignment subject and position.
42 */
43 align,
44 /**
45 * 3-column target file sorted by subject and target starting position.
46 */
47 target,
48 /**
49 * Annotation file.
50 */
51 genome,
52 /**
53 * Statistics file in the xml-format.
54 */
55 summary,
56 /**
57 * Coverage file.
58 */
59 details,
60 /**
61 * Coverage BED file.
62 */
63 targetCS,
64 /**
65 * The absolute name of the read alignment file without the extension.
66 */
67 prefix,
68
69 /**
70 * The directory under which temporary data are saved (temporary directory).
71 */
72 tmpDir,
73 /**
74 * The directory of the computed results (output directory).
75 */
76 outDir;
77
78
79 private Scanner
80 /**
81 * Scanner of the read alignment file (sam format).
82 */
83 rScanner,
84 /**
85 * Scanner of the genome annotation file.
86 */
87 gScanner,
88 /**
89 * Scanner of the file of targets.
90 */
91 tScanner;
92
93 private FileWriter
94 /**
95 * Writer used to write the computed statistics into the statistics file.
96 */
97 summaryWriter,
98 /**
99 * Writer used to write coverage data at base-level into the coverage file.
100 */
101 detailsWriter,
102 /**
103 * Writer used to write the summarized coverage data at target level into
104 * the coverage BED file.
105 */
106 targetCSWriter;
107
108 /**
109 * Counter used to counts the coverage frequencies at different levels.
110 */
111 private HitsCounter hc;
112 /**
113 * Counter used to count both: Reads exactly on the targets or overlapping
114 * the targets and Reads doing this in a scope of 100 resp. 200 bases from
115 * both end.
116 */
117 private ReadCounter rc;
118
119 /**
120 * A Gene Extractor used to extract the genes overlapping the specified
121 * targets.
122 */
123 private GeneExtractor ge;
124
125 /**
126 * Counter for the reads.
127 */
128 private int numReads = 0,
129 /**
130 * Counter for all target bases.
131 */
132 targetBases = 0,
133 /**
134 * Counter for the bases in a specific target.
135 */
136 currTargetBases = 0;
137
138 /**
139 * The average Coverage of the targets.
140 */
141 private double avCov = 0;
142 /**
143 * The average Coverage of the current observed target.
144 */
145 private double curAvCov = 0;
146 /**
147 * Flag indicating the end of the read file.
148 */
149 private boolean endReads = false;
150 /**
151 * Flag indicates if the coverage line was written or not.
152 */
153 private boolean covWritten = false;
154
155 /**
156 * Contains always the current observers read line from the sam file.
157 */
158 private ReducedReadLine rLine;
159
160
161 private int rlength;
162
163 /**
164 * ID's of the leaf-tags in the XML summary file.
165 */
166 private static final int
167 from30xPERC = 28, from30xBASES = 27,from20xPERC = 26, from20xBASES = 25,
168 from10xPERC = 24, from10xBASES = 23,from5xPERC = 22, from5xBASES = 21,
169 from1xPERC = 20, from1xBASES = 19, OV200P = 18, OV_200 = 17,
170 OV100P = 16, OV_100 = 15, OV_PERC = 14, OV = 13, ON200P = 12,
171 ON_200 = 11, ON100P = 10, ON_100 = 9, ON_PERC = 8, ON = 7,
172 STANDARD_ERROR = 6, AVERAGE_COVERAGE = 5, TARGET_BASES = 4,
173 NUMBER_OF_READS = 3, READ_LENGTH = 2, SAMPLE_NAME = 1, SAMPLE_NUMBER = 0;
174
175 /**
176 * Constructs a EnrichmentStatsComputer object.
177 *
178 * @param prefix the name of the sam file without its extension.
179 * @param proc the number of the process running the software.
180 * @param tmpDir the temporary directory
181 * @param outDir the output directory.
182 */
183 public EnrichmentStatsComputer(String prefix, String proc, String tmpDir,
184 String outDir) {
185 super();
186
187 this.prefix = prefix;
188
189 // Setting the paths.
190 this.tmpDir = tmpDir + Misc.slash(tmpDir);
191 this.outDir = outDir + Misc.slash(outDir);
192 this.align = this.tmpDir + "NGSrich_" + prefix + "_" + proc+ ".txt";
193 this.target = this.tmpDir + "NGSrich_target_" + proc + ".txt";
194 this.genome = this.tmpDir + "NGSrich_genome_" + proc + ".txt";
195 this.details = this.tmpDir + "coverage_" + proc + ".txt";
196 this.summary = this.outDir + "data/" + prefix + "_enrichment.xml";
197 this.targetCS = this.outDir + "data/" + prefix + "_enrichment.bed";
198
199 // Constructing and initializing various objects.
200 try {
201 init();
202 } catch (ChromosomeFormatException e) {
203 e.printStackTrace();
204 } catch (ChromosomeNotFoundException e) {
205 e.printStackTrace();
206 } catch (RangeFormatException e) {
207 e.printStackTrace();
208 } catch (RangeLimitNotFoundException e) {
209 e.printStackTrace();
210 } catch (IOException e) {
211 e.printStackTrace();
212 }
213 }
214
215 public void init() throws IOException, ChromosomeFormatException,
216 ChromosomeNotFoundException, RangeFormatException,
217 RangeLimitNotFoundException {
218 hc = new HitsCounter();
219 rc = new ReadCounter();
220 rScanner = new Scanner(new File(align));
221 tScanner = new Scanner(new File(target));
222 gScanner = new Scanner(new File(genome));
223 File sum = new File(summary);
224 if (!sum.exists()) {
225 sum.createNewFile();
226 }
227 summaryWriter = new FileWriter(sum);
228 detailsWriter = new FileWriter(new File(details));
229 targetCSWriter = new FileWriter(new File(targetCS));
230 rLine = new ReducedReadLine(rScanner.nextLine());
231 rlength = (rLine.isForwardRead())?rLine.end()-rLine.start()+1
232 :rLine.start()-rLine.end()+1;
233 numReads++;
234 ge = new GeneExtractor(genome);
235 }
236
237 private void finit() throws IOException {
238 summaryWriter.close();
239 rScanner.close();
240 tScanner.close();
241 gScanner.close();
242 detailsWriter.close();
243 }
244
245 /* ***********************************************************************
246 * MAIN COMPUTATION
247 * ***********************************************************************/
248 /**
249 * The main computation method.
250 *
251 * @throws IOException if the coverage file or the summary file can't be
252 * written.
253 * @throws NullOrNegativeRangeException
254 * @throws RangeLimitNotFoundException
255 * @throws ChromosomeNotFoundException
256 * @throws ChromosomeFormatException
257 * @throws RangeFormatException
258 */
259 public void compute() throws IOException,
260 RangeFormatException,
261 ChromosomeFormatException,
262 ChromosomeNotFoundException,
263 RangeLimitNotFoundException,
264 NullOrNegativeRangeException {
265 // run over all targets.
266 while (tScanner.hasNextLine()) {
267 // Set a flag indicating if the target file and the read file are
268 // at the same chromosome level. (Standard value: false)
269 boolean chromAlreadyReached = false;
270 // Read the next target.
271 TargetLine tLine = new TargetLine(tScanner.nextLine());
272 // Save the number of target bases.
273 currTargetBases = tLine.end() - tLine.start() + 1;
274 // Initialize the array of the number if hits on each base of the
275 // target.
276 int[] bases = new int[currTargetBases];
277 // Update the current total number of observed target bases.
278 targetBases += currTargetBases;
279 // Initialize the average coverage of the current target.
280 curAvCov = 0;
281 // Set a flag indicating if the coverage data for the current target
282 // are already written or not.
283 covWritten = false;
284 // Find the gene overlapping the current target by using the
285 // logarithmic time gene extractor.
286 String currTargetGene = ge.extractGene(tLine);
287
288 // Stop if end of the alignment file is reached.
289 while (!endReads) {
290 // Otherwise.
291 // Check if the alignment file and the target file are at the
292 // same chromosome level.
293 if (rLine.chrom().equals(tLine.chrom())) {
294 chromAlreadyReached = true;
295 // Target not exceeded yet.
296 if (!readExceededTarget(rLine, tLine)) {
297 // Increment "on target" and "overlapping" values.
298 incOnTargetValues(rLine, tLine);
299 incOverlapsValues(rLine, tLine, bases);
300 } else {
301 // Compute the average coverage of the current target
302 // and update the target hits values at the 5 different
303 // levels.
304 curAvCov = updateTargetHitsValues(bases);
305 // Write the coverage data for the current target.
306 writeCoverageLines(currTargetBases, curAvCov, tLine
307 + "", bases, currTargetGene);
308 // Break the while-loop, since all next reads belong
309 // to next targets.
310 break;
311 }
312 }
313 // otherwise
314 else {
315 // alignment took already place beforehand.
316 if (!covWritten && chromAlreadyReached) {
317 curAvCov = updateTargetHitsValues(bases);
318 writeCoverageLines(currTargetBases, curAvCov, tLine
319 + "", bases, currTargetGene);
320 break;
321 }
322 }
323 // If exists, get the next read for the next target and
324 // increment the number of reads by one.
325 if (rScanner.hasNextLine()) {
326 rLine = new ReducedReadLine(rScanner.nextLine());
327 numReads++;
328 }
329 // otherwise (no reads anymore).
330 else {
331 endReads = true;
332 }
333 }
334 // Reads completely processed but last coverage line not written.
335 if (endReads && covWritten == false) {
336 curAvCov = updateTargetHitsValues(bases);
337 writeCoverageLines(currTargetBases, curAvCov, tLine + "",
338 bases, currTargetGene);
339 }
340 }
341 // Count the remaining reads.
342 while (rScanner.hasNextLine()) {
343 rScanner.nextLine();
344 numReads++;
345 }
346 targetCSWriter.close();
347 System.out.println("Target BED coverage file created ("+targetCS+")");
348 System.out.println("Per-base coverage file created ("+details+")");
349
350 // Self explanatory.
351 double sd = Math.sqrt(var(targetBases, avCov));
352 writeSummary(numReads, targetBases, avCov, sd);
353
354 System.out.println("Summary statistics file created ("+summary+")");
355 finit();
356 }
357
358 /* ***********************************************************************
359 * AUXILIARY METHODS
360 * ***********************************************************************/
361 /**
362 * Updates the hits values at 5 different levels (1x, 5x, 10x, 20x, 30x) and
363 * the average target coverage.
364 *
365 * @param bases the hits on the bases of the current target.
366 * @return the average coverage of the current target.
367 */
368 private double updateTargetHitsValues(int[] bases) {
369 double curAvCov = 0;
370 for(int i = 0; i < bases.length; i++) {
371 curAvCov += bases[i];
372 hc.updateCurrentHits(bases[i]);
373 }
374 hc.updateHits();
375 avCov += curAvCov;
376 return curAvCov;
377 }
378
379 /**
380 * Writes the summary file containing overall statistics for the observed
381 * sample.
382 *
383 * @param numReads the total number of reads.
384 * @param targetBases the total number of target bases.
385 * @param avCov the average target coverage.
386 * @param sd the standard deviation of the target coverage.
387 *
388 */
389 private void writeSummary(int numReads, int targetBases, double avCov,
390 double sd){
391
392 int x1 = hc.getNextLevelHits();
393 int x5 = hc.getNextLevelHits();
394 int x10 = hc.getNextLevelHits();
395 int x20 = hc.getNextLevelHits();
396 int x30 = hc.getNextLevelHits();
397
398 XMLSummaryFileBuilder xbf =
399 new XMLSummaryFileBuilder(summary, prefix);
400 Element[] tags = xbf.getLeafs();
401 tags[SAMPLE_NUMBER].setText("1");
402 tags[SAMPLE_NAME].setText(prefix);
403 tags[READ_LENGTH].setText(rlength+"");
404 tags[NUMBER_OF_READS].setText(numReads+"");
405 tags[TARGET_BASES].setText(targetBases+"");
406 int d = 2;
407 tags[AVERAGE_COVERAGE].setText(dfloor(avCov / targetBases, d)+"");
408 tags[STANDARD_ERROR].setText(dfloor(sd, d)+"");
409
410 tags[ON].setText(rc.on()+"");
411 tags[ON_PERC].setText(dfloor((rc.on()/(double)numReads)*100, d)+"");
412 tags[ON_100].setText(rc.on100()+"");
413 tags[ON100P].setText(dfloor((rc.on100()/(double)numReads)*100, d)+"");
414 tags[ON_200].setText(rc.on200()+"");
415 tags[ON200P].setText(dfloor((rc.on200()/(double)numReads)*100, d)+"");
416
417 tags[OV].setText(rc.over()+"");
418 tags[OV_PERC].setText(dfloor((rc.over()/(double)numReads)*100, d)+"");
419 tags[OV_100].setText(rc.over100()+"");
420 tags[OV100P].setText(dfloor((rc.over100()/(double)numReads)*100, d)+"");
421 tags[OV_200].setText(rc.over200()+"");
422 tags[OV200P].setText(dfloor((rc.over200()/(double)numReads)*100, d)+"");
423
424 tags[from1xBASES].setText(x1+"");
425 tags[from1xPERC].setText(dfloor((x1/(double)targetBases)*100, d)+"");
426 tags[from5xBASES].setText(x5+"");
427 tags[from5xPERC].setText(dfloor((x5/(double)targetBases)*100, d)+"");
428 tags[from10xBASES].setText(x10+"");
429 tags[from10xPERC].setText(dfloor((x10/(double)targetBases)*100, d)+"");
430 tags[from20xBASES].setText(x20+"");
431 tags[from20xPERC].setText(dfloor((x20/(double)targetBases)*100, d)+"");
432 tags[from30xBASES].setText(x30+"");
433 tags[from30xPERC].setText(dfloor((x30/(double)targetBases)*100, d)+"");
434
435 xbf.writeXMLFile(summary);
436 }
437
438 /**
439 * Computes the variance of the target coverage.
440 *
441 * @param targetBases the total number of target bases.
442 * @param avCov the average target coverage.
443 * @return The variance of the target coverage.
444 * @throws FileNotFoundException if the coverage BED file can't be found.
445 */
446 private double var(int targetBases, double avCov)
447 throws FileNotFoundException {
448
449 Scanner mdScanner = new Scanner(new File(targetCS));
450 int n = 0;
451 double X = avCov / targetBases;
452 double sum = 0;
453
454 while (mdScanner.hasNextLine()) {
455 String mdLine = mdScanner.nextLine();
456 String[] mdFields = mdLine.split("\t");
457 double X_i = Double.parseDouble(mdFields[4]);
458 sum += (X_i - X) * (X_i - X);
459 n += 1;
460 }
461
462 sum /= n - 1;
463 return sum;
464 }
465
466 /**
467 * Writes detailed coverage data to the coverage file and a coverage line
468 * to the coverage BED file.
469 *
470 * @param currTargetBases number of bases of the current target.
471 * @param curAvCov the average coverage on the specified target tLine.
472 * @param tLine the target line.
473 * @param bases the hits on the bases of the target tLine.
474 * @param currTargetGene the gene overlapping the current target.
475 * @throws IOException if the writer fails to write to coverage line.
476 */
477 private void writeCoverageLines(int currTargetBases, double curAvCov,
478 String tLine, int[] bases, String currTargetGene)
479 throws IOException {
480 detailsWriter.write(tLine + "\r\n");
481 for(int i = 0; i < bases.length; i++)
482 detailsWriter.write(bases[i] + "\n");
483 targetCSWriter.write(tLine + "\t" + currTargetGene + "\t"
484 + dfloor((curAvCov / (double)currTargetBases), 2) + "\t"
485 + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
486 + "\t"
487 + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
488 + "\t"
489 + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
490 + "\t"
491 + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
492 + "\t"
493 + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
494 + "\r\n");
495 covWritten = true;
496 hc.resetCurrentHits();
497 }
498
499 /**
500 * Cut decimal places greater than the specified one.
501 * @param number the number to be cut.
502 * @param decimalPlace the decimalPlace after which the cut-operation
503 * take place.
504 * @return the cut number.
505 */
506 public static double dfloor(double number, int decimalPlace) {
507 return Math.floor(number * Math.pow(10, decimalPlace))
508 / Math.pow(10, decimalPlace);
509 }
510
511 /**
512 * Checks if the read exceeded the target.
513 * @param rl read line.
514 * @param tl target line.
515 * @return true if the read starts right to the end position of the target
516 * and false otherwise.
517 */
518 public static boolean readExceededTarget(ReducedReadLine rl, TargetLine tl) {
519 return !(rl.start() <= tl.end() || rl.end() <= tl.end());
520 }
521
522 /**
523 * <pre>
524 * 1. Increments the number of Reads, overlapping targets within a scope of
525 * 0, 100 and 200 bases from both ends, by one.
526 * 2. Increments the number of hits on the hitted bases by one.
527 * </pre>
528 * @param rl the read line.
529 * @param tl the target line.
530 * @param bases the base hits array of the target.
531 */
532 private void incOverlapsValues(ReducedReadLine rl, TargetLine tl, int[] bases) {
533 // Forward reads.
534 if (rl.isForwardRead()) {
535 if (rl.end() >= tl.start() && rl.start() <= tl.end()) {
536 // incrementing the number of his on the hitted bases by one.
537 int start = Math.max(0, rl.start() - tl.start());
538 int stop = Math.min(bases.length - 1, rl.end() - tl.start());
539 for (int i = start; i <= stop; i++)
540 bases[i]++;
541 // incrementing the number of Reads overlapping targets by one.
542 rc.incOver();
543 }
544 if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100)
545 // incrementing the number of Reads overlapping targets(+/-100)
546 // by one.
547 rc.incOver100();
548 if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100)
549 // incrementing the number of Reads overlapping targets(+/-200)
550 // by one.
551 rc.incOver200();
552 }
553 // Reverse reads.
554 else {
555 if (rl.start() >= tl.start() && rl.end() <= tl.end()) {
556 int start = Math.max(0, rl.end() - tl.start());
557 int stop = Math.min(bases.length - 1, rl.start() - tl.start());
558 for (int i = start; i <= stop; i++) bases[i]++;
559 rc.incOver();
560 }
561 if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100)
562 rc.incOver100();
563 if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100)
564 rc.incOver200();
565 }
566 }
567
568 /**
569 * Increments the number of Reads on targets within a scope of 0, 100 and
570 * 200 bases from both ends, by one.
571 * @param rl the read line.
572 * @param tl the target line.
573 */
574 private void incOnTargetValues(ReducedReadLine rl, TargetLine tl) {
575 // Forward read.
576 if (rl.isForwardRead()) {
577 if ((tl.start() <= rl.start()) && (rl.end() <= tl.end()))
578 rc.incOn();
579 if ((tl.start()-100 <= rl.start()) && (rl.end() <= tl.end()+100))
580 rc.incOn100();
581 if ((tl.start()-200 <= rl.start()) && (rl.end() <= tl.end()+200))
582 rc.incOn200();
583 }
584 // Reverse read.
585 else {
586 if (tl.start() <= rl.end() && rl.start() <= tl.end())
587 rc.incOn();
588 if (tl.start()-100 <= rl.end() && rl.start()<= tl.end()+100)
589 rc.incOn100();
590 if (tl.start()-200 <= rl.end() && rl.start() <= tl.end()+200)
591 rc.incOn200();
592 }
593 }
594
595 }