Mercurial > repos > pfrommolt > ngsrich
diff NGSrich_0.5.5/src/_main/EnrichmentStatsComputer.java @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NGSrich_0.5.5/src/_main/EnrichmentStatsComputer.java Mon Nov 21 08:12:19 2011 -0500 @@ -0,0 +1,595 @@ +package _main; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.Scanner; + +import org.jdom.Element; + +import datastructures.ReducedReadLine; +import datastructures.TargetLine; +import exceptions.ChromosomeFormatException; +import exceptions.ChromosomeNotFoundException; +import exceptions.NullOrNegativeRangeException; +import exceptions.RangeFormatException; +import exceptions.RangeLimitNotFoundException; + +import middlewares.GeneExtractor; +import middlewares.HitsCounter; +import middlewares.Misc; +import middlewares.ReadCounter; +import middlewares.XMLSummaryFileBuilder; + +/** + * This class is for computation of the coverage, BED coverage and statistics + * output files. + * + * @author Ali Abdallah + * @version 02.02.2011 + * @since jdk 1.6.0 + */ + +public class EnrichmentStatsComputer { + + /* *********************************************************************** + * FIELDS OF THE CLASS + * ***********************************************************************/ + private String + /** + * 4-column alignment file sorted by alignment subject and position. + */ + align, + /** + * 3-column target file sorted by subject and target starting position. + */ + target, + /** + * Annotation file. + */ + genome, + /** + * Statistics file in the xml-format. + */ + summary, + /** + * Coverage file. + */ + details, + /** + * Coverage BED file. + */ + targetCS, + /** + * The absolute name of the read alignment file without the extension. + */ + prefix, + + /** + * The directory under which temporary data are saved (temporary directory). + */ + tmpDir, + /** + * The directory of the computed results (output directory). + */ + outDir; + + + private Scanner + /** + * Scanner of the read alignment file (sam format). + */ + rScanner, + /** + * Scanner of the genome annotation file. + */ + gScanner, + /** + * Scanner of the file of targets. + */ + tScanner; + + private FileWriter + /** + * Writer used to write the computed statistics into the statistics file. + */ + summaryWriter, + /** + * Writer used to write coverage data at base-level into the coverage file. + */ + detailsWriter, + /** + * Writer used to write the summarized coverage data at target level into + * the coverage BED file. + */ + targetCSWriter; + + /** + * Counter used to counts the coverage frequencies at different levels. + */ + private HitsCounter hc; + /** + * Counter used to count both: Reads exactly on the targets or overlapping + * the targets and Reads doing this in a scope of 100 resp. 200 bases from + * both end. + */ + private ReadCounter rc; + + /** + * A Gene Extractor used to extract the genes overlapping the specified + * targets. + */ + private GeneExtractor ge; + + /** + * Counter for the reads. + */ + private int numReads = 0, + /** + * Counter for all target bases. + */ + targetBases = 0, + /** + * Counter for the bases in a specific target. + */ + currTargetBases = 0; + + /** + * The average Coverage of the targets. + */ + private double avCov = 0; + /** + * The average Coverage of the current observed target. + */ + private double curAvCov = 0; + /** + * Flag indicating the end of the read file. + */ + private boolean endReads = false; + /** + * Flag indicates if the coverage line was written or not. + */ + private boolean covWritten = false; + + /** + * Contains always the current observers read line from the sam file. + */ + private ReducedReadLine rLine; + + + private int rlength; + + /** + * ID's of the leaf-tags in the XML summary file. + */ + private static final int + from30xPERC = 28, from30xBASES = 27,from20xPERC = 26, from20xBASES = 25, + from10xPERC = 24, from10xBASES = 23,from5xPERC = 22, from5xBASES = 21, + from1xPERC = 20, from1xBASES = 19, OV200P = 18, OV_200 = 17, + OV100P = 16, OV_100 = 15, OV_PERC = 14, OV = 13, ON200P = 12, + ON_200 = 11, ON100P = 10, ON_100 = 9, ON_PERC = 8, ON = 7, + STANDARD_ERROR = 6, AVERAGE_COVERAGE = 5, TARGET_BASES = 4, + NUMBER_OF_READS = 3, READ_LENGTH = 2, SAMPLE_NAME = 1, SAMPLE_NUMBER = 0; + + /** + * Constructs a EnrichmentStatsComputer object. + * + * @param prefix the name of the sam file without its extension. + * @param proc the number of the process running the software. + * @param tmpDir the temporary directory + * @param outDir the output directory. + */ + public EnrichmentStatsComputer(String prefix, String proc, String tmpDir, + String outDir) { + super(); + + this.prefix = prefix; + + // Setting the paths. + this.tmpDir = tmpDir + Misc.slash(tmpDir); + this.outDir = outDir + Misc.slash(outDir); + this.align = this.tmpDir + "NGSrich_" + prefix + "_" + proc+ ".txt"; + this.target = this.tmpDir + "NGSrich_target_" + proc + ".txt"; + this.genome = this.tmpDir + "NGSrich_genome_" + proc + ".txt"; + this.details = this.tmpDir + "coverage_" + proc + ".txt"; + this.summary = this.outDir + "data/" + prefix + "_enrichment.xml"; + this.targetCS = this.outDir + "data/" + prefix + "_enrichment.bed"; + + // Constructing and initializing various objects. + try { + init(); + } catch (ChromosomeFormatException e) { + e.printStackTrace(); + } catch (ChromosomeNotFoundException e) { + e.printStackTrace(); + } catch (RangeFormatException e) { + e.printStackTrace(); + } catch (RangeLimitNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public void init() throws IOException, ChromosomeFormatException, + ChromosomeNotFoundException, RangeFormatException, + RangeLimitNotFoundException { + hc = new HitsCounter(); + rc = new ReadCounter(); + rScanner = new Scanner(new File(align)); + tScanner = new Scanner(new File(target)); + gScanner = new Scanner(new File(genome)); + File sum = new File(summary); + if (!sum.exists()) { + sum.createNewFile(); + } + summaryWriter = new FileWriter(sum); + detailsWriter = new FileWriter(new File(details)); + targetCSWriter = new FileWriter(new File(targetCS)); + rLine = new ReducedReadLine(rScanner.nextLine()); + rlength = (rLine.isForwardRead())?rLine.end()-rLine.start()+1 + :rLine.start()-rLine.end()+1; + numReads++; + ge = new GeneExtractor(genome); + } + + private void finit() throws IOException { + summaryWriter.close(); + rScanner.close(); + tScanner.close(); + gScanner.close(); + detailsWriter.close(); + } + + /* *********************************************************************** + * MAIN COMPUTATION + * ***********************************************************************/ + /** + * The main computation method. + * + * @throws IOException if the coverage file or the summary file can't be + * written. + * @throws NullOrNegativeRangeException + * @throws RangeLimitNotFoundException + * @throws ChromosomeNotFoundException + * @throws ChromosomeFormatException + * @throws RangeFormatException + */ + public void compute() throws IOException, + RangeFormatException, + ChromosomeFormatException, + ChromosomeNotFoundException, + RangeLimitNotFoundException, + NullOrNegativeRangeException { + // run over all targets. + while (tScanner.hasNextLine()) { + // Set a flag indicating if the target file and the read file are + // at the same chromosome level. (Standard value: false) + boolean chromAlreadyReached = false; + // Read the next target. + TargetLine tLine = new TargetLine(tScanner.nextLine()); + // Save the number of target bases. + currTargetBases = tLine.end() - tLine.start() + 1; + // Initialize the array of the number if hits on each base of the + // target. + int[] bases = new int[currTargetBases]; + // Update the current total number of observed target bases. + targetBases += currTargetBases; + // Initialize the average coverage of the current target. + curAvCov = 0; + // Set a flag indicating if the coverage data for the current target + // are already written or not. + covWritten = false; + // Find the gene overlapping the current target by using the + // logarithmic time gene extractor. + String currTargetGene = ge.extractGene(tLine); + + // Stop if end of the alignment file is reached. + while (!endReads) { + // Otherwise. + // Check if the alignment file and the target file are at the + // same chromosome level. + if (rLine.chrom().equals(tLine.chrom())) { + chromAlreadyReached = true; + // Target not exceeded yet. + if (!readExceededTarget(rLine, tLine)) { + // Increment "on target" and "overlapping" values. + incOnTargetValues(rLine, tLine); + incOverlapsValues(rLine, tLine, bases); + } else { + // Compute the average coverage of the current target + // and update the target hits values at the 5 different + // levels. + curAvCov = updateTargetHitsValues(bases); + // Write the coverage data for the current target. + writeCoverageLines(currTargetBases, curAvCov, tLine + + "", bases, currTargetGene); + // Break the while-loop, since all next reads belong + // to next targets. + break; + } + } + // otherwise + else { + // alignment took already place beforehand. + if (!covWritten && chromAlreadyReached) { + curAvCov = updateTargetHitsValues(bases); + writeCoverageLines(currTargetBases, curAvCov, tLine + + "", bases, currTargetGene); + break; + } + } + // If exists, get the next read for the next target and + // increment the number of reads by one. + if (rScanner.hasNextLine()) { + rLine = new ReducedReadLine(rScanner.nextLine()); + numReads++; + } + // otherwise (no reads anymore). + else { + endReads = true; + } + } + // Reads completely processed but last coverage line not written. + if (endReads && covWritten == false) { + curAvCov = updateTargetHitsValues(bases); + writeCoverageLines(currTargetBases, curAvCov, tLine + "", + bases, currTargetGene); + } + } + // Count the remaining reads. + while (rScanner.hasNextLine()) { + rScanner.nextLine(); + numReads++; + } + targetCSWriter.close(); + System.out.println("Target BED coverage file created ("+targetCS+")"); + System.out.println("Per-base coverage file created ("+details+")"); + + // Self explanatory. + double sd = Math.sqrt(var(targetBases, avCov)); + writeSummary(numReads, targetBases, avCov, sd); + + System.out.println("Summary statistics file created ("+summary+")"); + finit(); + } + + /* *********************************************************************** + * AUXILIARY METHODS + * ***********************************************************************/ + /** + * Updates the hits values at 5 different levels (1x, 5x, 10x, 20x, 30x) and + * the average target coverage. + * + * @param bases the hits on the bases of the current target. + * @return the average coverage of the current target. + */ + private double updateTargetHitsValues(int[] bases) { + double curAvCov = 0; + for(int i = 0; i < bases.length; i++) { + curAvCov += bases[i]; + hc.updateCurrentHits(bases[i]); + } + hc.updateHits(); + avCov += curAvCov; + return curAvCov; + } + + /** + * Writes the summary file containing overall statistics for the observed + * sample. + * + * @param numReads the total number of reads. + * @param targetBases the total number of target bases. + * @param avCov the average target coverage. + * @param sd the standard deviation of the target coverage. + * + */ + private void writeSummary(int numReads, int targetBases, double avCov, + double sd){ + + int x1 = hc.getNextLevelHits(); + int x5 = hc.getNextLevelHits(); + int x10 = hc.getNextLevelHits(); + int x20 = hc.getNextLevelHits(); + int x30 = hc.getNextLevelHits(); + + XMLSummaryFileBuilder xbf = + new XMLSummaryFileBuilder(summary, prefix); + Element[] tags = xbf.getLeafs(); + tags[SAMPLE_NUMBER].setText("1"); + tags[SAMPLE_NAME].setText(prefix); + tags[READ_LENGTH].setText(rlength+""); + tags[NUMBER_OF_READS].setText(numReads+""); + tags[TARGET_BASES].setText(targetBases+""); + int d = 2; + tags[AVERAGE_COVERAGE].setText(dfloor(avCov / targetBases, d)+""); + tags[STANDARD_ERROR].setText(dfloor(sd, d)+""); + + tags[ON].setText(rc.on()+""); + tags[ON_PERC].setText(dfloor((rc.on()/(double)numReads)*100, d)+""); + tags[ON_100].setText(rc.on100()+""); + tags[ON100P].setText(dfloor((rc.on100()/(double)numReads)*100, d)+""); + tags[ON_200].setText(rc.on200()+""); + tags[ON200P].setText(dfloor((rc.on200()/(double)numReads)*100, d)+""); + + tags[OV].setText(rc.over()+""); + tags[OV_PERC].setText(dfloor((rc.over()/(double)numReads)*100, d)+""); + tags[OV_100].setText(rc.over100()+""); + tags[OV100P].setText(dfloor((rc.over100()/(double)numReads)*100, d)+""); + tags[OV_200].setText(rc.over200()+""); + tags[OV200P].setText(dfloor((rc.over200()/(double)numReads)*100, d)+""); + + tags[from1xBASES].setText(x1+""); + tags[from1xPERC].setText(dfloor((x1/(double)targetBases)*100, d)+""); + tags[from5xBASES].setText(x5+""); + tags[from5xPERC].setText(dfloor((x5/(double)targetBases)*100, d)+""); + tags[from10xBASES].setText(x10+""); + tags[from10xPERC].setText(dfloor((x10/(double)targetBases)*100, d)+""); + tags[from20xBASES].setText(x20+""); + tags[from20xPERC].setText(dfloor((x20/(double)targetBases)*100, d)+""); + tags[from30xBASES].setText(x30+""); + tags[from30xPERC].setText(dfloor((x30/(double)targetBases)*100, d)+""); + + xbf.writeXMLFile(summary); + } + + /** + * Computes the variance of the target coverage. + * + * @param targetBases the total number of target bases. + * @param avCov the average target coverage. + * @return The variance of the target coverage. + * @throws FileNotFoundException if the coverage BED file can't be found. + */ + private double var(int targetBases, double avCov) + throws FileNotFoundException { + + Scanner mdScanner = new Scanner(new File(targetCS)); + int n = 0; + double X = avCov / targetBases; + double sum = 0; + + while (mdScanner.hasNextLine()) { + String mdLine = mdScanner.nextLine(); + String[] mdFields = mdLine.split("\t"); + double X_i = Double.parseDouble(mdFields[4]); + sum += (X_i - X) * (X_i - X); + n += 1; + } + + sum /= n - 1; + return sum; + } + + /** + * Writes detailed coverage data to the coverage file and a coverage line + * to the coverage BED file. + * + * @param currTargetBases number of bases of the current target. + * @param curAvCov the average coverage on the specified target tLine. + * @param tLine the target line. + * @param bases the hits on the bases of the target tLine. + * @param currTargetGene the gene overlapping the current target. + * @throws IOException if the writer fails to write to coverage line. + */ + private void writeCoverageLines(int currTargetBases, double curAvCov, + String tLine, int[] bases, String currTargetGene) + throws IOException { + detailsWriter.write(tLine + "\r\n"); + for(int i = 0; i < bases.length; i++) + detailsWriter.write(bases[i] + "\n"); + targetCSWriter.write(tLine + "\t" + currTargetGene + "\t" + + dfloor((curAvCov / (double)currTargetBases), 2) + "\t" + + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + + "\t" + + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + + "\t" + + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + + "\t" + + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + + "\t" + + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + + "\r\n"); + covWritten = true; + hc.resetCurrentHits(); + } + + /** + * Cut decimal places greater than the specified one. + * @param number the number to be cut. + * @param decimalPlace the decimalPlace after which the cut-operation + * take place. + * @return the cut number. + */ + public static double dfloor(double number, int decimalPlace) { + return Math.floor(number * Math.pow(10, decimalPlace)) + / Math.pow(10, decimalPlace); + } + + /** + * Checks if the read exceeded the target. + * @param rl read line. + * @param tl target line. + * @return true if the read starts right to the end position of the target + * and false otherwise. + */ + public static boolean readExceededTarget(ReducedReadLine rl, TargetLine tl) { + return !(rl.start() <= tl.end() || rl.end() <= tl.end()); + } + + /** + * <pre> + * 1. Increments the number of Reads, overlapping targets within a scope of + * 0, 100 and 200 bases from both ends, by one. + * 2. Increments the number of hits on the hitted bases by one. + * </pre> + * @param rl the read line. + * @param tl the target line. + * @param bases the base hits array of the target. + */ + private void incOverlapsValues(ReducedReadLine rl, TargetLine tl, int[] bases) { + // Forward reads. + if (rl.isForwardRead()) { + if (rl.end() >= tl.start() && rl.start() <= tl.end()) { + // incrementing the number of his on the hitted bases by one. + int start = Math.max(0, rl.start() - tl.start()); + int stop = Math.min(bases.length - 1, rl.end() - tl.start()); + for (int i = start; i <= stop; i++) + bases[i]++; + // incrementing the number of Reads overlapping targets by one. + rc.incOver(); + } + if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100) + // incrementing the number of Reads overlapping targets(+/-100) + // by one. + rc.incOver100(); + if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100) + // incrementing the number of Reads overlapping targets(+/-200) + // by one. + rc.incOver200(); + } + // Reverse reads. + else { + if (rl.start() >= tl.start() && rl.end() <= tl.end()) { + int start = Math.max(0, rl.end() - tl.start()); + int stop = Math.min(bases.length - 1, rl.start() - tl.start()); + for (int i = start; i <= stop; i++) bases[i]++; + rc.incOver(); + } + if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100) + rc.incOver100(); + if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100) + rc.incOver200(); + } + } + + /** + * Increments the number of Reads on targets within a scope of 0, 100 and + * 200 bases from both ends, by one. + * @param rl the read line. + * @param tl the target line. + */ + private void incOnTargetValues(ReducedReadLine rl, TargetLine tl) { + // Forward read. + if (rl.isForwardRead()) { + if ((tl.start() <= rl.start()) && (rl.end() <= tl.end())) + rc.incOn(); + if ((tl.start()-100 <= rl.start()) && (rl.end() <= tl.end()+100)) + rc.incOn100(); + if ((tl.start()-200 <= rl.start()) && (rl.end() <= tl.end()+200)) + rc.incOn200(); + } + // Reverse read. + else { + if (tl.start() <= rl.end() && rl.start() <= tl.end()) + rc.incOn(); + if (tl.start()-100 <= rl.end() && rl.start()<= tl.end()+100) + rc.incOn100(); + if (tl.start()-200 <= rl.end() && rl.start() <= tl.end()+200) + rc.incOn200(); + } + } + +} \ No newline at end of file