Mercurial > repos > pfrommolt > ngsrich
view NGSrich_0.5.5/src/_main/EnrichmentStatsComputer.java @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
line wrap: on
line source
package _main; import java.io.File; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.util.Scanner; import org.jdom.Element; import datastructures.ReducedReadLine; import datastructures.TargetLine; import exceptions.ChromosomeFormatException; import exceptions.ChromosomeNotFoundException; import exceptions.NullOrNegativeRangeException; import exceptions.RangeFormatException; import exceptions.RangeLimitNotFoundException; import middlewares.GeneExtractor; import middlewares.HitsCounter; import middlewares.Misc; import middlewares.ReadCounter; import middlewares.XMLSummaryFileBuilder; /** * This class is for computation of the coverage, BED coverage and statistics * output files. * * @author Ali Abdallah * @version 02.02.2011 * @since jdk 1.6.0 */ public class EnrichmentStatsComputer { /* *********************************************************************** * FIELDS OF THE CLASS * ***********************************************************************/ private String /** * 4-column alignment file sorted by alignment subject and position. */ align, /** * 3-column target file sorted by subject and target starting position. */ target, /** * Annotation file. */ genome, /** * Statistics file in the xml-format. */ summary, /** * Coverage file. */ details, /** * Coverage BED file. */ targetCS, /** * The absolute name of the read alignment file without the extension. */ prefix, /** * The directory under which temporary data are saved (temporary directory). */ tmpDir, /** * The directory of the computed results (output directory). */ outDir; private Scanner /** * Scanner of the read alignment file (sam format). */ rScanner, /** * Scanner of the genome annotation file. */ gScanner, /** * Scanner of the file of targets. */ tScanner; private FileWriter /** * Writer used to write the computed statistics into the statistics file. */ summaryWriter, /** * Writer used to write coverage data at base-level into the coverage file. */ detailsWriter, /** * Writer used to write the summarized coverage data at target level into * the coverage BED file. */ targetCSWriter; /** * Counter used to counts the coverage frequencies at different levels. */ private HitsCounter hc; /** * Counter used to count both: Reads exactly on the targets or overlapping * the targets and Reads doing this in a scope of 100 resp. 200 bases from * both end. */ private ReadCounter rc; /** * A Gene Extractor used to extract the genes overlapping the specified * targets. */ private GeneExtractor ge; /** * Counter for the reads. */ private int numReads = 0, /** * Counter for all target bases. */ targetBases = 0, /** * Counter for the bases in a specific target. */ currTargetBases = 0; /** * The average Coverage of the targets. */ private double avCov = 0; /** * The average Coverage of the current observed target. */ private double curAvCov = 0; /** * Flag indicating the end of the read file. */ private boolean endReads = false; /** * Flag indicates if the coverage line was written or not. */ private boolean covWritten = false; /** * Contains always the current observers read line from the sam file. */ private ReducedReadLine rLine; private int rlength; /** * ID's of the leaf-tags in the XML summary file. */ private static final int from30xPERC = 28, from30xBASES = 27,from20xPERC = 26, from20xBASES = 25, from10xPERC = 24, from10xBASES = 23,from5xPERC = 22, from5xBASES = 21, from1xPERC = 20, from1xBASES = 19, OV200P = 18, OV_200 = 17, OV100P = 16, OV_100 = 15, OV_PERC = 14, OV = 13, ON200P = 12, ON_200 = 11, ON100P = 10, ON_100 = 9, ON_PERC = 8, ON = 7, STANDARD_ERROR = 6, AVERAGE_COVERAGE = 5, TARGET_BASES = 4, NUMBER_OF_READS = 3, READ_LENGTH = 2, SAMPLE_NAME = 1, SAMPLE_NUMBER = 0; /** * Constructs a EnrichmentStatsComputer object. * * @param prefix the name of the sam file without its extension. * @param proc the number of the process running the software. * @param tmpDir the temporary directory * @param outDir the output directory. */ public EnrichmentStatsComputer(String prefix, String proc, String tmpDir, String outDir) { super(); this.prefix = prefix; // Setting the paths. this.tmpDir = tmpDir + Misc.slash(tmpDir); this.outDir = outDir + Misc.slash(outDir); this.align = this.tmpDir + "NGSrich_" + prefix + "_" + proc+ ".txt"; this.target = this.tmpDir + "NGSrich_target_" + proc + ".txt"; this.genome = this.tmpDir + "NGSrich_genome_" + proc + ".txt"; this.details = this.tmpDir + "coverage_" + proc + ".txt"; this.summary = this.outDir + "data/" + prefix + "_enrichment.xml"; this.targetCS = this.outDir + "data/" + prefix + "_enrichment.bed"; // Constructing and initializing various objects. try { init(); } catch (ChromosomeFormatException e) { e.printStackTrace(); } catch (ChromosomeNotFoundException e) { e.printStackTrace(); } catch (RangeFormatException e) { e.printStackTrace(); } catch (RangeLimitNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } public void init() throws IOException, ChromosomeFormatException, ChromosomeNotFoundException, RangeFormatException, RangeLimitNotFoundException { hc = new HitsCounter(); rc = new ReadCounter(); rScanner = new Scanner(new File(align)); tScanner = new Scanner(new File(target)); gScanner = new Scanner(new File(genome)); File sum = new File(summary); if (!sum.exists()) { sum.createNewFile(); } summaryWriter = new FileWriter(sum); detailsWriter = new FileWriter(new File(details)); targetCSWriter = new FileWriter(new File(targetCS)); rLine = new ReducedReadLine(rScanner.nextLine()); rlength = (rLine.isForwardRead())?rLine.end()-rLine.start()+1 :rLine.start()-rLine.end()+1; numReads++; ge = new GeneExtractor(genome); } private void finit() throws IOException { summaryWriter.close(); rScanner.close(); tScanner.close(); gScanner.close(); detailsWriter.close(); } /* *********************************************************************** * MAIN COMPUTATION * ***********************************************************************/ /** * The main computation method. * * @throws IOException if the coverage file or the summary file can't be * written. * @throws NullOrNegativeRangeException * @throws RangeLimitNotFoundException * @throws ChromosomeNotFoundException * @throws ChromosomeFormatException * @throws RangeFormatException */ public void compute() throws IOException, RangeFormatException, ChromosomeFormatException, ChromosomeNotFoundException, RangeLimitNotFoundException, NullOrNegativeRangeException { // run over all targets. while (tScanner.hasNextLine()) { // Set a flag indicating if the target file and the read file are // at the same chromosome level. (Standard value: false) boolean chromAlreadyReached = false; // Read the next target. TargetLine tLine = new TargetLine(tScanner.nextLine()); // Save the number of target bases. currTargetBases = tLine.end() - tLine.start() + 1; // Initialize the array of the number if hits on each base of the // target. int[] bases = new int[currTargetBases]; // Update the current total number of observed target bases. targetBases += currTargetBases; // Initialize the average coverage of the current target. curAvCov = 0; // Set a flag indicating if the coverage data for the current target // are already written or not. covWritten = false; // Find the gene overlapping the current target by using the // logarithmic time gene extractor. String currTargetGene = ge.extractGene(tLine); // Stop if end of the alignment file is reached. while (!endReads) { // Otherwise. // Check if the alignment file and the target file are at the // same chromosome level. if (rLine.chrom().equals(tLine.chrom())) { chromAlreadyReached = true; // Target not exceeded yet. if (!readExceededTarget(rLine, tLine)) { // Increment "on target" and "overlapping" values. incOnTargetValues(rLine, tLine); incOverlapsValues(rLine, tLine, bases); } else { // Compute the average coverage of the current target // and update the target hits values at the 5 different // levels. curAvCov = updateTargetHitsValues(bases); // Write the coverage data for the current target. writeCoverageLines(currTargetBases, curAvCov, tLine + "", bases, currTargetGene); // Break the while-loop, since all next reads belong // to next targets. break; } } // otherwise else { // alignment took already place beforehand. if (!covWritten && chromAlreadyReached) { curAvCov = updateTargetHitsValues(bases); writeCoverageLines(currTargetBases, curAvCov, tLine + "", bases, currTargetGene); break; } } // If exists, get the next read for the next target and // increment the number of reads by one. if (rScanner.hasNextLine()) { rLine = new ReducedReadLine(rScanner.nextLine()); numReads++; } // otherwise (no reads anymore). else { endReads = true; } } // Reads completely processed but last coverage line not written. if (endReads && covWritten == false) { curAvCov = updateTargetHitsValues(bases); writeCoverageLines(currTargetBases, curAvCov, tLine + "", bases, currTargetGene); } } // Count the remaining reads. while (rScanner.hasNextLine()) { rScanner.nextLine(); numReads++; } targetCSWriter.close(); System.out.println("Target BED coverage file created ("+targetCS+")"); System.out.println("Per-base coverage file created ("+details+")"); // Self explanatory. double sd = Math.sqrt(var(targetBases, avCov)); writeSummary(numReads, targetBases, avCov, sd); System.out.println("Summary statistics file created ("+summary+")"); finit(); } /* *********************************************************************** * AUXILIARY METHODS * ***********************************************************************/ /** * Updates the hits values at 5 different levels (1x, 5x, 10x, 20x, 30x) and * the average target coverage. * * @param bases the hits on the bases of the current target. * @return the average coverage of the current target. */ private double updateTargetHitsValues(int[] bases) { double curAvCov = 0; for(int i = 0; i < bases.length; i++) { curAvCov += bases[i]; hc.updateCurrentHits(bases[i]); } hc.updateHits(); avCov += curAvCov; return curAvCov; } /** * Writes the summary file containing overall statistics for the observed * sample. * * @param numReads the total number of reads. * @param targetBases the total number of target bases. * @param avCov the average target coverage. * @param sd the standard deviation of the target coverage. * */ private void writeSummary(int numReads, int targetBases, double avCov, double sd){ int x1 = hc.getNextLevelHits(); int x5 = hc.getNextLevelHits(); int x10 = hc.getNextLevelHits(); int x20 = hc.getNextLevelHits(); int x30 = hc.getNextLevelHits(); XMLSummaryFileBuilder xbf = new XMLSummaryFileBuilder(summary, prefix); Element[] tags = xbf.getLeafs(); tags[SAMPLE_NUMBER].setText("1"); tags[SAMPLE_NAME].setText(prefix); tags[READ_LENGTH].setText(rlength+""); tags[NUMBER_OF_READS].setText(numReads+""); tags[TARGET_BASES].setText(targetBases+""); int d = 2; tags[AVERAGE_COVERAGE].setText(dfloor(avCov / targetBases, d)+""); tags[STANDARD_ERROR].setText(dfloor(sd, d)+""); tags[ON].setText(rc.on()+""); tags[ON_PERC].setText(dfloor((rc.on()/(double)numReads)*100, d)+""); tags[ON_100].setText(rc.on100()+""); tags[ON100P].setText(dfloor((rc.on100()/(double)numReads)*100, d)+""); tags[ON_200].setText(rc.on200()+""); tags[ON200P].setText(dfloor((rc.on200()/(double)numReads)*100, d)+""); tags[OV].setText(rc.over()+""); tags[OV_PERC].setText(dfloor((rc.over()/(double)numReads)*100, d)+""); tags[OV_100].setText(rc.over100()+""); tags[OV100P].setText(dfloor((rc.over100()/(double)numReads)*100, d)+""); tags[OV_200].setText(rc.over200()+""); tags[OV200P].setText(dfloor((rc.over200()/(double)numReads)*100, d)+""); tags[from1xBASES].setText(x1+""); tags[from1xPERC].setText(dfloor((x1/(double)targetBases)*100, d)+""); tags[from5xBASES].setText(x5+""); tags[from5xPERC].setText(dfloor((x5/(double)targetBases)*100, d)+""); tags[from10xBASES].setText(x10+""); tags[from10xPERC].setText(dfloor((x10/(double)targetBases)*100, d)+""); tags[from20xBASES].setText(x20+""); tags[from20xPERC].setText(dfloor((x20/(double)targetBases)*100, d)+""); tags[from30xBASES].setText(x30+""); tags[from30xPERC].setText(dfloor((x30/(double)targetBases)*100, d)+""); xbf.writeXMLFile(summary); } /** * Computes the variance of the target coverage. * * @param targetBases the total number of target bases. * @param avCov the average target coverage. * @return The variance of the target coverage. * @throws FileNotFoundException if the coverage BED file can't be found. */ private double var(int targetBases, double avCov) throws FileNotFoundException { Scanner mdScanner = new Scanner(new File(targetCS)); int n = 0; double X = avCov / targetBases; double sum = 0; while (mdScanner.hasNextLine()) { String mdLine = mdScanner.nextLine(); String[] mdFields = mdLine.split("\t"); double X_i = Double.parseDouble(mdFields[4]); sum += (X_i - X) * (X_i - X); n += 1; } sum /= n - 1; return sum; } /** * Writes detailed coverage data to the coverage file and a coverage line * to the coverage BED file. * * @param currTargetBases number of bases of the current target. * @param curAvCov the average coverage on the specified target tLine. * @param tLine the target line. * @param bases the hits on the bases of the target tLine. * @param currTargetGene the gene overlapping the current target. * @throws IOException if the writer fails to write to coverage line. */ private void writeCoverageLines(int currTargetBases, double curAvCov, String tLine, int[] bases, String currTargetGene) throws IOException { detailsWriter.write(tLine + "\r\n"); for(int i = 0; i < bases.length; i++) detailsWriter.write(bases[i] + "\n"); targetCSWriter.write(tLine + "\t" + currTargetGene + "\t" + dfloor((curAvCov / (double)currTargetBases), 2) + "\t" + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + "\t" + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + "\t" + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + "\t" + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + "\t" + dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2) + "\r\n"); covWritten = true; hc.resetCurrentHits(); } /** * Cut decimal places greater than the specified one. * @param number the number to be cut. * @param decimalPlace the decimalPlace after which the cut-operation * take place. * @return the cut number. */ public static double dfloor(double number, int decimalPlace) { return Math.floor(number * Math.pow(10, decimalPlace)) / Math.pow(10, decimalPlace); } /** * Checks if the read exceeded the target. * @param rl read line. * @param tl target line. * @return true if the read starts right to the end position of the target * and false otherwise. */ public static boolean readExceededTarget(ReducedReadLine rl, TargetLine tl) { return !(rl.start() <= tl.end() || rl.end() <= tl.end()); } /** * <pre> * 1. Increments the number of Reads, overlapping targets within a scope of * 0, 100 and 200 bases from both ends, by one. * 2. Increments the number of hits on the hitted bases by one. * </pre> * @param rl the read line. * @param tl the target line. * @param bases the base hits array of the target. */ private void incOverlapsValues(ReducedReadLine rl, TargetLine tl, int[] bases) { // Forward reads. if (rl.isForwardRead()) { if (rl.end() >= tl.start() && rl.start() <= tl.end()) { // incrementing the number of his on the hitted bases by one. int start = Math.max(0, rl.start() - tl.start()); int stop = Math.min(bases.length - 1, rl.end() - tl.start()); for (int i = start; i <= stop; i++) bases[i]++; // incrementing the number of Reads overlapping targets by one. rc.incOver(); } if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100) // incrementing the number of Reads overlapping targets(+/-100) // by one. rc.incOver100(); if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100) // incrementing the number of Reads overlapping targets(+/-200) // by one. rc.incOver200(); } // Reverse reads. else { if (rl.start() >= tl.start() && rl.end() <= tl.end()) { int start = Math.max(0, rl.end() - tl.start()); int stop = Math.min(bases.length - 1, rl.start() - tl.start()); for (int i = start; i <= stop; i++) bases[i]++; rc.incOver(); } if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100) rc.incOver100(); if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100) rc.incOver200(); } } /** * Increments the number of Reads on targets within a scope of 0, 100 and * 200 bases from both ends, by one. * @param rl the read line. * @param tl the target line. */ private void incOnTargetValues(ReducedReadLine rl, TargetLine tl) { // Forward read. if (rl.isForwardRead()) { if ((tl.start() <= rl.start()) && (rl.end() <= tl.end())) rc.incOn(); if ((tl.start()-100 <= rl.start()) && (rl.end() <= tl.end()+100)) rc.incOn100(); if ((tl.start()-200 <= rl.start()) && (rl.end() <= tl.end()+200)) rc.incOn200(); } // Reverse read. else { if (tl.start() <= rl.end() && rl.start() <= tl.end()) rc.incOn(); if (tl.start()-100 <= rl.end() && rl.start()<= tl.end()+100) rc.incOn100(); if (tl.start()-200 <= rl.end() && rl.start() <= tl.end()+200) rc.incOn200(); } } }