diff NGSrich_0.5.5/src/_main/EnrichmentStatsComputer.java @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NGSrich_0.5.5/src/_main/EnrichmentStatsComputer.java	Mon Nov 21 08:12:19 2011 -0500
@@ -0,0 +1,595 @@
+package _main;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Scanner;
+
+import org.jdom.Element;
+
+import datastructures.ReducedReadLine;
+import datastructures.TargetLine;
+import exceptions.ChromosomeFormatException;
+import exceptions.ChromosomeNotFoundException;
+import exceptions.NullOrNegativeRangeException;
+import exceptions.RangeFormatException;
+import exceptions.RangeLimitNotFoundException;
+
+import middlewares.GeneExtractor;
+import middlewares.HitsCounter;
+import middlewares.Misc;
+import middlewares.ReadCounter;
+import middlewares.XMLSummaryFileBuilder;
+
+/**
+ * This class is for computation of the coverage, BED coverage and statistics
+ * output files.
+ * 
+ * @author Ali Abdallah
+ * @version 02.02.2011
+ * @since jdk 1.6.0
+ */
+
+public class EnrichmentStatsComputer {
+
+	/* ***********************************************************************
+	 * 							FIELDS OF THE CLASS								
+	 * ***********************************************************************/
+	private String 
+	/** 
+	 * 4-column alignment file sorted by alignment subject and position.
+	 */ 
+	align, 
+	/**
+	 * 3-column target file sorted by subject and target starting position.
+	 */
+	target, 
+	/** 
+	 * Annotation file.
+	 */
+	genome, 
+	/**
+	 * Statistics file in the xml-format.
+	 */
+	summary, 
+	/**
+	 * Coverage file.
+	 */
+	details, 
+	/**
+	 * Coverage BED file.
+	 */
+	targetCS,
+	/**
+	 * The absolute name of the read alignment file without the extension.
+	 */
+	prefix, 
+ 
+	/**
+	 * The directory under which temporary data are saved (temporary directory).
+	 */
+	tmpDir, 
+	/**
+	 * The directory of the computed results (output directory).
+	 */
+	outDir;
+	
+	
+	private Scanner 
+	/**
+	 * Scanner of the read alignment file (sam format).
+	 */
+	rScanner, 
+	/**
+	 * Scanner of the genome annotation file.
+	 */
+	gScanner, 
+	/**
+	 * Scanner of the file of targets.
+	 */
+	tScanner;
+	
+	private FileWriter 
+	/**
+	 * Writer used to write the computed statistics into the statistics file.
+	 */
+	summaryWriter, 
+	/**
+	 * Writer used to write coverage data at base-level into the coverage file. 
+	 */
+	detailsWriter, 
+	/**
+	 * Writer used to write the summarized coverage data at target level into
+	 * the coverage BED file.
+	 */
+	targetCSWriter;
+	
+	/**
+	 * Counter used to counts the coverage frequencies at different levels. 
+	 */
+	private HitsCounter hc;
+	/**
+	 * Counter used to count both: Reads exactly on the targets or overlapping
+	 * the targets and Reads doing this in a scope of 100 resp. 200 bases from 
+	 * both end.
+	 */
+	private ReadCounter rc;
+	
+	/**
+	 * A Gene Extractor used to extract the genes overlapping the specified 
+	 * targets. 
+	 */
+	private GeneExtractor ge;
+	
+	/**
+	 * Counter for the reads.
+	 */
+	private int numReads = 0, 
+	/**
+	 * Counter for all target bases.
+	 */
+	targetBases = 0, 
+	/**
+	 * Counter for the bases in a specific target.
+	 */
+	currTargetBases = 0;
+	
+	/**
+	 * The average Coverage of the targets.
+	 */
+	private double avCov = 0;
+	/**
+	 * The average Coverage of the current observed target.
+	 */
+	private double curAvCov = 0;
+	/**
+	 * Flag indicating the end of the read file.
+	 */
+	private boolean endReads = false;
+	/**
+	 * Flag indicates if the coverage line was written or not. 
+	 */
+	private boolean covWritten = false;
+	
+	/**
+	 * Contains always the current observers read line from the sam file.
+	 */
+	private ReducedReadLine rLine;
+
+
+	private int rlength;
+	
+	/**
+	 * ID's of the leaf-tags in the XML summary file. 
+	 */
+	private static final int 
+	from30xPERC = 28,  from30xBASES = 27,from20xPERC = 26, from20xBASES = 25, 
+	from10xPERC = 24, from10xBASES = 23,from5xPERC = 22, from5xBASES = 21,
+	from1xPERC = 20, from1xBASES = 19, OV200P = 18, OV_200 = 17,
+	OV100P = 16, OV_100 = 15, OV_PERC = 14, OV = 13, ON200P = 12,
+	ON_200 = 11, ON100P = 10, ON_100 = 9, ON_PERC = 8, ON = 7,
+	STANDARD_ERROR = 6, AVERAGE_COVERAGE = 5, TARGET_BASES = 4,
+	NUMBER_OF_READS = 3, READ_LENGTH = 2, SAMPLE_NAME = 1, SAMPLE_NUMBER = 0;
+	
+	/**
+	 * Constructs a EnrichmentStatsComputer object.
+	 * 
+	 * @param prefix the name of the sam file without its extension.
+	 * @param proc the number of the process running the software.
+	 * @param tmpDir the temporary directory
+	 * @param outDir the output directory.
+	 */
+	public EnrichmentStatsComputer(String prefix, String proc, String tmpDir,
+			String outDir) {
+		super();
+		
+		this.prefix 	= prefix;
+		
+		// Setting the paths.
+		this.tmpDir 	= tmpDir + Misc.slash(tmpDir);
+		this.outDir 	= outDir + Misc.slash(outDir);
+		this.align 	= this.tmpDir + "NGSrich_" + prefix + "_" + proc+ ".txt"; 
+		this.target = this.tmpDir + "NGSrich_target_" + proc + ".txt"; 
+		this.genome = this.tmpDir + "NGSrich_genome_" + proc + ".txt"; 
+		this.details = this.tmpDir + "coverage_" + proc + ".txt"; 	
+		this.summary = this.outDir + "data/" + prefix + "_enrichment.xml";			
+		this.targetCS = this.outDir + "data/" + prefix + "_enrichment.bed"; 
+		
+		// Constructing and initializing various objects.
+			try {
+				init();
+			} catch (ChromosomeFormatException e) {
+				e.printStackTrace();
+			} catch (ChromosomeNotFoundException e) {
+				e.printStackTrace();
+			} catch (RangeFormatException e) {
+				e.printStackTrace();
+			} catch (RangeLimitNotFoundException e) {
+				e.printStackTrace();
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+	}
+	
+	public void init() throws 	IOException, ChromosomeFormatException, 
+								ChromosomeNotFoundException, RangeFormatException, 
+								RangeLimitNotFoundException {
+		hc = new HitsCounter();
+		rc = new ReadCounter();
+		rScanner = new Scanner(new File(align));
+		tScanner = new Scanner(new File(target));
+		gScanner = new Scanner(new File(genome));
+		File sum = new File(summary);
+		if (!sum.exists()) {
+			sum.createNewFile();
+		}
+		summaryWriter = new FileWriter(sum);
+		detailsWriter = new FileWriter(new File(details));
+		targetCSWriter = new FileWriter(new File(targetCS));
+		rLine = new ReducedReadLine(rScanner.nextLine());
+		rlength = (rLine.isForwardRead())?rLine.end()-rLine.start()+1
+										:rLine.start()-rLine.end()+1; 
+		numReads++;
+		ge = new GeneExtractor(genome);
+	}
+	
+	private void finit() throws IOException {
+		summaryWriter.close();
+		rScanner.close();
+		tScanner.close();
+		gScanner.close();
+		detailsWriter.close();
+	}
+
+	/* ***********************************************************************
+	 * 							MAIN COMPUTATION							  
+	 * ***********************************************************************/
+	/**
+	 * The main computation method.
+	 * 
+	 * @throws IOException if the coverage file or the summary file can't be 
+	 * written.
+	 * @throws NullOrNegativeRangeException 
+	 * @throws RangeLimitNotFoundException 
+	 * @throws ChromosomeNotFoundException 
+	 * @throws ChromosomeFormatException 
+	 * @throws RangeFormatException 
+	 */
+	public void compute() throws 	IOException, 
+									RangeFormatException, 
+									ChromosomeFormatException, 
+									ChromosomeNotFoundException, 
+									RangeLimitNotFoundException, 
+									NullOrNegativeRangeException {
+		// run over all targets.
+		while (tScanner.hasNextLine()) {
+			// Set a flag indicating if the target file and the read file are 
+			// at the same chromosome level. (Standard value: false)
+			boolean chromAlreadyReached = false;			
+			// Read the next target.
+			TargetLine tLine = new TargetLine(tScanner.nextLine());
+			// Save the number of target bases.
+			currTargetBases = tLine.end() - tLine.start() + 1;
+			// Initialize the array of the number if hits on each base of the 
+			// target.
+			int[] bases = new int[currTargetBases];
+			// Update the current total number of observed target bases. 
+			targetBases += currTargetBases;
+			// Initialize the average coverage of the current target.
+			curAvCov = 0;
+			// Set a flag indicating if the coverage data for the current target
+			// are already written or not.
+			covWritten = false;
+			// Find the gene overlapping the current target by using the
+			// logarithmic time gene extractor.
+			String currTargetGene = ge.extractGene(tLine);
+			
+			// Stop if end of the alignment file is reached.
+			while (!endReads) {
+				// Otherwise.
+				// Check if the alignment file and the target file are at the
+				// same chromosome level.
+				if (rLine.chrom().equals(tLine.chrom())) {
+					chromAlreadyReached = true;
+					// Target not exceeded yet.
+					if (!readExceededTarget(rLine, tLine)) {
+						// Increment "on target" and "overlapping" values.
+						incOnTargetValues(rLine, tLine);
+						incOverlapsValues(rLine, tLine, bases);
+					} else {
+						// Compute the average coverage of the current target
+						// and update the target hits values at the 5 different
+						// levels.
+						curAvCov = updateTargetHitsValues(bases);
+						// Write the coverage data for the current target.
+						writeCoverageLines(currTargetBases, curAvCov, tLine
+								+ "", bases, currTargetGene);
+						// Break the while-loop, since all next reads belong
+						// to next targets.
+						break;
+					}
+				} 
+				// otherwise
+				else {
+					// alignment took already place beforehand.
+					if (!covWritten && chromAlreadyReached) {
+						curAvCov = updateTargetHitsValues(bases);
+						writeCoverageLines(currTargetBases, curAvCov, tLine
+								+ "", bases, currTargetGene);
+						break;
+					}
+				}
+				// If exists, get the next read for the next target and 
+				// increment the number of reads by one.
+				if (rScanner.hasNextLine()) {
+					rLine = new ReducedReadLine(rScanner.nextLine());
+					numReads++;
+				} 
+				// otherwise (no reads anymore).
+				else {
+					endReads = true;
+				}
+			}
+			// Reads completely processed but last coverage line not written. 
+			if (endReads && covWritten == false) {
+				curAvCov = updateTargetHitsValues(bases);
+				writeCoverageLines(currTargetBases, curAvCov, tLine + "",
+														bases, currTargetGene);
+			}
+		}
+		// Count the remaining reads.
+		while (rScanner.hasNextLine()) {
+			rScanner.nextLine();
+			numReads++;
+		}
+		targetCSWriter.close();
+		System.out.println("Target BED coverage file created ("+targetCS+")");
+		System.out.println("Per-base coverage file created ("+details+")");
+		
+		// Self explanatory.
+		double sd = Math.sqrt(var(targetBases, avCov));
+		writeSummary(numReads, targetBases, avCov, sd);
+
+		System.out.println("Summary statistics file created ("+summary+")");
+		finit();
+	}
+
+	/* ***********************************************************************
+	 * 							AUXILIARY METHODS								
+	 * ***********************************************************************/
+	/**
+	 * Updates the hits values at 5 different levels (1x, 5x, 10x, 20x, 30x) and
+	 * the average target coverage.
+	 * 
+	 * @param bases the hits on the bases of the current target.
+	 * @return the average coverage of the current target. 
+	 */
+	private double updateTargetHitsValues(int[] bases) {
+		double curAvCov = 0;
+		for(int i = 0; i < bases.length; i++) {
+			curAvCov += bases[i];
+			hc.updateCurrentHits(bases[i]);
+		}
+		hc.updateHits();
+		avCov += curAvCov;
+		return curAvCov;
+	}
+
+	/**
+	 * Writes the summary file containing overall statistics for the observed 
+	 * sample.
+	 * 
+	 * @param numReads the total number of reads.
+	 * @param targetBases the total number of target bases. 
+	 * @param avCov the average target coverage.
+	 * @param sd the standard deviation of the target coverage.
+	 * 
+	 */
+	private void writeSummary(int numReads, int targetBases, double avCov,
+			double sd){
+
+		int x1 = hc.getNextLevelHits();
+		int x5 = hc.getNextLevelHits();
+		int x10 = hc.getNextLevelHits();
+		int x20 = hc.getNextLevelHits();
+		int x30 = hc.getNextLevelHits();
+		
+		XMLSummaryFileBuilder xbf = 
+				new XMLSummaryFileBuilder(summary, prefix);
+		Element[] tags = xbf.getLeafs();
+		tags[SAMPLE_NUMBER].setText("1");
+		tags[SAMPLE_NAME].setText(prefix);
+		tags[READ_LENGTH].setText(rlength+"");
+		tags[NUMBER_OF_READS].setText(numReads+"");
+		tags[TARGET_BASES].setText(targetBases+"");
+		int d = 2;
+		tags[AVERAGE_COVERAGE].setText(dfloor(avCov / targetBases, d)+"");
+		tags[STANDARD_ERROR].setText(dfloor(sd, d)+"");
+		
+		tags[ON].setText(rc.on()+"");
+		tags[ON_PERC].setText(dfloor((rc.on()/(double)numReads)*100, d)+"");
+		tags[ON_100].setText(rc.on100()+"");
+		tags[ON100P].setText(dfloor((rc.on100()/(double)numReads)*100, d)+"");
+		tags[ON_200].setText(rc.on200()+"");
+		tags[ON200P].setText(dfloor((rc.on200()/(double)numReads)*100, d)+"");
+		
+		tags[OV].setText(rc.over()+"");
+		tags[OV_PERC].setText(dfloor((rc.over()/(double)numReads)*100, d)+"");
+		tags[OV_100].setText(rc.over100()+"");
+		tags[OV100P].setText(dfloor((rc.over100()/(double)numReads)*100, d)+"");
+		tags[OV_200].setText(rc.over200()+"");
+		tags[OV200P].setText(dfloor((rc.over200()/(double)numReads)*100, d)+"");
+		
+		tags[from1xBASES].setText(x1+"");
+		tags[from1xPERC].setText(dfloor((x1/(double)targetBases)*100, d)+"");
+		tags[from5xBASES].setText(x5+"");
+		tags[from5xPERC].setText(dfloor((x5/(double)targetBases)*100, d)+"");
+		tags[from10xBASES].setText(x10+"");
+		tags[from10xPERC].setText(dfloor((x10/(double)targetBases)*100, d)+"");
+		tags[from20xBASES].setText(x20+"");
+		tags[from20xPERC].setText(dfloor((x20/(double)targetBases)*100, d)+"");
+		tags[from30xBASES].setText(x30+"");
+		tags[from30xPERC].setText(dfloor((x30/(double)targetBases)*100, d)+"");
+	
+		xbf.writeXMLFile(summary);
+	}
+
+	/**
+	 * Computes the variance of the target coverage.
+	 * 
+	 * @param targetBases the total number of target bases. 
+	 * @param avCov the average target coverage.
+	 * @return The variance of the target coverage.
+	 * @throws FileNotFoundException if the coverage BED file can't be found.
+	 */
+	private double var(int targetBases, double avCov)
+			throws FileNotFoundException {
+
+		Scanner mdScanner = new Scanner(new File(targetCS));
+		int n = 0;
+		double X = avCov / targetBases;
+		double sum = 0;
+
+		while (mdScanner.hasNextLine()) {
+			String mdLine = mdScanner.nextLine();
+			String[] mdFields = mdLine.split("\t");
+			double X_i = Double.parseDouble(mdFields[4]);
+			sum += (X_i - X) * (X_i - X);
+			n += 1;
+		}
+
+		sum /= n - 1;
+		return sum;
+	}
+
+	/**
+	 * Writes detailed coverage data to the coverage file and a coverage line
+	 * to the coverage BED file.
+	 *  
+	 * @param currTargetBases number of bases of the current target.
+	 * @param curAvCov the average coverage on the specified target tLine.
+	 * @param tLine the target line.
+	 * @param bases the hits on the bases of the target tLine.
+	 * @param currTargetGene the gene overlapping the current target.
+	 * @throws IOException if the writer fails to write to coverage line.
+	 */
+	private void writeCoverageLines(int currTargetBases, double curAvCov,
+			String tLine, int[] bases, String currTargetGene)
+			throws IOException {
+		detailsWriter.write(tLine + "\r\n");
+		for(int i = 0; i < bases.length; i++) 
+			detailsWriter.write(bases[i] + "\n");
+		targetCSWriter.write(tLine + "\t" + currTargetGene + "\t"
+				+ dfloor((curAvCov / (double)currTargetBases), 2) + "\t"
+				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
+				+ "\t"
+				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
+				+ "\t"
+				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
+				+ "\t"
+				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
+				+ "\t"
+				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
+				+ "\r\n");
+		covWritten = true;
+		hc.resetCurrentHits();
+	}
+
+	/**
+	 * Cut decimal places greater than the specified one.
+	 * @param number the number to be cut.
+	 * @param decimalPlace the decimalPlace after which the cut-operation
+	 * 		  take place.
+	 * @return the cut number.
+	 */
+	public static double dfloor(double number, int decimalPlace) {
+		return Math.floor(number * Math.pow(10, decimalPlace))
+				/ Math.pow(10, decimalPlace);
+	}
+
+	/**
+	 * Checks if the read exceeded the target.
+	 * @param rl read line.
+	 * @param tl target line.
+	 * @return true if the read starts right to the end position of the target
+	 * and false otherwise. 
+	 */
+	public static boolean readExceededTarget(ReducedReadLine rl, TargetLine tl) {
+		return !(rl.start() <= tl.end() || rl.end() <= tl.end());
+	}
+
+	/**
+	 * <pre> 
+	 * 1. Increments the number of Reads, overlapping targets within a scope of 
+	 * 	  0, 100 and 200 bases from both ends, by one.
+	 * 2. Increments the number of hits on the hitted bases by one.   
+	 * </pre>
+	 * @param rl the read line.
+	 * @param tl the target line.
+	 * @param bases the base hits array of the target.
+	 */
+	private void incOverlapsValues(ReducedReadLine rl, TargetLine tl, int[] bases) {
+		// Forward reads.
+		if (rl.isForwardRead()) {
+			if (rl.end() >= tl.start() && rl.start() <= tl.end()) {
+				// incrementing the number of his on the hitted bases by one.
+				int start = Math.max(0, rl.start() - tl.start());
+				int stop = Math.min(bases.length - 1, rl.end() - tl.start());
+				for (int i = start; i <= stop; i++) 
+					bases[i]++;
+				// incrementing the number of Reads overlapping targets by one.
+				rc.incOver();
+			}
+			if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100)
+				// incrementing the number of Reads overlapping targets(+/-100)
+				// by one.
+				rc.incOver100();
+			if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100)
+				// incrementing the number of Reads overlapping targets(+/-200)
+				// by one.
+				rc.incOver200();
+		} 
+		// Reverse reads.
+		else {
+			if (rl.start() >= tl.start() && rl.end() <= tl.end()) {
+				int start = Math.max(0, rl.end() - tl.start());
+				int stop = Math.min(bases.length - 1, rl.start() - tl.start());
+				for (int i = start; i <= stop; i++) bases[i]++;
+				rc.incOver();
+			}
+			if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100)
+				rc.incOver100();
+			if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100)
+				rc.incOver200();
+		}
+	}
+
+	/**
+	 * Increments the number of Reads on targets within a scope of 0, 100 and 
+	 * 200 bases from both ends, by one.
+	 * @param rl the read line.
+	 * @param tl the target line.
+	 */
+	private void incOnTargetValues(ReducedReadLine rl, TargetLine tl) {
+		// Forward read.
+		if (rl.isForwardRead()) {
+			if ((tl.start() <= rl.start()) && (rl.end() <= tl.end()))
+				rc.incOn();
+			if ((tl.start()-100 <= rl.start()) && (rl.end() <= tl.end()+100))
+				rc.incOn100();
+			if ((tl.start()-200 <= rl.start()) && (rl.end() <= tl.end()+200))
+				rc.incOn200();
+		} 
+		// Reverse read.
+		else {
+			if (tl.start() <= rl.end() && rl.start() <= tl.end())
+				rc.incOn();
+			if (tl.start()-100 <= rl.end() && rl.start()<= tl.end()+100)
+				rc.incOn100();
+			if (tl.start()-200 <= rl.end() && rl.start() <= tl.end()+200)
+				rc.incOn200();
+		}
+	}
+
+}
\ No newline at end of file