view NGSrich_0.5.5/src/_main/EnrichmentStatsComputer.java @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
line wrap: on
line source

package _main;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Scanner;

import org.jdom.Element;

import datastructures.ReducedReadLine;
import datastructures.TargetLine;
import exceptions.ChromosomeFormatException;
import exceptions.ChromosomeNotFoundException;
import exceptions.NullOrNegativeRangeException;
import exceptions.RangeFormatException;
import exceptions.RangeLimitNotFoundException;

import middlewares.GeneExtractor;
import middlewares.HitsCounter;
import middlewares.Misc;
import middlewares.ReadCounter;
import middlewares.XMLSummaryFileBuilder;

/**
 * This class is for computation of the coverage, BED coverage and statistics
 * output files.
 * 
 * @author Ali Abdallah
 * @version 02.02.2011
 * @since jdk 1.6.0
 */

public class EnrichmentStatsComputer {

	/* ***********************************************************************
	 * 							FIELDS OF THE CLASS								
	 * ***********************************************************************/
	private String 
	/** 
	 * 4-column alignment file sorted by alignment subject and position.
	 */ 
	align, 
	/**
	 * 3-column target file sorted by subject and target starting position.
	 */
	target, 
	/** 
	 * Annotation file.
	 */
	genome, 
	/**
	 * Statistics file in the xml-format.
	 */
	summary, 
	/**
	 * Coverage file.
	 */
	details, 
	/**
	 * Coverage BED file.
	 */
	targetCS,
	/**
	 * The absolute name of the read alignment file without the extension.
	 */
	prefix, 
 
	/**
	 * The directory under which temporary data are saved (temporary directory).
	 */
	tmpDir, 
	/**
	 * The directory of the computed results (output directory).
	 */
	outDir;
	
	
	private Scanner 
	/**
	 * Scanner of the read alignment file (sam format).
	 */
	rScanner, 
	/**
	 * Scanner of the genome annotation file.
	 */
	gScanner, 
	/**
	 * Scanner of the file of targets.
	 */
	tScanner;
	
	private FileWriter 
	/**
	 * Writer used to write the computed statistics into the statistics file.
	 */
	summaryWriter, 
	/**
	 * Writer used to write coverage data at base-level into the coverage file. 
	 */
	detailsWriter, 
	/**
	 * Writer used to write the summarized coverage data at target level into
	 * the coverage BED file.
	 */
	targetCSWriter;
	
	/**
	 * Counter used to counts the coverage frequencies at different levels. 
	 */
	private HitsCounter hc;
	/**
	 * Counter used to count both: Reads exactly on the targets or overlapping
	 * the targets and Reads doing this in a scope of 100 resp. 200 bases from 
	 * both end.
	 */
	private ReadCounter rc;
	
	/**
	 * A Gene Extractor used to extract the genes overlapping the specified 
	 * targets. 
	 */
	private GeneExtractor ge;
	
	/**
	 * Counter for the reads.
	 */
	private int numReads = 0, 
	/**
	 * Counter for all target bases.
	 */
	targetBases = 0, 
	/**
	 * Counter for the bases in a specific target.
	 */
	currTargetBases = 0;
	
	/**
	 * The average Coverage of the targets.
	 */
	private double avCov = 0;
	/**
	 * The average Coverage of the current observed target.
	 */
	private double curAvCov = 0;
	/**
	 * Flag indicating the end of the read file.
	 */
	private boolean endReads = false;
	/**
	 * Flag indicates if the coverage line was written or not. 
	 */
	private boolean covWritten = false;
	
	/**
	 * Contains always the current observers read line from the sam file.
	 */
	private ReducedReadLine rLine;


	private int rlength;
	
	/**
	 * ID's of the leaf-tags in the XML summary file. 
	 */
	private static final int 
	from30xPERC = 28,  from30xBASES = 27,from20xPERC = 26, from20xBASES = 25, 
	from10xPERC = 24, from10xBASES = 23,from5xPERC = 22, from5xBASES = 21,
	from1xPERC = 20, from1xBASES = 19, OV200P = 18, OV_200 = 17,
	OV100P = 16, OV_100 = 15, OV_PERC = 14, OV = 13, ON200P = 12,
	ON_200 = 11, ON100P = 10, ON_100 = 9, ON_PERC = 8, ON = 7,
	STANDARD_ERROR = 6, AVERAGE_COVERAGE = 5, TARGET_BASES = 4,
	NUMBER_OF_READS = 3, READ_LENGTH = 2, SAMPLE_NAME = 1, SAMPLE_NUMBER = 0;
	
	/**
	 * Constructs a EnrichmentStatsComputer object.
	 * 
	 * @param prefix the name of the sam file without its extension.
	 * @param proc the number of the process running the software.
	 * @param tmpDir the temporary directory
	 * @param outDir the output directory.
	 */
	public EnrichmentStatsComputer(String prefix, String proc, String tmpDir,
			String outDir) {
		super();
		
		this.prefix 	= prefix;
		
		// Setting the paths.
		this.tmpDir 	= tmpDir + Misc.slash(tmpDir);
		this.outDir 	= outDir + Misc.slash(outDir);
		this.align 	= this.tmpDir + "NGSrich_" + prefix + "_" + proc+ ".txt"; 
		this.target = this.tmpDir + "NGSrich_target_" + proc + ".txt"; 
		this.genome = this.tmpDir + "NGSrich_genome_" + proc + ".txt"; 
		this.details = this.tmpDir + "coverage_" + proc + ".txt"; 	
		this.summary = this.outDir + "data/" + prefix + "_enrichment.xml";			
		this.targetCS = this.outDir + "data/" + prefix + "_enrichment.bed"; 
		
		// Constructing and initializing various objects.
			try {
				init();
			} catch (ChromosomeFormatException e) {
				e.printStackTrace();
			} catch (ChromosomeNotFoundException e) {
				e.printStackTrace();
			} catch (RangeFormatException e) {
				e.printStackTrace();
			} catch (RangeLimitNotFoundException e) {
				e.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
	}
	
	public void init() throws 	IOException, ChromosomeFormatException, 
								ChromosomeNotFoundException, RangeFormatException, 
								RangeLimitNotFoundException {
		hc = new HitsCounter();
		rc = new ReadCounter();
		rScanner = new Scanner(new File(align));
		tScanner = new Scanner(new File(target));
		gScanner = new Scanner(new File(genome));
		File sum = new File(summary);
		if (!sum.exists()) {
			sum.createNewFile();
		}
		summaryWriter = new FileWriter(sum);
		detailsWriter = new FileWriter(new File(details));
		targetCSWriter = new FileWriter(new File(targetCS));
		rLine = new ReducedReadLine(rScanner.nextLine());
		rlength = (rLine.isForwardRead())?rLine.end()-rLine.start()+1
										:rLine.start()-rLine.end()+1; 
		numReads++;
		ge = new GeneExtractor(genome);
	}
	
	private void finit() throws IOException {
		summaryWriter.close();
		rScanner.close();
		tScanner.close();
		gScanner.close();
		detailsWriter.close();
	}

	/* ***********************************************************************
	 * 							MAIN COMPUTATION							  
	 * ***********************************************************************/
	/**
	 * The main computation method.
	 * 
	 * @throws IOException if the coverage file or the summary file can't be 
	 * written.
	 * @throws NullOrNegativeRangeException 
	 * @throws RangeLimitNotFoundException 
	 * @throws ChromosomeNotFoundException 
	 * @throws ChromosomeFormatException 
	 * @throws RangeFormatException 
	 */
	public void compute() throws 	IOException, 
									RangeFormatException, 
									ChromosomeFormatException, 
									ChromosomeNotFoundException, 
									RangeLimitNotFoundException, 
									NullOrNegativeRangeException {
		// run over all targets.
		while (tScanner.hasNextLine()) {
			// Set a flag indicating if the target file and the read file are 
			// at the same chromosome level. (Standard value: false)
			boolean chromAlreadyReached = false;			
			// Read the next target.
			TargetLine tLine = new TargetLine(tScanner.nextLine());
			// Save the number of target bases.
			currTargetBases = tLine.end() - tLine.start() + 1;
			// Initialize the array of the number if hits on each base of the 
			// target.
			int[] bases = new int[currTargetBases];
			// Update the current total number of observed target bases. 
			targetBases += currTargetBases;
			// Initialize the average coverage of the current target.
			curAvCov = 0;
			// Set a flag indicating if the coverage data for the current target
			// are already written or not.
			covWritten = false;
			// Find the gene overlapping the current target by using the
			// logarithmic time gene extractor.
			String currTargetGene = ge.extractGene(tLine);
			
			// Stop if end of the alignment file is reached.
			while (!endReads) {
				// Otherwise.
				// Check if the alignment file and the target file are at the
				// same chromosome level.
				if (rLine.chrom().equals(tLine.chrom())) {
					chromAlreadyReached = true;
					// Target not exceeded yet.
					if (!readExceededTarget(rLine, tLine)) {
						// Increment "on target" and "overlapping" values.
						incOnTargetValues(rLine, tLine);
						incOverlapsValues(rLine, tLine, bases);
					} else {
						// Compute the average coverage of the current target
						// and update the target hits values at the 5 different
						// levels.
						curAvCov = updateTargetHitsValues(bases);
						// Write the coverage data for the current target.
						writeCoverageLines(currTargetBases, curAvCov, tLine
								+ "", bases, currTargetGene);
						// Break the while-loop, since all next reads belong
						// to next targets.
						break;
					}
				} 
				// otherwise
				else {
					// alignment took already place beforehand.
					if (!covWritten && chromAlreadyReached) {
						curAvCov = updateTargetHitsValues(bases);
						writeCoverageLines(currTargetBases, curAvCov, tLine
								+ "", bases, currTargetGene);
						break;
					}
				}
				// If exists, get the next read for the next target and 
				// increment the number of reads by one.
				if (rScanner.hasNextLine()) {
					rLine = new ReducedReadLine(rScanner.nextLine());
					numReads++;
				} 
				// otherwise (no reads anymore).
				else {
					endReads = true;
				}
			}
			// Reads completely processed but last coverage line not written. 
			if (endReads && covWritten == false) {
				curAvCov = updateTargetHitsValues(bases);
				writeCoverageLines(currTargetBases, curAvCov, tLine + "",
														bases, currTargetGene);
			}
		}
		// Count the remaining reads.
		while (rScanner.hasNextLine()) {
			rScanner.nextLine();
			numReads++;
		}
		targetCSWriter.close();
		System.out.println("Target BED coverage file created ("+targetCS+")");
		System.out.println("Per-base coverage file created ("+details+")");
		
		// Self explanatory.
		double sd = Math.sqrt(var(targetBases, avCov));
		writeSummary(numReads, targetBases, avCov, sd);

		System.out.println("Summary statistics file created ("+summary+")");
		finit();
	}

	/* ***********************************************************************
	 * 							AUXILIARY METHODS								
	 * ***********************************************************************/
	/**
	 * Updates the hits values at 5 different levels (1x, 5x, 10x, 20x, 30x) and
	 * the average target coverage.
	 * 
	 * @param bases the hits on the bases of the current target.
	 * @return the average coverage of the current target. 
	 */
	private double updateTargetHitsValues(int[] bases) {
		double curAvCov = 0;
		for(int i = 0; i < bases.length; i++) {
			curAvCov += bases[i];
			hc.updateCurrentHits(bases[i]);
		}
		hc.updateHits();
		avCov += curAvCov;
		return curAvCov;
	}

	/**
	 * Writes the summary file containing overall statistics for the observed 
	 * sample.
	 * 
	 * @param numReads the total number of reads.
	 * @param targetBases the total number of target bases. 
	 * @param avCov the average target coverage.
	 * @param sd the standard deviation of the target coverage.
	 * 
	 */
	private void writeSummary(int numReads, int targetBases, double avCov,
			double sd){

		int x1 = hc.getNextLevelHits();
		int x5 = hc.getNextLevelHits();
		int x10 = hc.getNextLevelHits();
		int x20 = hc.getNextLevelHits();
		int x30 = hc.getNextLevelHits();
		
		XMLSummaryFileBuilder xbf = 
				new XMLSummaryFileBuilder(summary, prefix);
		Element[] tags = xbf.getLeafs();
		tags[SAMPLE_NUMBER].setText("1");
		tags[SAMPLE_NAME].setText(prefix);
		tags[READ_LENGTH].setText(rlength+"");
		tags[NUMBER_OF_READS].setText(numReads+"");
		tags[TARGET_BASES].setText(targetBases+"");
		int d = 2;
		tags[AVERAGE_COVERAGE].setText(dfloor(avCov / targetBases, d)+"");
		tags[STANDARD_ERROR].setText(dfloor(sd, d)+"");
		
		tags[ON].setText(rc.on()+"");
		tags[ON_PERC].setText(dfloor((rc.on()/(double)numReads)*100, d)+"");
		tags[ON_100].setText(rc.on100()+"");
		tags[ON100P].setText(dfloor((rc.on100()/(double)numReads)*100, d)+"");
		tags[ON_200].setText(rc.on200()+"");
		tags[ON200P].setText(dfloor((rc.on200()/(double)numReads)*100, d)+"");
		
		tags[OV].setText(rc.over()+"");
		tags[OV_PERC].setText(dfloor((rc.over()/(double)numReads)*100, d)+"");
		tags[OV_100].setText(rc.over100()+"");
		tags[OV100P].setText(dfloor((rc.over100()/(double)numReads)*100, d)+"");
		tags[OV_200].setText(rc.over200()+"");
		tags[OV200P].setText(dfloor((rc.over200()/(double)numReads)*100, d)+"");
		
		tags[from1xBASES].setText(x1+"");
		tags[from1xPERC].setText(dfloor((x1/(double)targetBases)*100, d)+"");
		tags[from5xBASES].setText(x5+"");
		tags[from5xPERC].setText(dfloor((x5/(double)targetBases)*100, d)+"");
		tags[from10xBASES].setText(x10+"");
		tags[from10xPERC].setText(dfloor((x10/(double)targetBases)*100, d)+"");
		tags[from20xBASES].setText(x20+"");
		tags[from20xPERC].setText(dfloor((x20/(double)targetBases)*100, d)+"");
		tags[from30xBASES].setText(x30+"");
		tags[from30xPERC].setText(dfloor((x30/(double)targetBases)*100, d)+"");
	
		xbf.writeXMLFile(summary);
	}

	/**
	 * Computes the variance of the target coverage.
	 * 
	 * @param targetBases the total number of target bases. 
	 * @param avCov the average target coverage.
	 * @return The variance of the target coverage.
	 * @throws FileNotFoundException if the coverage BED file can't be found.
	 */
	private double var(int targetBases, double avCov)
			throws FileNotFoundException {

		Scanner mdScanner = new Scanner(new File(targetCS));
		int n = 0;
		double X = avCov / targetBases;
		double sum = 0;

		while (mdScanner.hasNextLine()) {
			String mdLine = mdScanner.nextLine();
			String[] mdFields = mdLine.split("\t");
			double X_i = Double.parseDouble(mdFields[4]);
			sum += (X_i - X) * (X_i - X);
			n += 1;
		}

		sum /= n - 1;
		return sum;
	}

	/**
	 * Writes detailed coverage data to the coverage file and a coverage line
	 * to the coverage BED file.
	 *  
	 * @param currTargetBases number of bases of the current target.
	 * @param curAvCov the average coverage on the specified target tLine.
	 * @param tLine the target line.
	 * @param bases the hits on the bases of the target tLine.
	 * @param currTargetGene the gene overlapping the current target.
	 * @throws IOException if the writer fails to write to coverage line.
	 */
	private void writeCoverageLines(int currTargetBases, double curAvCov,
			String tLine, int[] bases, String currTargetGene)
			throws IOException {
		detailsWriter.write(tLine + "\r\n");
		for(int i = 0; i < bases.length; i++) 
			detailsWriter.write(bases[i] + "\n");
		targetCSWriter.write(tLine + "\t" + currTargetGene + "\t"
				+ dfloor((curAvCov / (double)currTargetBases), 2) + "\t"
				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
				+ "\t"
				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
				+ "\t"
				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
				+ "\t"
				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
				+ "\t"
				+ dfloor((hc.getNextLevelCurrentHits() / (double)currTargetBases)*100, 2)
				+ "\r\n");
		covWritten = true;
		hc.resetCurrentHits();
	}

	/**
	 * Cut decimal places greater than the specified one.
	 * @param number the number to be cut.
	 * @param decimalPlace the decimalPlace after which the cut-operation
	 * 		  take place.
	 * @return the cut number.
	 */
	public static double dfloor(double number, int decimalPlace) {
		return Math.floor(number * Math.pow(10, decimalPlace))
				/ Math.pow(10, decimalPlace);
	}

	/**
	 * Checks if the read exceeded the target.
	 * @param rl read line.
	 * @param tl target line.
	 * @return true if the read starts right to the end position of the target
	 * and false otherwise. 
	 */
	public static boolean readExceededTarget(ReducedReadLine rl, TargetLine tl) {
		return !(rl.start() <= tl.end() || rl.end() <= tl.end());
	}

	/**
	 * <pre> 
	 * 1. Increments the number of Reads, overlapping targets within a scope of 
	 * 	  0, 100 and 200 bases from both ends, by one.
	 * 2. Increments the number of hits on the hitted bases by one.   
	 * </pre>
	 * @param rl the read line.
	 * @param tl the target line.
	 * @param bases the base hits array of the target.
	 */
	private void incOverlapsValues(ReducedReadLine rl, TargetLine tl, int[] bases) {
		// Forward reads.
		if (rl.isForwardRead()) {
			if (rl.end() >= tl.start() && rl.start() <= tl.end()) {
				// incrementing the number of his on the hitted bases by one.
				int start = Math.max(0, rl.start() - tl.start());
				int stop = Math.min(bases.length - 1, rl.end() - tl.start());
				for (int i = start; i <= stop; i++) 
					bases[i]++;
				// incrementing the number of Reads overlapping targets by one.
				rc.incOver();
			}
			if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100)
				// incrementing the number of Reads overlapping targets(+/-100)
				// by one.
				rc.incOver100();
			if (tl.start() - 100 <= rl.end() && rl.start() <= tl.end() + 100)
				// incrementing the number of Reads overlapping targets(+/-200)
				// by one.
				rc.incOver200();
		} 
		// Reverse reads.
		else {
			if (rl.start() >= tl.start() && rl.end() <= tl.end()) {
				int start = Math.max(0, rl.end() - tl.start());
				int stop = Math.min(bases.length - 1, rl.start() - tl.start());
				for (int i = start; i <= stop; i++) bases[i]++;
				rc.incOver();
			}
			if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100)
				rc.incOver100();
			if (rl.start() >= tl.start() - 100 && rl.end() <= tl.end() + 100)
				rc.incOver200();
		}
	}

	/**
	 * Increments the number of Reads on targets within a scope of 0, 100 and 
	 * 200 bases from both ends, by one.
	 * @param rl the read line.
	 * @param tl the target line.
	 */
	private void incOnTargetValues(ReducedReadLine rl, TargetLine tl) {
		// Forward read.
		if (rl.isForwardRead()) {
			if ((tl.start() <= rl.start()) && (rl.end() <= tl.end()))
				rc.incOn();
			if ((tl.start()-100 <= rl.start()) && (rl.end() <= tl.end()+100))
				rc.incOn100();
			if ((tl.start()-200 <= rl.start()) && (rl.end() <= tl.end()+200))
				rc.incOn200();
		} 
		// Reverse read.
		else {
			if (tl.start() <= rl.end() && rl.start() <= tl.end())
				rc.incOn();
			if (tl.start()-100 <= rl.end() && rl.start()<= tl.end()+100)
				rc.incOn100();
			if (tl.start()-200 <= rl.end() && rl.start() <= tl.end()+200)
				rc.incOn200();
		}
	}

}