view NGSrich_0.5.5/src/_main/Enrichment.java @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
line wrap: on
line source

package _main;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.sql.Time;
import java.util.Scanner;

import middlewares.Misc;
import converters.Read2Wig;
import converters.ReadOnTarget2Wig;
import exceptions.ChromosomeFormatException;
import exceptions.ChromosomeMismatchException;
import exceptions.ChromosomeNotFoundException;
import exceptions.FileFormatException;
import exceptions.GenomeAnnotationException;
import exceptions.NullOrNegativeRangeException;
import exceptions.RangeFormatException;
import exceptions.RangeLimitNotFoundException;
import filters.Filter;
import filters.GenomeFilter;
import filters.ReadFilter;
import filters.TargetFilter;

/**
 * This is a scheduler for the phases of the evaluation.
 * 
 * @author Ali Abdallah
 * @version 19.03.2011
 * @since jdk 1.6
 */

public class Enrichment {
	
	String 
	/**
	 * The name of the read alignment file.
	 */
	readFileName, 
	/**
	 * the name of the genome annotation file.
	 */
	genomeFName, 
	/**
	 * the ucsc-name of the genome (e.g. hg19).
	 */
	genomeName, 
	/**
	 * the name of the file of the targeted regions.
	 */
	targetFName, 
	/**
	 * the temporary directory.
	 */
	tmpDir,
	/**
	 * the output directory.
	 */
	outDir,
	/**
	 * the xml summary file containing various overall statistics.
	 */
	xmlSummaryFile, 
	/**
	 * the name of the detailed coverage data file.
	 */
	detailsFileName, 
	/**
	 * the .bed file containing target-level coverage statistics data. 
	 */
	beddetailsFileName,
	/**
	 * the process number used for unique naming.
	 */
	proc, 
	/**
	 * the name of the read alignment file without extension.
	 */
	prefix, 
	/**
	 * the extension of the read alignment file.
	 */
	suffix,
	/**
	 * The name of the read alignment file after the reduction.
	 */
	areadFileName, 
	/**
	 * The name of the genome annotation file after the reduction.
	 */
	aGenomeFName, 
	/**
	 * the name of the file of the targeted regions after the reduction.
	 */
	aTargetFName;
	int 
	/**
	 * Cutoff for poor coverage [default: 2].
	 * 
	 */
	poor, 
	/**
	 * Cutoff for high coverage [default: 200].
	 */
	high,
	details;

	/**
	 * Construcs an enrichment object and initializes the evaluation parameters.
	 * @param args the list of all evaluation parameters.
	 */
	public Enrichment(String... args) {
		readFileName = args[0];
		genomeName = args[8];
		targetFName = args[2];
		if (!targetFName.endsWith("bed")) {
			System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format.");
		}
		if (args.length > 3) {
			Time time = new Time(System.currentTimeMillis());
			String nano = ""+System.nanoTime();
			
			tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+
															time.getHours()+"_"+
																time.getMinutes()+"_"+
																	time.getSeconds()+"_"+nano+ "/";
			new File(tmpDir).mkdir();
			new File(tmpDir).setReadable(true);
			new File(tmpDir).setWritable(true);
			outDir = args[4] + Misc.slash(args[4]);

			if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) {
				System.out.println("File " + tmpDir
						+ " is not a directory.\nProgram stopped.");
				System.exit(0);
			} else if (!(new File(tmpDir).exists())) {
				System.out.println("File " + tmpDir
						+ " not found.\nProgram stopped.");
				System.exit(0);
			}
		}
		if (args.length > 5) {
			try {
				poor = Integer.parseInt(args[5]);
				high = Integer.parseInt(args[6]);
			} catch (NumberFormatException nfe) {
				System.out
						.println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values.");
				poor = 2;
				high = 200;
			}
		} else {
			poor = 2;
			high = 200;
		}
		
		genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1];
		
		prefix = Misc.prefix(readFileName);
		if (args[7] != "none") {
			prefix = args[7];
		}
		details = Integer.parseInt(args[9]);
		
		suffix = Misc.suffix(readFileName);
		proc = Misc.getHostName();
		areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt";
		aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt";
		aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed";
		detailsFileName = tmpDir + "coverage_" + proc + ".txt";
		xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml";
		beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed";
	}
	
	public static final String[] GENOME_NAMES = {
		"anoGam1", "bosTau4", "cb3", "ce4", "ce6",
		"danRer5", "danRer6", "danRer7", "dm2", "dm3",
		"galGal2", "galGal3", "hg19", "hg18", "mm8",
		"mm9", "panTro3", "rn3", "rn4", "susScr2"
	};
	
	public int genomeID(String gname){
		for(int i = 0; i < GENOME_NAMES.length; i++)
			if(GENOME_NAMES[i].equals(gname))
				return i;
		return -1;
	}
	
	/**
	 * Downloads the specified genome annotation, if internet connection
	 * exists.
	 * 
	 * @param genome the ucsc-name of the genome. (e.g. hg18, hg19)
	 * @return the very same name of the genome.
	 * @throws GenomeAnnotationException 
	 */
	
	public String getGenomeAnnotation() throws GenomeAnnotationException{
		try{
			return downloadGenomeAnnotation();
		}catch(Exception e){
		if(genomeID(genomeName)!=-1)
			return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt";
		else if(new File(genomeName).exists())
			return genomeName;
		else
			throw new GenomeAnnotationException();
		}
	}
	
	public String getChromInfo(){
		//genomeName
		return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt";
	}
	
	public String downloadGenomeAnnotation() {
		if (genomeName.indexOf("/") == -1) {
			String tmpGenomeDir = tmpDir + genomeName;
			try {
				if (!new File(tmpGenomeDir).exists()) {
					new File(tmpGenomeDir).mkdir();
				}
				File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh");
				if (!getGenomeScript.exists()) {
					getGenomeScript.createNewFile();
				}
				FileWriter fw = new FileWriter(getGenomeScript);

				getGenomeScript.setReadable(true);
				getGenomeScript.setWritable(true);
				getGenomeScript.setExecutable(true);
				getGenomeScript.setExecutable(true);
				String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir)
						+ "refGene.txt.gz";
				fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n"
						+ "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName
						+ "/database/refGene.txt.gz " + "-O " + tmpGenomeFile
						+ "\n" + "gunzip " + tmpGenomeFile + "\n");
				fw.close();
				Runtime.getRuntime().exec("sh " + getGenomeScript);
				try {
					Thread.sleep(10000);
				} catch (InterruptedException e) {
					e.printStackTrace();
				}
				
			} catch (IOException e) {
				e.printStackTrace();
			}
			return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt";
		} else {
			return genomeName;
		}
	}

	/**
	 * A bridge method which converts a bam file to a sam file for further
	 * processing. This method assures therefore the compatibility of the
	 * software with the bam format.
	 * 
	 * @return the name of the generated sam file.
	 * @throws IOException
	 */
	public String bam2sam() throws IOException {

		String[] bampath = readFileName.split("/");
		String bamfile = bampath[bampath.length - 1];
		String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam";

		File getBam2SamScript = new File(tmpDir + "bam2sam.sh");
		if (!getBam2SamScript.exists()) {
			getBam2SamScript.createNewFile();
		}
		FileWriter fw = new FileWriter(getBam2SamScript);
		fw.write("#!/bin/bash\n" + Misc.binDir()
				+ "/../thirdparty/samtools/0.1.16/samtools view "
				+ readFileName + " > " + samfile);
		fw.close();

		Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript);
		//		try {
		//	@SuppressWarnings("unused")
			// "exitstate" variable unused
			    //int exitstate = p.waitFor();
		//		} catch (InterruptedException e) {
		//e.printStackTrace();
		//		}

		Scanner stderr=new Scanner(p.getErrorStream());
		while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
		stderr.close();

		return (samfile);

	}

	/**
	 * First phase of the evaluation. In this phase input files are being
	 * simplified for further processing.
	 * 
	 * @throws FileFormatException
	 * @throws ChromosomeMismatchException 
	 */
	public void reduceFiles() throws FileFormatException, ChromosomeMismatchException {
		String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt";
		String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt";
		String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt";
		
		
		Filter[] filters = {new ReadFilter(readFileName, aReadFileName), 
							new GenomeFilter(genomeFName, aGenomeFName),
							new TargetFilter(targetFName, aTargetFName)};
		
		for(Filter f: filters) f.filter();

		if(!Misc.areChromosomeCompatible(new File(aReadFileName), 
										 new File(aTargetFName)))
			throw new ChromosomeMismatchException();
		
		System.out.println("\nSTEP 1 successfully completed");
	}

	/**
	 * Second phase of the evaluation. An EnrichmentStatsComputer object is
	 * called with the right parameters and computation is checked for formal
	 * correctness.
	 */
	public void computeTargetCoverageFiles() {

		File data = new File(outDir + "/data");
		data.mkdir();
		File plots = new File(outDir + "/plots");
		plots.mkdir();

		EnrichmentStatsComputer esc = 
					new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir);
		
		try {
			esc.compute();
		} catch (RangeFormatException e) {
			e.printStackTrace();
		} catch (ChromosomeFormatException e) {
			e.printStackTrace();
		} catch (ChromosomeNotFoundException e) {
			e.printStackTrace();
		} catch (RangeLimitNotFoundException e) {
			e.printStackTrace();
		} catch (NullOrNegativeRangeException e) {	
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		
		if (new File(detailsFileName).exists()) {
			System.out.println("\nSTEP 2 successfully completed");
		} else {
			System.out.println("STEP 2 incomplete.");
		}

	}

	/**
	 * Third phase of the evaluation. Based on the results from phase 2, the
	 * evaluation is further developed and evaluation data are visualized by
	 * calling "eval_enrichment.R" Script.
	 */
	public void evaluate() {
		Runtime rt = Runtime.getRuntime();
		try {
			String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R";
			int lastSlash = outDir.lastIndexOf("/");
			String outDirR = (lastSlash == outDir.length() - 1) ? outDir
					.substring(0, lastSlash) : outDir;
			
			String call = 	"Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " "
							+ outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " " 
							+ targetFName+" "+details;
			
			Process p=rt.exec(call);

			Scanner stdout = new Scanner(p.getInputStream());
			Scanner stderr=new Scanner(p.getErrorStream());
			while (stdout.hasNextLine()){System.out.println(stdout.nextLine());}
            while (stderr.hasNextLine()){System.out.println(stderr.nextLine());}
			stdout.close(); stderr.close();

			if(details == 1){
				String chrInfoPath = getChromInfo();
			
				//System.out.println("chrInfoPath\t"+chrInfoPath);
				script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R";
                outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir;
            
                String call2 = 	"Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName 
							+ " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " " 
							+ genomeName;
                Process p2 = rt.exec(call2);
                stdout=new Scanner(p2.getInputStream());
                stderr=new Scanner(p2.getErrorStream());
                
                while(stdout.hasNextLine()){System.out.println(stdout.nextLine());}
                while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
                stdout.close(); stderr.close();
			}
            
			String path = outDir+prefix+"_enrichment.html";
			System.out.println("Created plots and HTML summary report");
			if (new File(path).exists()) {
			    System.out.println("\nSTEP 3 successfully completed");
			} else {
			    System.out.println("HTML FILE " + path + " not found");
			    System.out.println("\nSTEP 3 unsuccessful");
			}
		}
		catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * This is the fourth phase. In this phase the detailed datta from phase 2
	 * are converted to the wiggle format. The method uses a ReadOnTarget2Wig
	 * object to accomplish this task.
	 * 
	 */
	public void computeWiggleFile() {
		try {
			System.out.println("Computing wiggle file for on-target reads");
			if (!new File(detailsFileName).exists())
				detailsFileName = tmpDir +  new File(detailsFileName).getName();
			
			new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data", 
					Misc.prefix(readFileName)+ "\nonTarget");
			System.out.println("Wiggle file for on-target reads created");
			System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig");
			File[] outputFiles = new File(outDir + "data/")
					.listFiles();
			for (int i = 0; i < outputFiles.length; i++) {
				if (outputFiles[i].getName().endsWith("wig")
						&& outputFiles[i].getName().startsWith(
								detailsFileName.substring(
									detailsFileName.lastIndexOf("/") + 1,
										detailsFileName.lastIndexOf(".")))) {
					File f = new File(outDir + "data"
							+ "/" + Misc.prefix(readFileName) + "_onTarget.wig");
					outputFiles[i].renameTo(f);
					break;
				}
			}
			if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){
				System.out.println("\nSTEP 4 successfully completed.");
			} else {
				System.out.println(outDir+"data/"+ prefix
						+ "_onTarget.wig not found!\n\nSTEP 4 unsuccessful");
			}
		} catch (IOException e) {
			System.err.println("Target conversion in Wig unsuccessful");
			e.printStackTrace();
		}
	}

	/**
	 * This is the fifth and last phase of the evaluation process. In contrast
	 * to the fourth phase, this method uses a Read2Wig object to convert the
	 * detailed data for all reads (and not only for reads on targets) to the
	 * wiggle format.
	 * 
	 */
	void computeOverallWiggleFile() {
		try {
			System.out.println("Computing wiggle file for all reads");
			new Read2Wig(areadFileName, prefix, outDir + "data", 
														genomeFName, tmpDir);
			String overallWiggleFileName = prefix + ".wig";
			System.out.println("Wiggle file for all reads created");
			System.out.println("Output written to " + outDir + "data/" + 
														overallWiggleFileName);
			if (new File(outDir + "data/" + overallWiggleFileName).exists()) {
				System.out.println("\nSTEP 5 successfully completed.");
			} else {
				System.out.println(outDir + "data/" + overallWiggleFileName
						+ "not found\n\nSTEP 5 unsuccessful");
			}
		} catch (IOException e) {
			System.err
					.println("Creating wiggle file for all reads was unsuccessful");
			e.printStackTrace();
		}
	}
}