Mercurial > repos > pfrommolt > ngsrich

diff NGSrich_0.5.5/src/_main/Enrichment.java @ 0:89ad0a9cca52 default tip
Uploaded
author: pfrommolt
date: Mon, 21 Nov 2011 08:12:19 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NGSrich_0.5.5/src/_main/Enrichment.java	Mon Nov 21 08:12:19 2011 -0500
@@ -0,0 +1,490 @@
+package _main;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.sql.Time;
+import java.util.Scanner;
+
+import middlewares.Misc;
+import converters.Read2Wig;
+import converters.ReadOnTarget2Wig;
+import exceptions.ChromosomeFormatException;
+import exceptions.ChromosomeMismatchException;
+import exceptions.ChromosomeNotFoundException;
+import exceptions.FileFormatException;
+import exceptions.GenomeAnnotationException;
+import exceptions.NullOrNegativeRangeException;
+import exceptions.RangeFormatException;
+import exceptions.RangeLimitNotFoundException;
+import filters.Filter;
+import filters.GenomeFilter;
+import filters.ReadFilter;
+import filters.TargetFilter;
+
+/**
+ * This is a scheduler for the phases of the evaluation.
+ * 
+ * @author Ali Abdallah
+ * @version 19.03.2011
+ * @since jdk 1.6
+ */
+
+public class Enrichment {
+	
+	String 
+	/**
+	 * The name of the read alignment file.
+	 */
+	readFileName, 
+	/**
+	 * the name of the genome annotation file.
+	 */
+	genomeFName, 
+	/**
+	 * the ucsc-name of the genome (e.g. hg19).
+	 */
+	genomeName, 
+	/**
+	 * the name of the file of the targeted regions.
+	 */
+	targetFName, 
+	/**
+	 * the temporary directory.
+	 */
+	tmpDir,
+	/**
+	 * the output directory.
+	 */
+	outDir,
+	/**
+	 * the xml summary file containing various overall statistics.
+	 */
+	xmlSummaryFile, 
+	/**
+	 * the name of the detailed coverage data file.
+	 */
+	detailsFileName, 
+	/**
+	 * the .bed file containing target-level coverage statistics data. 
+	 */
+	beddetailsFileName,
+	/**
+	 * the process number used for unique naming.
+	 */
+	proc, 
+	/**
+	 * the name of the read alignment file without extension.
+	 */
+	prefix, 
+	/**
+	 * the extension of the read alignment file.
+	 */
+	suffix,
+	/**
+	 * The name of the read alignment file after the reduction.
+	 */
+	areadFileName, 
+	/**
+	 * The name of the genome annotation file after the reduction.
+	 */
+	aGenomeFName, 
+	/**
+	 * the name of the file of the targeted regions after the reduction.
+	 */
+	aTargetFName;
+	int 
+	/**
+	 * Cutoff for poor coverage [default: 2].
+	 * 
+	 */
+	poor, 
+	/**
+	 * Cutoff for high coverage [default: 200].
+	 */
+	high,
+	details;
+
+	/**
+	 * Construcs an enrichment object and initializes the evaluation parameters.
+	 * @param args the list of all evaluation parameters.
+	 */
+	public Enrichment(String... args) {
+		readFileName = args[0];
+		genomeName = args[8];
+		targetFName = args[2];
+		if (!targetFName.endsWith("bed")) {
+			System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format.");
+		}
+		if (args.length > 3) {
+			Time time = new Time(System.currentTimeMillis());
+			String nano = ""+System.nanoTime();
+			
+			tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+
+															time.getHours()+"_"+
+																time.getMinutes()+"_"+
+																	time.getSeconds()+"_"+nano+ "/";
+			new File(tmpDir).mkdir();
+			new File(tmpDir).setReadable(true);
+			new File(tmpDir).setWritable(true);
+			outDir = args[4] + Misc.slash(args[4]);
+
+			if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) {
+				System.out.println("File " + tmpDir
+						+ " is not a directory.\nProgram stopped.");
+				System.exit(0);
+			} else if (!(new File(tmpDir).exists())) {
+				System.out.println("File " + tmpDir
+						+ " not found.\nProgram stopped.");
+				System.exit(0);
+			}
+		}
+		if (args.length > 5) {
+			try {
+				poor = Integer.parseInt(args[5]);
+				high = Integer.parseInt(args[6]);
+			} catch (NumberFormatException nfe) {
+				System.out
+						.println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values.");
+				poor = 2;
+				high = 200;
+			}
+		} else {
+			poor = 2;
+			high = 200;
+		}
+		
+		genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1];
+		
+		prefix = Misc.prefix(readFileName);
+		if (args[7] != "none") {
+			prefix = args[7];
+		}
+		details = Integer.parseInt(args[9]);
+		
+		suffix = Misc.suffix(readFileName);
+		proc = Misc.getHostName();
+		areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt";
+		aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt";
+		aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed";
+		detailsFileName = tmpDir + "coverage_" + proc + ".txt";
+		xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml";
+		beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed";
+	}
+	
+	public static final String[] GENOME_NAMES = {
+		"anoGam1", "bosTau4", "cb3", "ce4", "ce6",
+		"danRer5", "danRer6", "danRer7", "dm2", "dm3",
+		"galGal2", "galGal3", "hg19", "hg18", "mm8",
+		"mm9", "panTro3", "rn3", "rn4", "susScr2"
+	};
+	
+	public int genomeID(String gname){
+		for(int i = 0; i < GENOME_NAMES.length; i++)
+			if(GENOME_NAMES[i].equals(gname))
+				return i;
+		return -1;
+	}
+	
+	/**
+	 * Downloads the specified genome annotation, if internet connection
+	 * exists.
+	 * 
+	 * @param genome the ucsc-name of the genome. (e.g. hg18, hg19)
+	 * @return the very same name of the genome.
+	 * @throws GenomeAnnotationException 
+	 */
+	
+	public String getGenomeAnnotation() throws GenomeAnnotationException{
+		try{
+			return downloadGenomeAnnotation();
+		}catch(Exception e){
+		if(genomeID(genomeName)!=-1)
+			return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt";
+		else if(new File(genomeName).exists())
+			return genomeName;
+		else
+			throw new GenomeAnnotationException();
+		}
+	}
+	
+	public String getChromInfo(){
+		//genomeName
+		return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt";
+	}
+	
+	public String downloadGenomeAnnotation() {
+		if (genomeName.indexOf("/") == -1) {
+			String tmpGenomeDir = tmpDir + genomeName;
+			try {
+				if (!new File(tmpGenomeDir).exists()) {
+					new File(tmpGenomeDir).mkdir();
+				}
+				File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh");
+				if (!getGenomeScript.exists()) {
+					getGenomeScript.createNewFile();
+				}
+				FileWriter fw = new FileWriter(getGenomeScript);
+
+				getGenomeScript.setReadable(true);
+				getGenomeScript.setWritable(true);
+				getGenomeScript.setExecutable(true);
+				getGenomeScript.setExecutable(true);
+				String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir)
+						+ "refGene.txt.gz";
+				fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n"
+						+ "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName
+						+ "/database/refGene.txt.gz " + "-O " + tmpGenomeFile
+						+ "\n" + "gunzip " + tmpGenomeFile + "\n");
+				fw.close();
+				Runtime.getRuntime().exec("sh " + getGenomeScript);
+				try {
+					Thread.sleep(10000);
+				} catch (InterruptedException e) {
+					e.printStackTrace();
+				}
+				
+			} catch (IOException e) {
+				e.printStackTrace();
+			}
+			return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt";
+		} else {
+			return genomeName;
+		}
+	}
+
+	/**
+	 * A bridge method which converts a bam file to a sam file for further
+	 * processing. This method assures therefore the compatibility of the
+	 * software with the bam format.
+	 * 
+	 * @return the name of the generated sam file.
+	 * @throws IOException
+	 */
+	public String bam2sam() throws IOException {
+
+		String[] bampath = readFileName.split("/");
+		String bamfile = bampath[bampath.length - 1];
+		String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam";
+
+		File getBam2SamScript = new File(tmpDir + "bam2sam.sh");
+		if (!getBam2SamScript.exists()) {
+			getBam2SamScript.createNewFile();
+		}
+		FileWriter fw = new FileWriter(getBam2SamScript);
+		fw.write("#!/bin/bash\n" + Misc.binDir()
+				+ "/../thirdparty/samtools/0.1.16/samtools view "
+				+ readFileName + " > " + samfile);
+		fw.close();
+
+		Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript);
+		//		try {
+		//	@SuppressWarnings("unused")
+			// "exitstate" variable unused
+			    //int exitstate = p.waitFor();
+		//		} catch (InterruptedException e) {
+		//e.printStackTrace();
+		//		}
+
+		Scanner stderr=new Scanner(p.getErrorStream());
+		while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
+		stderr.close();
+
+		return (samfile);
+
+	}
+
+	/**
+	 * First phase of the evaluation. In this phase input files are being
+	 * simplified for further processing.
+	 * 
+	 * @throws FileFormatException
+	 * @throws ChromosomeMismatchException 
+	 */
+	public void reduceFiles() throws FileFormatException, ChromosomeMismatchException {
+		String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt";
+		String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt";
+		String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt";
+		
+		
+		Filter[] filters = {new ReadFilter(readFileName, aReadFileName), 
+							new GenomeFilter(genomeFName, aGenomeFName),
+							new TargetFilter(targetFName, aTargetFName)};
+		
+		for(Filter f: filters) f.filter();
+
+		if(!Misc.areChromosomeCompatible(new File(aReadFileName), 
+										 new File(aTargetFName)))
+			throw new ChromosomeMismatchException();
+		
+		System.out.println("\nSTEP 1 successfully completed");
+	}
+
+	/**
+	 * Second phase of the evaluation. An EnrichmentStatsComputer object is
+	 * called with the right parameters and computation is checked for formal
+	 * correctness.
+	 */
+	public void computeTargetCoverageFiles() {
+
+		File data = new File(outDir + "/data");
+		data.mkdir();
+		File plots = new File(outDir + "/plots");
+		plots.mkdir();
+
+		EnrichmentStatsComputer esc = 
+					new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir);
+		
+		try {
+			esc.compute();
+		} catch (RangeFormatException e) {
+			e.printStackTrace();
+		} catch (ChromosomeFormatException e) {
+			e.printStackTrace();
+		} catch (ChromosomeNotFoundException e) {
+			e.printStackTrace();
+		} catch (RangeLimitNotFoundException e) {
+			e.printStackTrace();
+		} catch (NullOrNegativeRangeException e) {	
+			e.printStackTrace();
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+		
+		if (new File(detailsFileName).exists()) {
+			System.out.println("\nSTEP 2 successfully completed");
+		} else {
+			System.out.println("STEP 2 incomplete.");
+		}
+
+	}
+
+	/**
+	 * Third phase of the evaluation. Based on the results from phase 2, the
+	 * evaluation is further developed and evaluation data are visualized by
+	 * calling "eval_enrichment.R" Script.
+	 */
+	public void evaluate() {
+		Runtime rt = Runtime.getRuntime();
+		try {
+			String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R";
+			int lastSlash = outDir.lastIndexOf("/");
+			String outDirR = (lastSlash == outDir.length() - 1) ? outDir
+					.substring(0, lastSlash) : outDir;
+			
+			String call = 	"Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " "
+							+ outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " " 
+							+ targetFName+" "+details;
+			
+			Process p=rt.exec(call);
+
+			Scanner stdout = new Scanner(p.getInputStream());
+			Scanner stderr=new Scanner(p.getErrorStream());
+			while (stdout.hasNextLine()){System.out.println(stdout.nextLine());}
+            while (stderr.hasNextLine()){System.out.println(stderr.nextLine());}
+			stdout.close(); stderr.close();
+
+			if(details == 1){
+				String chrInfoPath = getChromInfo();
+			
+				//System.out.println("chrInfoPath\t"+chrInfoPath);
+				script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R";
+                outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir;
+            
+                String call2 = 	"Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName 
+							+ " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " " 
+							+ genomeName;
+                Process p2 = rt.exec(call2);
+                stdout=new Scanner(p2.getInputStream());
+                stderr=new Scanner(p2.getErrorStream());
+                
+                while(stdout.hasNextLine()){System.out.println(stdout.nextLine());}
+                while(stderr.hasNextLine()){System.out.println(stderr.nextLine());}
+                stdout.close(); stderr.close();
+			}
+            
+			String path = outDir+prefix+"_enrichment.html";
+			System.out.println("Created plots and HTML summary report");
+			if (new File(path).exists()) {
+			    System.out.println("\nSTEP 3 successfully completed");
+			} else {
+			    System.out.println("HTML FILE " + path + " not found");
+			    System.out.println("\nSTEP 3 unsuccessful");
+			}
+		}
+		catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * This is the fourth phase. In this phase the detailed datta from phase 2
+	 * are converted to the wiggle format. The method uses a ReadOnTarget2Wig
+	 * object to accomplish this task.
+	 * 
+	 */
+	public void computeWiggleFile() {
+		try {
+			System.out.println("Computing wiggle file for on-target reads");
+			if (!new File(detailsFileName).exists())
+				detailsFileName = tmpDir +  new File(detailsFileName).getName();
+			
+			new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data", 
+					Misc.prefix(readFileName)+ "\nonTarget");
+			System.out.println("Wiggle file for on-target reads created");
+			System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig");
+			File[] outputFiles = new File(outDir + "data/")
+					.listFiles();
+			for (int i = 0; i < outputFiles.length; i++) {
+				if (outputFiles[i].getName().endsWith("wig")
+						&& outputFiles[i].getName().startsWith(
+								detailsFileName.substring(
+									detailsFileName.lastIndexOf("/") + 1,
+										detailsFileName.lastIndexOf(".")))) {
+					File f = new File(outDir + "data"
+							+ "/" + Misc.prefix(readFileName) + "_onTarget.wig");
+					outputFiles[i].renameTo(f);
+					break;
+				}
+			}
+			if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){
+				System.out.println("\nSTEP 4 successfully completed.");
+			} else {
+				System.out.println(outDir+"data/"+ prefix
+						+ "_onTarget.wig not found!\n\nSTEP 4 unsuccessful");
+			}
+		} catch (IOException e) {
+			System.err.println("Target conversion in Wig unsuccessful");
+			e.printStackTrace();
+		}
+	}
+
+	/**
+	 * This is the fifth and last phase of the evaluation process. In contrast
+	 * to the fourth phase, this method uses a Read2Wig object to convert the
+	 * detailed data for all reads (and not only for reads on targets) to the
+	 * wiggle format.
+	 * 
+	 */
+	void computeOverallWiggleFile() {
+		try {
+			System.out.println("Computing wiggle file for all reads");
+			new Read2Wig(areadFileName, prefix, outDir + "data", 
+														genomeFName, tmpDir);
+			String overallWiggleFileName = prefix + ".wig";
+			System.out.println("Wiggle file for all reads created");
+			System.out.println("Output written to " + outDir + "data/" + 
+														overallWiggleFileName);
+			if (new File(outDir + "data/" + overallWiggleFileName).exists()) {
+				System.out.println("\nSTEP 5 successfully completed.");
+			} else {
+				System.out.println(outDir + "data/" + overallWiggleFileName
+						+ "not found\n\nSTEP 5 unsuccessful");
+			}
+		} catch (IOException e) {
+			System.err
+					.println("Creating wiggle file for all reads was unsuccessful");
+			e.printStackTrace();
+		}
+	}
+}
\ No newline at end of file
author	pfrommolt
date	Mon, 21 Nov 2011 08:12:19 -0500
parents
children