Mercurial > repos > pfrommolt > ngsrich
diff NGSrich_0.5.5/src/_main/Enrichment.java @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NGSrich_0.5.5/src/_main/Enrichment.java Mon Nov 21 08:12:19 2011 -0500 @@ -0,0 +1,490 @@ +package _main; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.sql.Time; +import java.util.Scanner; + +import middlewares.Misc; +import converters.Read2Wig; +import converters.ReadOnTarget2Wig; +import exceptions.ChromosomeFormatException; +import exceptions.ChromosomeMismatchException; +import exceptions.ChromosomeNotFoundException; +import exceptions.FileFormatException; +import exceptions.GenomeAnnotationException; +import exceptions.NullOrNegativeRangeException; +import exceptions.RangeFormatException; +import exceptions.RangeLimitNotFoundException; +import filters.Filter; +import filters.GenomeFilter; +import filters.ReadFilter; +import filters.TargetFilter; + +/** + * This is a scheduler for the phases of the evaluation. + * + * @author Ali Abdallah + * @version 19.03.2011 + * @since jdk 1.6 + */ + +public class Enrichment { + + String + /** + * The name of the read alignment file. + */ + readFileName, + /** + * the name of the genome annotation file. + */ + genomeFName, + /** + * the ucsc-name of the genome (e.g. hg19). + */ + genomeName, + /** + * the name of the file of the targeted regions. + */ + targetFName, + /** + * the temporary directory. + */ + tmpDir, + /** + * the output directory. + */ + outDir, + /** + * the xml summary file containing various overall statistics. + */ + xmlSummaryFile, + /** + * the name of the detailed coverage data file. + */ + detailsFileName, + /** + * the .bed file containing target-level coverage statistics data. + */ + beddetailsFileName, + /** + * the process number used for unique naming. + */ + proc, + /** + * the name of the read alignment file without extension. + */ + prefix, + /** + * the extension of the read alignment file. + */ + suffix, + /** + * The name of the read alignment file after the reduction. + */ + areadFileName, + /** + * The name of the genome annotation file after the reduction. + */ + aGenomeFName, + /** + * the name of the file of the targeted regions after the reduction. + */ + aTargetFName; + int + /** + * Cutoff for poor coverage [default: 2]. + * + */ + poor, + /** + * Cutoff for high coverage [default: 200]. + */ + high, + details; + + /** + * Construcs an enrichment object and initializes the evaluation parameters. + * @param args the list of all evaluation parameters. + */ + public Enrichment(String... args) { + readFileName = args[0]; + genomeName = args[8]; + targetFName = args[2]; + if (!targetFName.endsWith("bed")) { + System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format."); + } + if (args.length > 3) { + Time time = new Time(System.currentTimeMillis()); + String nano = ""+System.nanoTime(); + + tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+ + time.getHours()+"_"+ + time.getMinutes()+"_"+ + time.getSeconds()+"_"+nano+ "/"; + new File(tmpDir).mkdir(); + new File(tmpDir).setReadable(true); + new File(tmpDir).setWritable(true); + outDir = args[4] + Misc.slash(args[4]); + + if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) { + System.out.println("File " + tmpDir + + " is not a directory.\nProgram stopped."); + System.exit(0); + } else if (!(new File(tmpDir).exists())) { + System.out.println("File " + tmpDir + + " not found.\nProgram stopped."); + System.exit(0); + } + } + if (args.length > 5) { + try { + poor = Integer.parseInt(args[5]); + high = Integer.parseInt(args[6]); + } catch (NumberFormatException nfe) { + System.out + .println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values."); + poor = 2; + high = 200; + } + } else { + poor = 2; + high = 200; + } + + genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1]; + + prefix = Misc.prefix(readFileName); + if (args[7] != "none") { + prefix = args[7]; + } + details = Integer.parseInt(args[9]); + + suffix = Misc.suffix(readFileName); + proc = Misc.getHostName(); + areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt"; + aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt"; + aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed"; + detailsFileName = tmpDir + "coverage_" + proc + ".txt"; + xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml"; + beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed"; + } + + public static final String[] GENOME_NAMES = { + "anoGam1", "bosTau4", "cb3", "ce4", "ce6", + "danRer5", "danRer6", "danRer7", "dm2", "dm3", + "galGal2", "galGal3", "hg19", "hg18", "mm8", + "mm9", "panTro3", "rn3", "rn4", "susScr2" + }; + + public int genomeID(String gname){ + for(int i = 0; i < GENOME_NAMES.length; i++) + if(GENOME_NAMES[i].equals(gname)) + return i; + return -1; + } + + /** + * Downloads the specified genome annotation, if internet connection + * exists. + * + * @param genome the ucsc-name of the genome. (e.g. hg18, hg19) + * @return the very same name of the genome. + * @throws GenomeAnnotationException + */ + + public String getGenomeAnnotation() throws GenomeAnnotationException{ + try{ + return downloadGenomeAnnotation(); + }catch(Exception e){ + if(genomeID(genomeName)!=-1) + return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt"; + else if(new File(genomeName).exists()) + return genomeName; + else + throw new GenomeAnnotationException(); + } + } + + public String getChromInfo(){ + //genomeName + return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt"; + } + + public String downloadGenomeAnnotation() { + if (genomeName.indexOf("/") == -1) { + String tmpGenomeDir = tmpDir + genomeName; + try { + if (!new File(tmpGenomeDir).exists()) { + new File(tmpGenomeDir).mkdir(); + } + File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh"); + if (!getGenomeScript.exists()) { + getGenomeScript.createNewFile(); + } + FileWriter fw = new FileWriter(getGenomeScript); + + getGenomeScript.setReadable(true); + getGenomeScript.setWritable(true); + getGenomeScript.setExecutable(true); + getGenomeScript.setExecutable(true); + String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir) + + "refGene.txt.gz"; + fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n" + + "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName + + "/database/refGene.txt.gz " + "-O " + tmpGenomeFile + + "\n" + "gunzip " + tmpGenomeFile + "\n"); + fw.close(); + Runtime.getRuntime().exec("sh " + getGenomeScript); + try { + Thread.sleep(10000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + + } catch (IOException e) { + e.printStackTrace(); + } + return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt"; + } else { + return genomeName; + } + } + + /** + * A bridge method which converts a bam file to a sam file for further + * processing. This method assures therefore the compatibility of the + * software with the bam format. + * + * @return the name of the generated sam file. + * @throws IOException + */ + public String bam2sam() throws IOException { + + String[] bampath = readFileName.split("/"); + String bamfile = bampath[bampath.length - 1]; + String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam"; + + File getBam2SamScript = new File(tmpDir + "bam2sam.sh"); + if (!getBam2SamScript.exists()) { + getBam2SamScript.createNewFile(); + } + FileWriter fw = new FileWriter(getBam2SamScript); + fw.write("#!/bin/bash\n" + Misc.binDir() + + "/../thirdparty/samtools/0.1.16/samtools view " + + readFileName + " > " + samfile); + fw.close(); + + Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript); + // try { + // @SuppressWarnings("unused") + // "exitstate" variable unused + //int exitstate = p.waitFor(); + // } catch (InterruptedException e) { + //e.printStackTrace(); + // } + + Scanner stderr=new Scanner(p.getErrorStream()); + while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} + stderr.close(); + + return (samfile); + + } + + /** + * First phase of the evaluation. In this phase input files are being + * simplified for further processing. + * + * @throws FileFormatException + * @throws ChromosomeMismatchException + */ + public void reduceFiles() throws FileFormatException, ChromosomeMismatchException { + String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt"; + String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt"; + String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt"; + + + Filter[] filters = {new ReadFilter(readFileName, aReadFileName), + new GenomeFilter(genomeFName, aGenomeFName), + new TargetFilter(targetFName, aTargetFName)}; + + for(Filter f: filters) f.filter(); + + if(!Misc.areChromosomeCompatible(new File(aReadFileName), + new File(aTargetFName))) + throw new ChromosomeMismatchException(); + + System.out.println("\nSTEP 1 successfully completed"); + } + + /** + * Second phase of the evaluation. An EnrichmentStatsComputer object is + * called with the right parameters and computation is checked for formal + * correctness. + */ + public void computeTargetCoverageFiles() { + + File data = new File(outDir + "/data"); + data.mkdir(); + File plots = new File(outDir + "/plots"); + plots.mkdir(); + + EnrichmentStatsComputer esc = + new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir); + + try { + esc.compute(); + } catch (RangeFormatException e) { + e.printStackTrace(); + } catch (ChromosomeFormatException e) { + e.printStackTrace(); + } catch (ChromosomeNotFoundException e) { + e.printStackTrace(); + } catch (RangeLimitNotFoundException e) { + e.printStackTrace(); + } catch (NullOrNegativeRangeException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + if (new File(detailsFileName).exists()) { + System.out.println("\nSTEP 2 successfully completed"); + } else { + System.out.println("STEP 2 incomplete."); + } + + } + + /** + * Third phase of the evaluation. Based on the results from phase 2, the + * evaluation is further developed and evaluation data are visualized by + * calling "eval_enrichment.R" Script. + */ + public void evaluate() { + Runtime rt = Runtime.getRuntime(); + try { + String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R"; + int lastSlash = outDir.lastIndexOf("/"); + String outDirR = (lastSlash == outDir.length() - 1) ? outDir + .substring(0, lastSlash) : outDir; + + String call = "Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " " + + outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " " + + targetFName+" "+details; + + Process p=rt.exec(call); + + Scanner stdout = new Scanner(p.getInputStream()); + Scanner stderr=new Scanner(p.getErrorStream()); + while (stdout.hasNextLine()){System.out.println(stdout.nextLine());} + while (stderr.hasNextLine()){System.out.println(stderr.nextLine());} + stdout.close(); stderr.close(); + + if(details == 1){ + String chrInfoPath = getChromInfo(); + + //System.out.println("chrInfoPath\t"+chrInfoPath); + script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R"; + outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir; + + String call2 = "Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName + + " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " " + + genomeName; + Process p2 = rt.exec(call2); + stdout=new Scanner(p2.getInputStream()); + stderr=new Scanner(p2.getErrorStream()); + + while(stdout.hasNextLine()){System.out.println(stdout.nextLine());} + while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} + stdout.close(); stderr.close(); + } + + String path = outDir+prefix+"_enrichment.html"; + System.out.println("Created plots and HTML summary report"); + if (new File(path).exists()) { + System.out.println("\nSTEP 3 successfully completed"); + } else { + System.out.println("HTML FILE " + path + " not found"); + System.out.println("\nSTEP 3 unsuccessful"); + } + } + catch (IOException e) { + e.printStackTrace(); + } + } + + /** + * This is the fourth phase. In this phase the detailed datta from phase 2 + * are converted to the wiggle format. The method uses a ReadOnTarget2Wig + * object to accomplish this task. + * + */ + public void computeWiggleFile() { + try { + System.out.println("Computing wiggle file for on-target reads"); + if (!new File(detailsFileName).exists()) + detailsFileName = tmpDir + new File(detailsFileName).getName(); + + new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data", + Misc.prefix(readFileName)+ "\nonTarget"); + System.out.println("Wiggle file for on-target reads created"); + System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig"); + File[] outputFiles = new File(outDir + "data/") + .listFiles(); + for (int i = 0; i < outputFiles.length; i++) { + if (outputFiles[i].getName().endsWith("wig") + && outputFiles[i].getName().startsWith( + detailsFileName.substring( + detailsFileName.lastIndexOf("/") + 1, + detailsFileName.lastIndexOf(".")))) { + File f = new File(outDir + "data" + + "/" + Misc.prefix(readFileName) + "_onTarget.wig"); + outputFiles[i].renameTo(f); + break; + } + } + if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){ + System.out.println("\nSTEP 4 successfully completed."); + } else { + System.out.println(outDir+"data/"+ prefix + + "_onTarget.wig not found!\n\nSTEP 4 unsuccessful"); + } + } catch (IOException e) { + System.err.println("Target conversion in Wig unsuccessful"); + e.printStackTrace(); + } + } + + /** + * This is the fifth and last phase of the evaluation process. In contrast + * to the fourth phase, this method uses a Read2Wig object to convert the + * detailed data for all reads (and not only for reads on targets) to the + * wiggle format. + * + */ + void computeOverallWiggleFile() { + try { + System.out.println("Computing wiggle file for all reads"); + new Read2Wig(areadFileName, prefix, outDir + "data", + genomeFName, tmpDir); + String overallWiggleFileName = prefix + ".wig"; + System.out.println("Wiggle file for all reads created"); + System.out.println("Output written to " + outDir + "data/" + + overallWiggleFileName); + if (new File(outDir + "data/" + overallWiggleFileName).exists()) { + System.out.println("\nSTEP 5 successfully completed."); + } else { + System.out.println(outDir + "data/" + overallWiggleFileName + + "not found\n\nSTEP 5 unsuccessful"); + } + } catch (IOException e) { + System.err + .println("Creating wiggle file for all reads was unsuccessful"); + e.printStackTrace(); + } + } +} \ No newline at end of file