Mercurial > repos > pfrommolt > ngsrich
view NGSrich_0.5.5/src/_main/Enrichment.java @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
line wrap: on
line source
package _main; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.sql.Time; import java.util.Scanner; import middlewares.Misc; import converters.Read2Wig; import converters.ReadOnTarget2Wig; import exceptions.ChromosomeFormatException; import exceptions.ChromosomeMismatchException; import exceptions.ChromosomeNotFoundException; import exceptions.FileFormatException; import exceptions.GenomeAnnotationException; import exceptions.NullOrNegativeRangeException; import exceptions.RangeFormatException; import exceptions.RangeLimitNotFoundException; import filters.Filter; import filters.GenomeFilter; import filters.ReadFilter; import filters.TargetFilter; /** * This is a scheduler for the phases of the evaluation. * * @author Ali Abdallah * @version 19.03.2011 * @since jdk 1.6 */ public class Enrichment { String /** * The name of the read alignment file. */ readFileName, /** * the name of the genome annotation file. */ genomeFName, /** * the ucsc-name of the genome (e.g. hg19). */ genomeName, /** * the name of the file of the targeted regions. */ targetFName, /** * the temporary directory. */ tmpDir, /** * the output directory. */ outDir, /** * the xml summary file containing various overall statistics. */ xmlSummaryFile, /** * the name of the detailed coverage data file. */ detailsFileName, /** * the .bed file containing target-level coverage statistics data. */ beddetailsFileName, /** * the process number used for unique naming. */ proc, /** * the name of the read alignment file without extension. */ prefix, /** * the extension of the read alignment file. */ suffix, /** * The name of the read alignment file after the reduction. */ areadFileName, /** * The name of the genome annotation file after the reduction. */ aGenomeFName, /** * the name of the file of the targeted regions after the reduction. */ aTargetFName; int /** * Cutoff for poor coverage [default: 2]. * */ poor, /** * Cutoff for high coverage [default: 200]. */ high, details; /** * Construcs an enrichment object and initializes the evaluation parameters. * @param args the list of all evaluation parameters. */ public Enrichment(String... args) { readFileName = args[0]; genomeName = args[8]; targetFName = args[2]; if (!targetFName.endsWith("bed")) { System.out.println("WARNING: Target file: " + targetFName + " must be in the bed format."); } if (args.length > 3) { Time time = new Time(System.currentTimeMillis()); String nano = ""+System.nanoTime(); tmpDir = args[3] + Misc.slash(args[3]) +"Sample_From_"+ time.getHours()+"_"+ time.getMinutes()+"_"+ time.getSeconds()+"_"+nano+ "/"; new File(tmpDir).mkdir(); new File(tmpDir).setReadable(true); new File(tmpDir).setWritable(true); outDir = args[4] + Misc.slash(args[4]); if (!(new File(tmpDir).isDirectory()) && new File(tmpDir).exists()) { System.out.println("File " + tmpDir + " is not a directory.\nProgram stopped."); System.exit(0); } else if (!(new File(tmpDir).exists())) { System.out.println("File " + tmpDir + " not found.\nProgram stopped."); System.exit(0); } } if (args.length > 5) { try { poor = Integer.parseInt(args[5]); high = Integer.parseInt(args[6]); } catch (NumberFormatException nfe) { System.out .println("Warning: poor or high must be integers.\n<poor> and <high> are set to the standard values."); poor = 2; high = 200; } } else { poor = 2; high = 200; } genomeFName = (args[1].equals("none"))?downloadGenomeAnnotation():args[1]; prefix = Misc.prefix(readFileName); if (args[7] != "none") { prefix = args[7]; } details = Integer.parseInt(args[9]); suffix = Misc.suffix(readFileName); proc = Misc.getHostName(); areadFileName = tmpDir + "NGSrich_" + prefix + "_" + proc + ".txt"; aGenomeFName = tmpDir + "NGSrich_genome_" + proc + ".txt"; aTargetFName = tmpDir + "NGSrich_target_" + proc + ".bed"; detailsFileName = tmpDir + "coverage_" + proc + ".txt"; xmlSummaryFile = outDir + "data/" + prefix + "_enrichment.xml"; beddetailsFileName = outDir + "data/" + prefix + "_enrichment.bed"; } public static final String[] GENOME_NAMES = { "anoGam1", "bosTau4", "cb3", "ce4", "ce6", "danRer5", "danRer6", "danRer7", "dm2", "dm3", "galGal2", "galGal3", "hg19", "hg18", "mm8", "mm9", "panTro3", "rn3", "rn4", "susScr2" }; public int genomeID(String gname){ for(int i = 0; i < GENOME_NAMES.length; i++) if(GENOME_NAMES[i].equals(gname)) return i; return -1; } /** * Downloads the specified genome annotation, if internet connection * exists. * * @param genome the ucsc-name of the genome. (e.g. hg18, hg19) * @return the very same name of the genome. * @throws GenomeAnnotationException */ public String getGenomeAnnotation() throws GenomeAnnotationException{ try{ return downloadGenomeAnnotation(); }catch(Exception e){ if(genomeID(genomeName)!=-1) return Misc.scriptDir()+"thirdparty/annotations/"+genomeName+".txt"; else if(new File(genomeName).exists()) return genomeName; else throw new GenomeAnnotationException(); } } public String getChromInfo(){ //genomeName return Misc.scriptDir()+"thirdparty/chromInfo/"+genomeName+".txt"; } public String downloadGenomeAnnotation() { if (genomeName.indexOf("/") == -1) { String tmpGenomeDir = tmpDir + genomeName; try { if (!new File(tmpGenomeDir).exists()) { new File(tmpGenomeDir).mkdir(); } File getGenomeScript = new File(tmpGenomeDir + Misc.slash(tmpGenomeDir) + "getGenome.sh"); if (!getGenomeScript.exists()) { getGenomeScript.createNewFile(); } FileWriter fw = new FileWriter(getGenomeScript); getGenomeScript.setReadable(true); getGenomeScript.setWritable(true); getGenomeScript.setExecutable(true); getGenomeScript.setExecutable(true); String tmpGenomeFile = tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt.gz"; fw.write("#!/bin/bash\n" + "cd " + tmpGenomeDir + "\n" + "wget http://hgdownload.cse.ucsc.edu/goldenPath/" + genomeName + "/database/refGene.txt.gz " + "-O " + tmpGenomeFile + "\n" + "gunzip " + tmpGenomeFile + "\n"); fw.close(); Runtime.getRuntime().exec("sh " + getGenomeScript); try { Thread.sleep(10000); } catch (InterruptedException e) { e.printStackTrace(); } } catch (IOException e) { e.printStackTrace(); } return tmpGenomeDir + Misc.slash(tmpGenomeDir) + "refGene.txt"; } else { return genomeName; } } /** * A bridge method which converts a bam file to a sam file for further * processing. This method assures therefore the compatibility of the * software with the bam format. * * @return the name of the generated sam file. * @throws IOException */ public String bam2sam() throws IOException { String[] bampath = readFileName.split("/"); String bamfile = bampath[bampath.length - 1]; String samfile = tmpDir + bamfile.split(".bam")[0] + ".sam"; File getBam2SamScript = new File(tmpDir + "bam2sam.sh"); if (!getBam2SamScript.exists()) { getBam2SamScript.createNewFile(); } FileWriter fw = new FileWriter(getBam2SamScript); fw.write("#!/bin/bash\n" + Misc.binDir() + "/../thirdparty/samtools/0.1.16/samtools view " + readFileName + " > " + samfile); fw.close(); Process p = Runtime.getRuntime().exec("sh " + getBam2SamScript); // try { // @SuppressWarnings("unused") // "exitstate" variable unused //int exitstate = p.waitFor(); // } catch (InterruptedException e) { //e.printStackTrace(); // } Scanner stderr=new Scanner(p.getErrorStream()); while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} stderr.close(); return (samfile); } /** * First phase of the evaluation. In this phase input files are being * simplified for further processing. * * @throws FileFormatException * @throws ChromosomeMismatchException */ public void reduceFiles() throws FileFormatException, ChromosomeMismatchException { String aReadFileName = tmpDir+"NGSrich_"+prefix+"_"+ proc + ".txt"; String aGenomeFName = tmpDir+"NGSrich_genome_" + proc + ".txt"; String aTargetFName = tmpDir+"NGSrich_target_"+ proc+".txt"; Filter[] filters = {new ReadFilter(readFileName, aReadFileName), new GenomeFilter(genomeFName, aGenomeFName), new TargetFilter(targetFName, aTargetFName)}; for(Filter f: filters) f.filter(); if(!Misc.areChromosomeCompatible(new File(aReadFileName), new File(aTargetFName))) throw new ChromosomeMismatchException(); System.out.println("\nSTEP 1 successfully completed"); } /** * Second phase of the evaluation. An EnrichmentStatsComputer object is * called with the right parameters and computation is checked for formal * correctness. */ public void computeTargetCoverageFiles() { File data = new File(outDir + "/data"); data.mkdir(); File plots = new File(outDir + "/plots"); plots.mkdir(); EnrichmentStatsComputer esc = new EnrichmentStatsComputer(prefix, proc,tmpDir, outDir); try { esc.compute(); } catch (RangeFormatException e) { e.printStackTrace(); } catch (ChromosomeFormatException e) { e.printStackTrace(); } catch (ChromosomeNotFoundException e) { e.printStackTrace(); } catch (RangeLimitNotFoundException e) { e.printStackTrace(); } catch (NullOrNegativeRangeException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } if (new File(detailsFileName).exists()) { System.out.println("\nSTEP 2 successfully completed"); } else { System.out.println("STEP 2 incomplete."); } } /** * Third phase of the evaluation. Based on the results from phase 2, the * evaluation is further developed and evaluation data are visualized by * calling "eval_enrichment.R" Script. */ public void evaluate() { Runtime rt = Runtime.getRuntime(); try { String script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_enrichment.R"; int lastSlash = outDir.lastIndexOf("/"); String outDirR = (lastSlash == outDir.length() - 1) ? outDir .substring(0, lastSlash) : outDir; String call = "Rscript " + script + " "+ xmlSummaryFile + " " + beddetailsFileName + " " + outDirR + " " + genomeName + " " + poor + " " + high + " " + prefix + " " + targetFName+" "+details; Process p=rt.exec(call); Scanner stdout = new Scanner(p.getInputStream()); Scanner stderr=new Scanner(p.getErrorStream()); while (stdout.hasNextLine()){System.out.println(stdout.nextLine());} while (stderr.hasNextLine()){System.out.println(stderr.nextLine());} stdout.close(); stderr.close(); if(details == 1){ String chrInfoPath = getChromInfo(); //System.out.println("chrInfoPath\t"+chrInfoPath); script = Misc.scriptDir() + Misc.slash(Misc.scriptDir())+"R/eval_details.R"; outDirR = (lastSlash == outDir.length() - 1) ? outDir.substring(0, lastSlash) : outDir; String call2 = "Rscript "+ script + " " + xmlSummaryFile + " " + beddetailsFileName + " " + detailsFileName + " " + chrInfoPath + " " + outDirR + " " + genomeName; Process p2 = rt.exec(call2); stdout=new Scanner(p2.getInputStream()); stderr=new Scanner(p2.getErrorStream()); while(stdout.hasNextLine()){System.out.println(stdout.nextLine());} while(stderr.hasNextLine()){System.out.println(stderr.nextLine());} stdout.close(); stderr.close(); } String path = outDir+prefix+"_enrichment.html"; System.out.println("Created plots and HTML summary report"); if (new File(path).exists()) { System.out.println("\nSTEP 3 successfully completed"); } else { System.out.println("HTML FILE " + path + " not found"); System.out.println("\nSTEP 3 unsuccessful"); } } catch (IOException e) { e.printStackTrace(); } } /** * This is the fourth phase. In this phase the detailed datta from phase 2 * are converted to the wiggle format. The method uses a ReadOnTarget2Wig * object to accomplish this task. * */ public void computeWiggleFile() { try { System.out.println("Computing wiggle file for on-target reads"); if (!new File(detailsFileName).exists()) detailsFileName = tmpDir + new File(detailsFileName).getName(); new ReadOnTarget2Wig(detailsFileName, prefix, outDir + "data", Misc.prefix(readFileName)+ "\nonTarget"); System.out.println("Wiggle file for on-target reads created"); System.out.println("Output written to " + outDir + "data/" + prefix + "_onTarget.wig"); File[] outputFiles = new File(outDir + "data/") .listFiles(); for (int i = 0; i < outputFiles.length; i++) { if (outputFiles[i].getName().endsWith("wig") && outputFiles[i].getName().startsWith( detailsFileName.substring( detailsFileName.lastIndexOf("/") + 1, detailsFileName.lastIndexOf(".")))) { File f = new File(outDir + "data" + "/" + Misc.prefix(readFileName) + "_onTarget.wig"); outputFiles[i].renameTo(f); break; } } if (new File(outDir+"data/"+prefix+"_onTarget.wig").exists()){ System.out.println("\nSTEP 4 successfully completed."); } else { System.out.println(outDir+"data/"+ prefix + "_onTarget.wig not found!\n\nSTEP 4 unsuccessful"); } } catch (IOException e) { System.err.println("Target conversion in Wig unsuccessful"); e.printStackTrace(); } } /** * This is the fifth and last phase of the evaluation process. In contrast * to the fourth phase, this method uses a Read2Wig object to convert the * detailed data for all reads (and not only for reads on targets) to the * wiggle format. * */ void computeOverallWiggleFile() { try { System.out.println("Computing wiggle file for all reads"); new Read2Wig(areadFileName, prefix, outDir + "data", genomeFName, tmpDir); String overallWiggleFileName = prefix + ".wig"; System.out.println("Wiggle file for all reads created"); System.out.println("Output written to " + outDir + "data/" + overallWiggleFileName); if (new File(outDir + "data/" + overallWiggleFileName).exists()) { System.out.println("\nSTEP 5 successfully completed."); } else { System.out.println(outDir + "data/" + overallWiggleFileName + "not found\n\nSTEP 5 unsuccessful"); } } catch (IOException e) { System.err .println("Creating wiggle file for all reads was unsuccessful"); e.printStackTrace(); } } }