Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site
diff fastqc_site_render.R @ 11:507eec497730 draft
update fastqc site
| author | mingchen0919 | 
|---|---|
| date | Tue, 07 Nov 2017 16:52:24 -0500 | 
| parents | d820be692d74 | 
| children | a6f8382f852c | 
line wrap: on
 line diff
--- a/fastqc_site_render.R Tue Aug 15 15:50:21 2017 -0400 +++ b/fastqc_site_render.R Tue Nov 07 16:52:24 2017 -0500 @@ -1,195 +1,283 @@ -##======= Handle arguments from command line ======== -# setup R error handline to go to stderr -options(show.error.messages=FALSE, - error=function(){ - cat(geterrmessage(), file=stderr()) - quit("no", 1, F) - }) - -# we need that to not crash galaxy with an UTF8 error on German LC settings. -loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") - -# suppress warning -options(warn = -1) - -options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) -args = commandArgs(trailingOnly=TRUE) - -suppressPackageStartupMessages({ - library(getopt) - library(tools) -}) - -# column 1: the long flag name -# column 2: the short flag alias. A SINGLE character string -# column 3: argument mask -# 0: no argument -# 1: argument required -# 2: argument is optional -# column 4: date type to which the flag's argument shall be cast. -# possible values: logical, integer, double, complex, character. -spec_list=list() - -##------- 1. input data --------------------- -spec_list$READS = c('reads', 'r', '1', 'character') -spec_list$ECHO = c('echo', 'e', '1', 'character') - -##--------2. output report and report site directory -------------- -spec_list$FASTQC_SITE = c('fastqc_site', 'o', '1', 'character') -spec_list$FASTQC_SITE_DIR = c('fastqc_site_dir', 'd', '1', 'character') - -##--------3. Rmd templates sitting in the tool directory ---------- - - ## _site.yml and index.Rmd files - spec_list$SITE_YML = c('site_yml', 's', 1, 'character') - spec_list$INDEX_Rmd = c('index_rmd', 'i', 1, 'character') - - ## other Rmd body template files - spec_list$x01 = c('x01_evaluation_overview', 'p', '1', 'character') - spec_list$x02 = c('x02_fastqc_original_reports', 'a', '1', 'character') - spec_list$x1 = c('x1_per_base_quality_scores', 'b', '1', 'character') - spec_list$x2 = c('x2_per_base_N_content', 'c', '1', 'character') - spec_list$x3 = c('x3_per_sequence_quality_scores', 'f', '1', 'character') - spec_list$x4 = c('x4_per_sequence_GC_content', 'g', '1', 'character') - spec_list$x5 = c('x5_per_base_sequence_content', 'h', '1', 'character') - -##------------------------------------------------------------------ - -spec = t(as.data.frame(spec_list)) -opt = getopt(spec) -# arguments are accessed by long flag name (the first column in the spec matrix) -# NOT by element name in the spec_list -# example: opt$help, opt$expression_file -##====== End of arguments handling ========== - -#------ Load libraries --------- +library(getopt) library(rmarkdown) +library(htmltools) library(plyr) +library(dplyr) library(stringr) -library(dplyr) library(highcharter) library(DT) library(reshape2) library(plotly) library(formattable) -library(htmltools) - +options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) -#----- 1. create the report directory ------------------------ -paste0('mkdir -p ', opt$fastqc_site_dir) %>% - system() - -#----- 2. generate Rmd files with Rmd templates -------------- -# a. templates without placeholder variables: -# copy templates from tool directory to the working directory. -# b. templates with placeholder variables: -# substitute variables with user input values and place them in the working directory. +##============ Sink warnings and errors to a file ============== +## use the sink() function to wrap all code within it. +##============================================================== +zz = file('warnings_and_errors.txt') +sink(zz) +sink(zz, type = 'message') + ##---------below is the code for rendering .Rmd templates----- + + ##=============STEP 1: handle command line arguments========== + ## + ##============================================================ + # column 1: the long flag name + # column 2: the short flag alias. A SINGLE character string + # column 3: argument mask + # 0: no argument + # 1: argument required + # 2: argument is optional + # column 4: date type to which the flag's argument shall be cast. + # possible values: logical, integer, double, complex, character. + #------------------------------------------------------------- + #++++++++++++++++++++ Best practice ++++++++++++++++++++++++++ + # 1. short flag alias should match the flag in the command section in the XML file. + # 2. long flag name can be any legal R variable names + # 3. two names in args_list can have common string but one name should not be a part of another name. + # for example, one name is "ECHO", if another name is "ECHO_XXX", it will cause problems. + #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + args_list=list() + ##------- 1. input data --------------------- + args_list$ECHO = c('echo', 'e', '1', 'character') + args_list$READS_1 = c('reads_1', 'r', '1', 'character') + args_list$NAME_1 = c('name_1', 'n', '1', 'character') + args_list$READS_2 = c('reads_2', 'R', '1', 'character') + args_list$NAME_2 = c('name_2', 'N', '1', 'character') + args_list$CONTAMINANTS = c('contaminants', 'c', '1', 'character') + args_list$LIMITS = c('limits', 'l', '1', 'character') + ##--------2. output report and outputs -------------- + args_list$REPORT_HTML = c('report_html', 'o', '1', 'character') + args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character') + args_list$SINK_MESSAGE = c('sink_message', 's', '1', 'character') + ##--------3. .Rmd templates in the tool directory ---------- + args_list$SITE_YML = c('site_yml', 'S', '1', 'character') + args_list$INDEX_RMD = c('index_rmd', 'I', '1', 'character') + args_list$X01_EVALUATION_OVERVIEW = c('x01_evaluation_overview', 'A', '1', 'character') + args_list$X02_PER_BASE_SEQUENCE_QUALITY = c('x02_per_base_sequence_quality', 'B', '1', 'character') + args_list$X03_PER_TILE_SEQUENCE_QUALITY = c('x03_per_tile_sequence_quality', 'C', '1', 'character') + args_list$X04_PER_SEQUENCE_QUALITY_SCORE = c('x04_per_sequence_quality_score', 'D', '1', 'character') + args_list$X05_PER_BASE_SEQUENCE_CONTENT = c('x05_per_base_sequence_content', 'E', '1', 'character') + args_list$X06_PER_SEQUENCE_GC_CONTENT = c('x06_per_sequence_gc_content', 'F', '1', 'character') + args_list$X07_PER_BASE_N_CONTENT = c('x07_per_base_n_content', 'G', '1', 'character') + args_list$X08_SEQUENCE_LENGTH_DISTRIBUTION = c('x08_sequence_length_distribution', 'H', '1', 'character') + args_list$X09_SEQUENCE_DUPLICATION_LEVELS = c('x09_sequence_duplication_levels', 'J', '1', 'character') + args_list$X10_ADAPTER_CONTENT = c('x10_adapter_content', 'K', '1', 'character') + args_list$X11_KMER_CONTENT = c('x11_kmer_content', 'L', '1', 'character') + ##----------------------------------------------------------- + opt = getopt(t(as.data.frame(args_list))) - #----- Copy index.Rmd and _site.yml files to job working direcotry ----- - file.copy(opt$index_rmd, 'index.Rmd', recursive=TRUE) - file.copy(opt$site_yml, '_site.yml', recursive=TRUE) - #--------------------------------------------------------- - - #----- 01_evaluation_overview.Rmd ----------------------- - readLines(opt$x01_evaluation_overview) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - gsub('READS', opt$reads, x) - }) %>% - (function(x) { - gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x) - }) %>% - (function(x) { - fileConn = file('01_evaluation_overview.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 1_per_base_quality_scores.Rmd -------------------- - readLines(opt$x1_per_base_quality_scores) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('1_per_base_quality_scores.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 2_per_base_N_content.Rmd ------------------------- - readLines(opt$x2_per_base_N_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('2_per_base_N_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - #----- 3_per_sequence_quality_scores.Rmd ---------------- - readLines(opt$x3_per_sequence_quality_scores) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('3_per_sequence_quality_scores.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - - #----- 4_per_sequence_GC_content.Rmd -------------------- - readLines(opt$x4_per_sequence_GC_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('4_per_sequence_GC_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) - - - #----- 5_per_base_sequence_content.Rmd ------------------ - readLines(opt$x5_per_base_sequence_content) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - fileConn = file('5_per_base_sequence_content.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) + + ##=======STEP 2: create report directory (optional)========== + ## + ##=========================================================== + dir.create(opt$report_dir) + + ##==STEP 3: copy index.Rmd and _site.yml to job working directory====== + ## + ##===================================================================== + file.copy(opt$index_rmd, 'index.Rmd') + file.copy(opt$site_yml, '_site.yml') + + ##=STEP 4: replace placeholders in .Rmd files with argument values= + ## + ##================================================================= + #++ need to replace placeholders with args values one by one+ + + # 01_evaluation_overview.Rmd + readLines(opt$x01_evaluation_overview) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('READS_1', opt$reads_1, x) + }) %>% + (function(x) { + gsub('NAME_1', opt$name_1, x) + }) %>% + (function(x) { + gsub('READS_2', opt$reads_2, x) + }) %>% + (function(x) { + gsub('NAME_2', opt$name_1, x) + }) %>% + (function(x) { + gsub('CONTAMINANTS', opt$contaminants, x) + }) %>% + (function(x) { + gsub('LIMITS', opt$limits, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x01_evaluation_overview.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 02_per_base_sequence_quality.Rmd + readLines(opt$x02_per_base_sequence_quality) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x02_per_base_sequence_quality.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 03_per_tile_sequence_quality.Rmd + readLines(opt$x03_per_tile_sequence_quality) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x03_per_tile_sequence_quality.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 04_per_sequence_quality_score.Rmd + readLines(opt$x04_per_sequence_quality_score) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x04_per_sequence_quality_score.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 05_per_base_sequence_content.Rmd + readLines(opt$x05_per_base_sequence_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x05_per_base_sequence_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 06_per_sequence_gc_content.Rmd + readLines(opt$x06_per_sequence_gc_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x06_per_sequence_gc_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 07_per_base_n_content.Rmd + readLines(opt$x07_per_base_n_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x07_per_base_n_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) - #----- 02_fastqc_original_reports.Rmd ------------------- - readLines(opt$x02_fastqc_original_reports) %>% - (function(x) { - gsub('ECHO', opt$echo, x) - }) %>% - (function(x) { - gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x) - }) %>% - (function(x) { - fileConn = file('02_fastqc_original_reports.Rmd') - writeLines(x, con=fileConn) - close(fileConn) - }) + # 08_sequence_length_distribution.Rmd + readLines(opt$x08_sequence_length_distribution) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x08_sequence_length_distribution.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 09_sequence_duplication_levels.Rmd + readLines(opt$x09_sequence_duplication_levels) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x09_sequence_duplication_levels.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 10_adapter_content.Rmd + readLines(opt$x10_adapter_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x10_adapter_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + # 11_kmer_content.Rmd + readLines(opt$x11_kmer_content) %>% + (function(x) { + gsub('ECHO', opt$echo, x) + }) %>% + (function(x) { + gsub('REPORT_DIR', opt$report_dir, x) + }) %>% + (function(x) { + fileConn = file('x11_kmer_content.Rmd') + writeLines(x, con=fileConn) + close(fileConn) + }) + + ##=============STEP 5: render all .Rmd templates================= + ## + ##=========================================================== + extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { + f = readLines(fastqc_data) + start_line = grep(module_name, f) + end_module_lines = grep('END_MODULE', f) + end_line = end_module_lines[which(end_module_lines > start_line)[1]] + module_data = f[(start_line+1):(end_line-1)] + writeLines(module_data, 'temp.txt') + read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char) + } + render_site() + + ##=============STEP 6: manipulate outputs==================== + ## + ##=========================================================== + file.copy('my_site/index.html', opt$report_html, recursive = TRUE) + system(paste0('cp -r my_site/* ', opt$report_dir)) - -#------ 3. render all Rmd files with render_site() -------- -render_site() - - -#-------4. manipulate outputs ----------------------------- -# a. copy index.html to the report output path -# b. copy all files in 'my_site' to the report output directory -file.copy('my_site/index.html', opt$fastqc_site, recursive=TRUE) -paste0('cp -r my_site/* ', opt$fastqc_site_dir) %>% - system() - - + ##--------end of code rendering .Rmd templates---------------- +sink() +##=========== End of sinking output============================= \ No newline at end of file
