diff fastqc_site_render.R @ 11:507eec497730 draft

update fastqc site
author mingchen0919
date Tue, 07 Nov 2017 16:52:24 -0500
parents d820be692d74
children a6f8382f852c
line wrap: on
line diff
--- a/fastqc_site_render.R	Tue Aug 15 15:50:21 2017 -0400
+++ b/fastqc_site_render.R	Tue Nov 07 16:52:24 2017 -0500
@@ -1,195 +1,283 @@
-##======= Handle arguments from command line ========
-# setup R error handline to go to stderr
-options(show.error.messages=FALSE,
-        error=function(){
-          cat(geterrmessage(), file=stderr())
-          quit("no", 1, F)
-        })
-
-# we need that to not crash galaxy with an UTF8 error on German LC settings.
-loc = Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
-
-# suppress warning
-options(warn = -1)
-
-options(stringsAsFactors=FALSE, useFancyQuotes=FALSE)
-args = commandArgs(trailingOnly=TRUE)
-
-suppressPackageStartupMessages({
-  library(getopt)
-  library(tools)
-})
-
-# column 1: the long flag name
-# column 2: the short flag alias. A SINGLE character string
-# column 3: argument mask
-#           0: no argument
-#           1: argument required
-#           2: argument is optional
-# column 4: date type to which the flag's argument shall be cast.
-#           possible values: logical, integer, double, complex, character.
-spec_list=list()
-
-##------- 1. input data ---------------------
-spec_list$READS = c('reads', 'r', '1', 'character')
-spec_list$ECHO = c('echo', 'e', '1', 'character')
-
-##--------2. output report and report site directory --------------
-spec_list$FASTQC_SITE = c('fastqc_site', 'o', '1', 'character')
-spec_list$FASTQC_SITE_DIR = c('fastqc_site_dir', 'd', '1', 'character')
-
-##--------3. Rmd templates sitting in the tool directory ----------
-
-    ## _site.yml and index.Rmd files
-    spec_list$SITE_YML = c('site_yml', 's', 1, 'character')
-    spec_list$INDEX_Rmd = c('index_rmd', 'i', 1, 'character')
-    
-    ## other Rmd body template files
-    spec_list$x01 = c('x01_evaluation_overview', 'p', '1', 'character')
-    spec_list$x02 = c('x02_fastqc_original_reports', 'a', '1', 'character')
-    spec_list$x1 = c('x1_per_base_quality_scores', 'b', '1', 'character')
-    spec_list$x2 = c('x2_per_base_N_content', 'c', '1', 'character')
-    spec_list$x3 = c('x3_per_sequence_quality_scores', 'f', '1', 'character')
-    spec_list$x4 = c('x4_per_sequence_GC_content', 'g', '1', 'character')
-    spec_list$x5 = c('x5_per_base_sequence_content', 'h', '1', 'character')
-
-##------------------------------------------------------------------
-
-spec = t(as.data.frame(spec_list))
-opt = getopt(spec)
-# arguments are accessed by long flag name (the first column in the spec matrix)
-#                        NOT by element name in the spec_list
-# example: opt$help, opt$expression_file
-##====== End of arguments handling ==========
-
-#------ Load libraries ---------
+library(getopt)
 library(rmarkdown)
+library(htmltools)
 library(plyr)
+library(dplyr)
 library(stringr)
-library(dplyr)
 library(highcharter)
 library(DT)
 library(reshape2)
 library(plotly)
 library(formattable)
-library(htmltools)
-
+options(stringsAsFactors=FALSE, useFancyQuotes=FALSE)
 
-#----- 1. create the report directory ------------------------
-paste0('mkdir -p ', opt$fastqc_site_dir) %>%
-  system()
-
-#----- 2. generate Rmd files with Rmd templates --------------
-#   a. templates without placeholder variables:
-#         copy templates from tool directory to the working directory.
-#   b. templates with placeholder variables:
-#         substitute variables with user input values and place them in the working directory.
+##============ Sink warnings and errors to a file ==============
+## Everything between here and the closing sink() call at the end of the script runs with
+## output and messages diverted into 'warnings_and_errors.txt' in the working directory.
+zz = file('warnings_and_errors.txt')
+sink(zz)                    # divert regular output to the file
+sink(zz, type = 'message')  # divert the message stream (warnings, errors) as well
+  ##---------below is the code for rendering .Rmd templates-----
+  
+  ##=============STEP 1: handle command line arguments==========
+  ## Each args_list entry is one row of the getopt spec matrix; values are read as opt$<long flag name>.
+  ##============================================================
+  # column 1: the long flag name
+  # column 2: the short flag alias. A SINGLE character string
+  # column 3: argument mask
+  #           0: no argument
+  #           1: argument required
+  #           2: argument is optional
+  # column 4: data type to which the flag's argument shall be cast.
+  #           possible values: logical, integer, double, complex, character.
+  #-------------------------------------------------------------
+  #++++++++++++++++++++ Best practice ++++++++++++++++++++++++++
+  # 1. The short flag alias should match the flag used in the command section of the XML file.
+  # 2. The long flag name can be any legal R variable name.
+  # 3. Entry names may share a common substring, but no name may be contained in another one:
+  #    STEP 4 replaces these names with gsub(), so "ECHO" would also hit inside "ECHO_XXX".
+  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+  args_list=list()
+  ##------- 1. input data ---------------------
+  args_list$ECHO = c('echo', 'e', '1', 'character')
+  args_list$READS_1 = c('reads_1', 'r', '1', 'character')
+  args_list$NAME_1 = c('name_1', 'n', '1', 'character')
+  args_list$READS_2 = c('reads_2', 'R', '1', 'character')
+  args_list$NAME_2 = c('name_2', 'N', '1', 'character')
+  args_list$CONTAMINANTS = c('contaminants', 'c', '1', 'character')
+  args_list$LIMITS = c('limits', 'l', '1', 'character')
+  ##--------2. output report and outputs --------------
+  args_list$REPORT_HTML = c('report_html', 'o', '1', 'character')
+  args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character')
+  args_list$SINK_MESSAGE = c('sink_message', 's', '1', 'character')
+  ##--------3. .Rmd templates in the tool directory ----------
+  args_list$SITE_YML = c('site_yml', 'S', '1', 'character')
+  args_list$INDEX_RMD = c('index_rmd', 'I', '1', 'character')
+  args_list$X01_EVALUATION_OVERVIEW = c('x01_evaluation_overview', 'A', '1', 'character')
+  args_list$X02_PER_BASE_SEQUENCE_QUALITY = c('x02_per_base_sequence_quality', 'B', '1', 'character')
+  args_list$X03_PER_TILE_SEQUENCE_QUALITY = c('x03_per_tile_sequence_quality', 'C', '1', 'character')
+  args_list$X04_PER_SEQUENCE_QUALITY_SCORE = c('x04_per_sequence_quality_score', 'D', '1', 'character')
+  args_list$X05_PER_BASE_SEQUENCE_CONTENT = c('x05_per_base_sequence_content', 'E', '1', 'character')
+  args_list$X06_PER_SEQUENCE_GC_CONTENT = c('x06_per_sequence_gc_content', 'F', '1', 'character')
+  args_list$X07_PER_BASE_N_CONTENT = c('x07_per_base_n_content', 'G', '1', 'character')
+  args_list$X08_SEQUENCE_LENGTH_DISTRIBUTION = c('x08_sequence_length_distribution', 'H', '1', 'character')
+  args_list$X09_SEQUENCE_DUPLICATION_LEVELS = c('x09_sequence_duplication_levels', 'J', '1', 'character')
+  args_list$X10_ADAPTER_CONTENT = c('x10_adapter_content', 'K', '1', 'character')
+  args_list$X11_KMER_CONTENT = c('x11_kmer_content', 'L', '1', 'character')
+  ##--------- parse the command line against the spec ----------
+  opt = getopt(t(as.data.frame(args_list)))
 
 
-    #----- Copy index.Rmd and _site.yml files to job working direcotry -----
-    file.copy(opt$index_rmd, 'index.Rmd', recursive=TRUE)
-    file.copy(opt$site_yml, '_site.yml', recursive=TRUE)
-    #---------------------------------------------------------
-    
-    #----- 01_evaluation_overview.Rmd -----------------------
-    readLines(opt$x01_evaluation_overview) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        gsub('READS', opt$reads, x)
-      }) %>%
-      (function(x) {
-        gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('01_evaluation_overview.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-    
-    #----- 1_per_base_quality_scores.Rmd --------------------
-    readLines(opt$x1_per_base_quality_scores) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('1_per_base_quality_scores.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-    
-    #----- 2_per_base_N_content.Rmd -------------------------
-    readLines(opt$x2_per_base_N_content) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('2_per_base_N_content.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-    
-    #----- 3_per_sequence_quality_scores.Rmd ----------------
-    readLines(opt$x3_per_sequence_quality_scores) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('3_per_sequence_quality_scores.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-    
-    
-    #----- 4_per_sequence_GC_content.Rmd --------------------
-    readLines(opt$x4_per_sequence_GC_content) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('4_per_sequence_GC_content.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
-    
-    
-    #----- 5_per_base_sequence_content.Rmd ------------------
-    readLines(opt$x5_per_base_sequence_content) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('5_per_base_sequence_content.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
+  
+  ##=======STEP 2: create report directory (optional)==========
+  ## parents are created as needed and an existing directory is not a problem (same as `mkdir -p`)
+  ##===========================================================
+  dir.create(opt$report_dir, recursive = TRUE, showWarnings = FALSE)
+  
+  ##==STEP 3: copy index.Rmd and _site.yml to job working directory======
+  ## render_site() is called without arguments below, so both files must sit in the working directory
+  ##=====================================================================
+  file.copy(opt$index_rmd, 'index.Rmd')
+  file.copy(opt$site_yml, '_site.yml')
+  
+  ##=STEP 4: replace placeholders in .Rmd files with argument values=
+  ##
+  ##=================================================================
+  #++ need to replace placeholders with args values one by one+
+  
+  # x01_evaluation_overview.Rmd: substitute every placeholder literally, then write to the working dir
+  readLines(opt$x01_evaluation_overview) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('READS_1', opt$reads_1, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      gsub('NAME_1', opt$name_1, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      gsub('READS_2', opt$reads_2, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      gsub('NAME_2', opt$name_2, x, fixed = TRUE)  # was opt$name_1: second sample got the first one's name
+    }) %>%
+    (function(x) {
+      gsub('CONTAMINANTS', opt$contaminants, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      gsub('LIMITS', opt$limits, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x01_evaluation_overview.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x02_per_base_sequence_quality.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x02_per_base_sequence_quality) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x02_per_base_sequence_quality.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x03_per_tile_sequence_quality.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x03_per_tile_sequence_quality) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x03_per_tile_sequence_quality.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x04_per_sequence_quality_score.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x04_per_sequence_quality_score) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x04_per_sequence_quality_score.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x05_per_base_sequence_content.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x05_per_base_sequence_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x05_per_base_sequence_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x06_per_sequence_gc_content.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x06_per_sequence_gc_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x06_per_sequence_gc_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x07_per_base_n_content.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x07_per_base_n_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x07_per_base_n_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
 
-    #----- 02_fastqc_original_reports.Rmd -------------------
-    readLines(opt$x02_fastqc_original_reports) %>%
-      (function(x) {
-        gsub('ECHO', opt$echo, x)
-      }) %>%
-      (function(x) {
-        gsub('REPORT_OUTPUT_DIR', opt$fastqc_site_dir, x)
-      }) %>%
-      (function(x) {
-        fileConn = file('02_fastqc_original_reports.Rmd')
-        writeLines(x, con=fileConn)
-        close(fileConn)
-      })
+  # x08_sequence_length_distribution.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x08_sequence_length_distribution) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x08_sequence_length_distribution.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x09_sequence_duplication_levels.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x09_sequence_duplication_levels) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x09_sequence_duplication_levels.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x10_adapter_content.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x10_adapter_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x10_adapter_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  # x11_kmer_content.Rmd: fill ECHO and REPORT_DIR literally, write to the working dir
+  readLines(opt$x11_kmer_content) %>%
+    (function(x) {
+      gsub('ECHO', opt$echo, x, fixed = TRUE)  # fixed: backslashes in the value are kept as typed
+    }) %>%
+    (function(x) {
+      gsub('REPORT_DIR', opt$report_dir, x, fixed = TRUE)
+    }) %>%
+    (function(x) {
+      fileConn = file('x11_kmer_content.Rmd')
+      writeLines(x, con=fileConn)
+      close(fileConn)
+    })
+  
+  ##=============STEP 5: render all .Rmd templates=================
+  ##
+  ##===========================================================
+  extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") {
+    # Return the lines between the first match of `module_name` and the next END_MODULE as a data frame.
+    f = readLines(fastqc_data)
+    start_line = grep(module_name, f)[1]  # first match only, so the range below stays scalar
+    if (is.na(start_line)) stop('module not found in ', fastqc_data, ': ', module_name)
+    end_module_lines = grep('END_MODULE', f)
+    end_line = end_module_lines[which(end_module_lines > start_line)[1]]
+    read.csv(text = f[(start_line+1):(end_line-1)], sep = '\t', header = header, comment.char = comment.char)  # parse in memory; no scratch file
+  }
+  render_site()
+  
+  ##=============STEP 6: manipulate outputs====================
+  ## publish the landing page as the HTML output and the whole rendered site into the report directory
+  ##===========================================================
+  file.copy('my_site/index.html', opt$report_html, overwrite = TRUE)  # replace the target if it already exists
+  system(paste0('cp -r my_site/* ', shQuote(opt$report_dir)))  # quote the path: spaces/metacharacters are safe
 
 
-
-#------ 3. render all Rmd files with render_site() --------
-render_site()    
-
-
-#-------4. manipulate outputs -----------------------------
-#   a. copy index.html to the report output path
-#   b. copy all files in 'my_site' to the report output directory
-file.copy('my_site/index.html', opt$fastqc_site, recursive=TRUE)
-paste0('cp -r my_site/* ', opt$fastqc_site_dir) %>%
-  system()
-
-
+  ##--------end of code rendering .Rmd templates----------------
+sink(type = 'message'); sink()  # restore the message stream first, then end the output diversion
+##=========== End of sinking output=============================
\ No newline at end of file