comparison lib/reporting.R @ 0:1d1b9e1b2e2f draft

Uploaded
author petr-novak
date Thu, 19 Dec 2019 10:24:45 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1d1b9e1b2e2f
1 #!/usr/bin/env Rscript
2 library(R2HTML)
3 library(hwriter)
4 library(DT)
5 library(tools)
6
7 source("htmlheader.R")
8 source("config.R") # load TANDEM_RANKS
9 source("utils.R")
10 DT_OPTIONS = list(pageLength = 1000, lengthMenu = c(10, 50, 100, 1000, 5000, 10000))
11 HTMLHEADER_TAREAN = gsub("PAGE_TITLE","TAREAN summary", htmlheader)
12 HTMLHEADER_INDEX = gsub("PAGE_TITLE","Clustering summary", htmlheader)
13
14 WD = getwd() # to get script directory when run from Rserve
15
16 reformat_header = function(df){
17 H = colnames(df)
18 H[H=="TR_score"] = "TAREAN k-mer_coverage"
19 H[H=="vcount"] = "|V|"
20 H[H=="ecount"] = "|E|"
21 H[H=="Genome_Proportion[%]"] = "Proportion[%]"
22 H[H=="Proportion_Adjusted[%]"] = "Proportion adjusted[%]"
23 H[H=="supercluster"] = "Super_cluster"
24 H[H=="size_real"] = "Number of reads"
25 H[H=="TR_monomer_length"] = "Consensus_length"
26 H[H=="TR_consensus"] = "Consensus"
27 H[H=="pbs_score"] = "PBS score"
28 H[H=="ltr_detection"] = "LTR detection"
29 H[H=="kmer_analysis"] = "TAREAN k-mer analysis"
30
31 # H[H=="annotations_summary"] = "Similarity_hits"
32 H[H=="annotations_summary"] = "Similarity_hits_[above 0.1%]"
33 H[H=="annotations_summary_custom"] = "Similarity_hits_to_custom_database"
34 H[H=="loop_index"] = "connected_component_index C"
35 H[H=="pair_completeness"] = "pair_completeness_index_P"
36 H = gsub("_","<br>",H)
37 H=gsub("TR_","",H)
38 H = capitalize(H)
39 colnames(df) = H
40 return(df)
41 }
42
43 reformat4html=function(df){
44 for (n in colnames(df)){
45 if (class(df[,n]) == 'character'){
46 df[,n] = gsub("\n","<br>", df[,n])
47 }
48 if (class(df[,n]) == 'numeric'){
49 df[,n] = signif(df[,n],3)
50 }
51 }
52 return(df)
53 }
54
55 capitalize = function(s){
56 paste(toupper(substring(s, 1, 1)),
57 substring(s, 2),
58 sep="")
59 }
60
61
62 create_main_reports = function(paths, N_clustering, N_input,N_omit, merge_threshold,
63 paired, consensus_files, custom_db, tarean_mode,
64 HTML_LINKS, pipeline_version_info, max_memory,
65 max_number_reads_for_clustering, mincln){
66 ## this create main html index and also tarean report ##
67 ## index and tarean html reports are created always
68 ## extract all paths and directories
69 HTML_LINKS = nested2named_list(HTML_LINKS)
70 paths = nested2named_list(paths)
71 csvfile = paths[['clusters_info']]
72 clusters_summary_csv = paths[['clusters_summary_csv']]
73 profrep_classification_csv = paths[['profrep_classification_csv']]
74 htmlfile = paths[["tarean_report_html"]]
75 html_report_dt = paths[["cluster_report_html"]]
76 main_report = paths[["main_report_html"]]
77 summarized_annnotation_html = paths[["summarized_annotation_html"]]
78 libdir = paths[['libdir']]
79 clusters_dir = paths[["clusters__relative"]]
80 superclusters_dir = paths[['superclusters__relative']]
81 seqdb = paths[['sequences_db']]
82 hitsortdb = paths[['hitsort_db']]
83 connect_to_databases(seqdb, hitsortdb)
84 dfraw = read.table(csvfile, as.is=TRUE, header=TRUE, sep="\t", na.strings = c('None','NA'))
85 # table must be updated
86 dfraw$supercluster_best_hit = dbGetQuery(HITSORTDB, "SELECT supercluster_best_hit FROM cluster_info")[, 1]
87 ## columns to use
88 selected_cols = c("index", "size_real","size_adjusted", "vcount","ecount",
89 "loop_index", "pair_completeness",
90 'satellite_probability','satellite',
91 'TR_score','pbs_score','ltr_detection', 'TR_monomer_length',
92 'TR_consensus', "annotations_summary", "supercluster", 'tandem_rank',
93 'supercluster_best_hit')
94
95 ## some columns are added (like Graph_layout, clusters,...)
96 ## columns for html report
97 selected_cols_tarean = c(
98 "Cluster",
99 "Proportion[%]",
100 "Proportion_Adjusted[%]",
101 "size_real",
102 'satellite_probability',
103 'TR_monomer_length',
104 'TR_consensus',
105 'Graph_layout',
106 'kmer_analysis',
107 "loop_index",
108 "pair_completeness",
109 'TR_score',
110 "vcount",
111 "ecount",
112 'pbs_score',
113 "annotations_summary"
114 )
115 selected_cols_main = c(
116 "Cluster",
117 "supercluster",
118 "Proportion[%]",
119 "Proportion_Adjusted[%]",
120 "size_real",
121 'Graph_layout',
122 "annotations_summary",
123 'ltr_detection',
124 'satellite_probability',
125 'TAREAN_annotation',
126 'TR_monomer_length',
127 'TR_consensus',
128 'kmer_analysis',
129 "loop_index",
130 "pair_completeness",
131 'TR_score',
132 "ecount",
133 "vcount"
134 )
135
136 if (custom_db){
137 selected_cols_main = c(selected_cols_main, "annotations_summary_custom")
138 selected_cols_tarean = c(selected_cols_tarean, "annotations_summary_custom")
139 selected_cols = c(selected_cols, "annotations_summary_custom")
140 }
141 if (is_comparative()){
142 prefix_codes = dbGetQuery(SEQDB, "SELECT * FROM prefix_codes")
143 species_counts = dbGetQuery(HITSORTDB, "SELECT * FROM comparative_counts")
144 superclusters = dbGetQuery(HITSORTDB,paste(
145 "SELECT supercluster, cluster FROM superclusters WHERE cluster <=",
146 nrow(species_counts))
147 )
148 species_counts = merge(superclusters, species_counts, by.x = "cluster", by.y = "clusterindex")
149 ## include commented header with total counts:
150 cat("# Total counts:\t\t", paste(prefix_codes$N, collapse="\t"),"\n#\n",
151 sep="",
152 file = paths[['comparative_analysis_counts_csv']])
153
154 write.table(species_counts, file = paths[['comparative_analysis_counts_csv']],
155 sep = "\t", col.names = TRUE, row.names = FALSE, append=TRUE)
156 species_counts_formated = apply(
157 species_counts[, prefix_codes$prefix, drop = FALSE],
158 1, function(x) paste(prefix_codes$prefix, ":", x, "\n",sep='', collapse=""))
159 dfraw$species_counts = species_counts_formated[1:nrow(dfraw)]
160 selected_cols = c(selected_cols, "species_counts")
161 selected_cols_main = c(selected_cols_main, "species_counts")
162 }
163
164
165 df_report = dfraw[,selected_cols]
166 ## describe tandem ranks:
167 df_report$TAREAN_annotation = RANKS_TANDEM[as.character(df_report$tandem_rank)]
168 ## remove Cluster_similarity_hits
169 df_report_csv = reformat_df_report(df_report)
170 df_report_csv = df_report_csv[,!colnames(df_report_csv) %in% "Cluster_similarity_hits"]
171 df_report_csv$Final_annotation=""
172
173 ## make table for profrep classification
174 write.table(
175 reformat_df_to_profrep_classification(df_report), file = profrep_classification_csv,
176 sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
177
178 df_report$"kmer_analysis" = ifelse(dfraw$putative_tandem, hwrite("report", link = dfraw$html_tarean), "N/A")
179 df_report$"Graph_layout" = hwriteImage(dfraw$image_file_tmb, link = dfraw$image_file, table = FALSE)
180 df_report$Cluster = paste0("CL", df_report$index)
181 df_report$"Proportion[%]" = signif (100 * df_report$size_real / N_clustering, 2)
182 df_report$"Proportion_Adjusted[%]" = signif (100 * df_report$size_adjusted / N_clustering, 2)
183 if (!tarean_mode){
184
185 df_report$Cluster=sapply(
186 df_report$index,
187 function(x) hwrite(x, link = sprintf("%s/dir_CL%04d/index.html", clusters_dir, x)))
188
189 df_report$supercluster = sapply(
190 df_report$supercluster,
191 function(x) hwrite(x, link = sprintf("%s/dir_SC%04d/index.html", superclusters_dir, x)))
192 }
193 ## TAREAN report
194 ## copy tarean output data help to place nad make link to it
195 file.copy(paste0(WD,"/style1.css"), dirname(htmlfile))
196 file.copy(paste0(WD,"/documentation.html"), dirname(htmlfile))
197
198 tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN)
199 tarean_html("Tandem Repeat Analyzer", HTML.title, HR=1)
200 tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN)
201 tarean_html('Run statistics:', HTML.title, HR=2)
202 tarean_html(paste("Number of input reads:", N_input ))
203 tarean_html(paste("Number of analyzed reads:", N_clustering))
204 tarean_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes")))
205
206 ## export links to consensus sequecnes in fasta files
207 tarean_html("Consensus files - fasta format:", HTML.title, HR=2)
208 for (i in TANDEM_RANKS[TANDEM_RANKS != 0]){ ## no consensus for rank 0
209 if (!is.null (consensus_files[[i]])){
210 N = sum(dfraw$tandem_rank == TANDEM_RANKS[i])
211 name_string = paste(names(TANDEM_RANKS)[i]," - total ", N, "found" )
212 tarean_html(paste("<p>",
213 hwrite(name_string, download = name_string,
214 link = basename(consensus_files[[i]][[1]])),
215 "<br>\n"))
216
217 }
218 }
219 ## print link to documentation ##
220 tarean_html("Documentation", HTML.title, HR=2)
221 tarean_html(paste('<p> For the explanation of TAREAN output see',
222 ' <a href="documentation.html#tra" > the help section </a> <p>'))
223 ## HOW TO CITE section)
224
225 ## PRINT TABLES WITH CLUSTERS
226 for (n in names(TANDEM_RANKS)){
227 tarean_html(n, HTML.title, HR=2)
228 inc <- dfraw$tandem_rank == TANDEM_RANKS[n]
229 if (sum(inc > 0)){
230 tarean_html(reformat4html(
231 reformat_header(
232 df_report[inc, selected_cols_tarean ,drop=FALSE]
233 )
234 ),
235 align = "left", digits = 3)
236 }else{
237 tarean_html("not found")
238 }
239 }
240
241 ## export table with all cluster
242 cat("",file = html_report_dt)
243
244 DT_instance = df_report[,selected_cols_main, drop = FALSE] %>%
245 reformat_header %>% reformat4html %>% datatable(escape = FALSE, options = DT_OPTIONS) %>%
246 formatStyle(columns = seq_along(selected_cols), "font-size" = "12px") %>%
247 formatStyle(columns = "Similarity<br>hits<br>[above 0.1%]", "min-width" = "500px")
248
249 saveWidget(DT_instance, file = normalizePath(html_report_dt),
250 libdir=normalizePath(libdir) , selfcontained = FALSE)
251
252 add_preamble(normalizePath(html_report_dt),
253 preamble='<h2>Cluster annotation</h2> <p><a href="documentation.html#clust"> For table legend see documentation. <a> </p>')
254
255 ## Main page - Clustering info - global information about clustering
256 top_clusters_prop = sum(df_report$size_real)/N_clustering
257 clustering_info = summary_histogram(fn = paths[["summary_histogram"]], N_clustering, N_omit, df_report$size_adjusted,
258 top_clusters_prop)
259 index_html = start_html(main_report, HTMLHEADER_INDEX)
260 index_html("Clustering Summary", HTML.title, HR = 1)
261
262 index_html(paste0('<a href="',
263 paths[['summary_histogram__relative']],
264 '"> <img src="', paths[['summary_histogram__relative']],
265 '" width="700" border="1" >',
266 ' </a>'), cat)
267 index_html('<p> <b> Graphical summary of the clustering results. </b> Bars represent superclusters, with their heights and widths corresponding to the numbers of reads in the superclusters (y-axis) and to their proportions in all analyzed reads (x-axis), respectively. Rectangles inside the supercluster bars represent individual clusters. If the filtering of abundant satellites was performed, the affected clusters are shown in green, and their sizes correspond to the adjusted values. Blue and pink background panels show proportions of reads that were clustered and remained single, respectively. Top clusters are on the left of the dotted line. </p><hr><br><br>',cat)
268
269 index_html('Run information:', HTML.title, HR = 2)
270 index_html(paste("Number of input reads:", N_input ))
271 index_html(paste("Number of analyzed reads:", N_clustering))
272 if (N_omit != 0){
273 index_html(paste("Number of reads removed by automatic filtering of abundant putative satellites:", N_omit))
274 index_html(paste("Number of remaining reads after filtering of abundant satellites:", N_clustering - N_omit ))
275 }
276
277 index_html(
278 paste(
279 "Proportion of reads in top clusters :",
280 signif(100 * sum(df_report$size_real)/N_clustering,2),
281 "%"
282 ))
283 index_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes")))
284 index_html(paste("Paired-end reads:",ifelse(paired, "Yes", "No")))
285 index_html("Available analyses:", HTML.title, HR=2)
286 index_html(paste("<p>",hwrite("Tandem repeat analysis", link = HTML_LINKS$INDEX_TO_TAREAN),"</p>"),cat)
287
288 if (!tarean_mode){
289 index_html(paste("<p>", hwrite("Cluster annotation", link = HTML_LINKS$INDEX_TO_CLUSTER_REPORT),"</p>"),cat)
290 index_html(paste("<p>", hwrite("Supercluster annotation",
291 link = HTML_LINKS$INDEX_TO_SUPERCLUSTER_REPORT),"</p>"),cat)
292 index_html(paste("<p>", hwrite("Repeat annotation summary", link = HTML_LINKS$INDEX_TO_SUMMARIZED_ANNOTATION),"</p>"),cat)
293 }
294
295 if (is_comparative()) {
296 tryCatch({
297 imagemap = plot_rect_map(
298 read_counts = paths[['comparative_analysis_counts_csv']],
299 cluster_annotation = paths[['profrep_classification_csv']],
300 output_file = paths[['comparative_summary_map']]
301 )},
302 error = function(err){
303 print(paste("error while plotting ", err))
304 }
305 )
306
307 HTML.title("Comparative analysis - Total number of reads in clustering analysis", file = main_report)
308 index_html(df2html(
309 prefix_codes,
310 header = c("Code", "Total read count"), rounding_function = round),
311 cat
312 )
313 HTML.title("Comparative analysis - Number of reads in individual clusters", file = main_report)
314
315 index_html(paste0('<img src="', paths[['comparative_summary_map__relative']],
316 '" usemap ="#clustermap" border="2">'), cat)
317
318 index_html(
319 "Bar plot on top shows the size of individual clusters. Size of the rectangles in lower panel is proportional to the number of reads in a cluster for each species. Clusters and species were sorted using hierarchical clustering. Bars and rectangles in the plot are hyperlinked to the individual cluster reports.")
320 index_html(imagemap)
321 }
322
323 how2cite = readLines(paths[["how_to_cite"]])
324
325 index_html(how2cite, cat, sep="\n")
326 index_html("<br><hr>", cat)
327 index_html('Details:', HTML.title, HR = 3)
328 index_html(pipeline_version_info %>% preformatted, cat)
329 index_html(paste0("Minimal number of reads in cluster to be considered top cluster : ", mincln))
330 index_html(paste0("Reserved Memory : ", round(max_memory/(1024*1024)), "G"))
331 index_html(paste0("Maximum number of processable reads with the reserved memory : ", max_number_reads_for_clustering))
332
333
334 ## export to csv
335 clustering_info$Number_of_analyzed_reads = N_clustering
336 write.table(t(as.data.frame(clustering_info)),
337 file = clusters_summary_csv, sep="\t", col.names = FALSE)
338 cat("\n", file = clusters_summary_csv, append = TRUE)
339 write.table(
340 df_report_csv, file = clusters_summary_csv,
341 sep = "\t", col.names = TRUE, row.names = FALSE, quote = TRUE, append=TRUE)
342 }
343
344 dummy_function = function(){
345 print("dummy function")
346 }
347
348 reformat_df_report = function(df_report){
349 # for printing to csv - this should be consise
350 df_report$TR_consensus = gsub("(<pre>)|(</pre>)","",df_report$TR_consensus)
351 df_report$tandem_rank = NULL
352 ## make suitable order and rename
353 if ("annotations_summary_custom" %in% colnames(df_report)){
354 custom = "annotations_summary_custom"
355 }else{
356 custom=character()
357 }
358 df_out = df_report[,c('index',
359 'supercluster',
360 'size_real',
361 'size_adjusted',
362 'supercluster_best_hit',
363 'TAREAN_annotation',
364 'annotations_summary',
365 custom)
366 ]
367
368 colnames(df_out) = c('Cluster',
369 'Supercluster',
370 'Size',
371 'Size_adjusted',
372 'Automatic_annotation',
373 'TAREAN_annotation',
374 'Cluster_similarity_hits',
375 custom)
376 return(df_out)
377 }
378
379 reformat_df_to_profrep_classification = function(df_report){
380 CL = df_report$index
381 best_hit = df_report$supercluster_best_hit
382 ## format conversion(in order):
383 replacement = list(
384 c("/", "|"),
385 c("Ty1_copia", "Ty1/copia"),
386 c("Ty3_gypsy", "Ty3/gypsy"),
387 c("TatIV_Ogre", "TatIV/Ogre"),
388 c("Ogre_Tat", "Ogre/Tat"),
389 c("EnSpm_CACTA", "EnSpm/CACTA"),
390 c("MuDR_Mutator", "MuDR/Mutator"),
391 c("PIF_Harbinger", "PIF/Harbinger"),
392 c("Tc1/Mariner", "Tc1/Mariner"),
393 c("All|", "")
394 )
395 for (i in replacement){
396 best_hit = gsub(i[1], i[2], best_hit, fixed = TRUE)
397 }
398 best_hit = gsub("^All", "", best_hit, fixed = FALSE)
399 best_hit = ifelse(best_hit == "", paste0("unknown_CL", CL), best_hit)
400 output = data.frame(Cluster = CL, classification = best_hit, stringsAsFactors = FALSE)
401 return(output)
402 }