0
|
1 #!/usr/bin/env Rscript
|
|
2 library(R2HTML)
|
|
3 library(hwriter)
|
|
4 library(DT)
|
|
5 library(tools)
|
|
6
|
|
7 source("htmlheader.R")
|
|
8 source("config.R") # load TANDEM_RANKS
|
|
9 source("utils.R")
|
|
10 DT_OPTIONS = list(pageLength = 1000, lengthMenu = c(10, 50, 100, 1000, 5000, 10000))
|
|
11 HTMLHEADER_TAREAN = gsub("PAGE_TITLE","TAREAN summary", htmlheader)
|
|
12 HTMLHEADER_INDEX = gsub("PAGE_TITLE","Clustering summary", htmlheader)
|
|
13
|
|
14 WD = getwd() # to get script directory when run from Rserve
|
|
15
|
|
16 reformat_header = function(df){
|
|
17 H = colnames(df)
|
|
18 H[H=="TR_score"] = "TAREAN k-mer_coverage"
|
|
19 H[H=="vcount"] = "|V|"
|
|
20 H[H=="ecount"] = "|E|"
|
|
21 H[H=="Genome_Proportion[%]"] = "Proportion[%]"
|
|
22 H[H=="Proportion_Adjusted[%]"] = "Proportion adjusted[%]"
|
|
23 H[H=="supercluster"] = "Super_cluster"
|
|
24 H[H=="size_real"] = "Number of reads"
|
|
25 H[H=="TR_monomer_length"] = "Consensus_length"
|
|
26 H[H=="TR_consensus"] = "Consensus"
|
|
27 H[H=="pbs_score"] = "PBS score"
|
|
28 H[H=="ltr_detection"] = "LTR detection"
|
|
29 H[H=="kmer_analysis"] = "TAREAN k-mer analysis"
|
|
30
|
|
31 # H[H=="annotations_summary"] = "Similarity_hits"
|
|
32 H[H=="annotations_summary"] = "Similarity_hits_[above 0.1%]"
|
|
33 H[H=="annotations_summary_custom"] = "Similarity_hits_to_custom_database"
|
|
34 H[H=="loop_index"] = "connected_component_index C"
|
|
35 H[H=="pair_completeness"] = "pair_completeness_index_P"
|
|
36 H = gsub("_","<br>",H)
|
|
37 H=gsub("TR_","",H)
|
|
38 H = capitalize(H)
|
|
39 colnames(df) = H
|
|
40 return(df)
|
|
41 }
|
|
42
|
|
43 reformat4html=function(df){
|
|
44 for (n in colnames(df)){
|
|
45 if (class(df[,n]) == 'character'){
|
|
46 df[,n] = gsub("\n","<br>", df[,n])
|
|
47 }
|
|
48 if (class(df[,n]) == 'numeric'){
|
|
49 df[,n] = signif(df[,n],3)
|
|
50 }
|
|
51 }
|
|
52 return(df)
|
|
53 }
|
|
54
|
|
55 capitalize = function(s){
|
|
56 paste(toupper(substring(s, 1, 1)),
|
|
57 substring(s, 2),
|
|
58 sep="")
|
|
59 }
|
|
60
|
|
61
|
|
62 create_main_reports = function(paths, N_clustering, N_input,N_omit, merge_threshold,
|
|
63 paired, consensus_files, custom_db, tarean_mode,
|
|
64 HTML_LINKS, pipeline_version_info, max_memory,
|
|
65 max_number_reads_for_clustering, mincln){
|
|
66 ## this create main html index and also tarean report ##
|
|
67 ## index and tarean html reports are created always
|
|
68 ## extract all paths and directories
|
|
69 HTML_LINKS = nested2named_list(HTML_LINKS)
|
|
70 paths = nested2named_list(paths)
|
|
71 csvfile = paths[['clusters_info']]
|
|
72 clusters_summary_csv = paths[['clusters_summary_csv']]
|
|
73 profrep_classification_csv = paths[['profrep_classification_csv']]
|
|
74 htmlfile = paths[["tarean_report_html"]]
|
|
75 html_report_dt = paths[["cluster_report_html"]]
|
|
76 main_report = paths[["main_report_html"]]
|
|
77 summarized_annnotation_html = paths[["summarized_annotation_html"]]
|
|
78 libdir = paths[['libdir']]
|
|
79 clusters_dir = paths[["clusters__relative"]]
|
|
80 superclusters_dir = paths[['superclusters__relative']]
|
|
81 seqdb = paths[['sequences_db']]
|
|
82 hitsortdb = paths[['hitsort_db']]
|
|
83 connect_to_databases(seqdb, hitsortdb)
|
|
84 dfraw = read.table(csvfile, as.is=TRUE, header=TRUE, sep="\t", na.strings = c('None','NA'))
|
|
85 # table must be updated
|
|
86 dfraw$supercluster_best_hit = dbGetQuery(HITSORTDB, "SELECT supercluster_best_hit FROM cluster_info")[, 1]
|
|
87 ## columns to use
|
|
88 selected_cols = c("index", "size_real","size_adjusted", "vcount","ecount",
|
|
89 "loop_index", "pair_completeness",
|
|
90 'satellite_probability','satellite',
|
|
91 'TR_score','pbs_score','ltr_detection', 'TR_monomer_length',
|
|
92 'TR_consensus', "annotations_summary", "supercluster", 'tandem_rank',
|
|
93 'supercluster_best_hit')
|
|
94
|
|
95 ## some columns are added (like Graph_layout, clusters,...)
|
|
96 ## columns for html report
|
|
97 selected_cols_tarean = c(
|
|
98 "Cluster",
|
|
99 "Proportion[%]",
|
|
100 "Proportion_Adjusted[%]",
|
|
101 "size_real",
|
|
102 'satellite_probability',
|
|
103 'TR_monomer_length',
|
|
104 'TR_consensus',
|
|
105 'Graph_layout',
|
|
106 'kmer_analysis',
|
|
107 "loop_index",
|
|
108 "pair_completeness",
|
|
109 'TR_score',
|
|
110 "vcount",
|
|
111 "ecount",
|
|
112 'pbs_score',
|
|
113 "annotations_summary"
|
|
114 )
|
|
115 selected_cols_main = c(
|
|
116 "Cluster",
|
|
117 "supercluster",
|
|
118 "Proportion[%]",
|
|
119 "Proportion_Adjusted[%]",
|
|
120 "size_real",
|
|
121 'Graph_layout',
|
|
122 "annotations_summary",
|
|
123 'ltr_detection',
|
|
124 'satellite_probability',
|
|
125 'TAREAN_annotation',
|
|
126 'TR_monomer_length',
|
|
127 'TR_consensus',
|
|
128 'kmer_analysis',
|
|
129 "loop_index",
|
|
130 "pair_completeness",
|
|
131 'TR_score',
|
|
132 "ecount",
|
|
133 "vcount"
|
|
134 )
|
|
135
|
|
136 if (custom_db){
|
|
137 selected_cols_main = c(selected_cols_main, "annotations_summary_custom")
|
|
138 selected_cols_tarean = c(selected_cols_tarean, "annotations_summary_custom")
|
|
139 selected_cols = c(selected_cols, "annotations_summary_custom")
|
|
140 }
|
|
141 if (is_comparative()){
|
|
142 prefix_codes = dbGetQuery(SEQDB, "SELECT * FROM prefix_codes")
|
|
143 species_counts = dbGetQuery(HITSORTDB, "SELECT * FROM comparative_counts")
|
|
144 superclusters = dbGetQuery(HITSORTDB,paste(
|
|
145 "SELECT supercluster, cluster FROM superclusters WHERE cluster <=",
|
|
146 nrow(species_counts))
|
|
147 )
|
|
148 species_counts = merge(superclusters, species_counts, by.x = "cluster", by.y = "clusterindex")
|
|
149 ## include commented header with total counts:
|
|
150 cat("# Total counts:\t\t", paste(prefix_codes$N, collapse="\t"),"\n#\n",
|
|
151 sep="",
|
|
152 file = paths[['comparative_analysis_counts_csv']])
|
|
153
|
|
154 write.table(species_counts, file = paths[['comparative_analysis_counts_csv']],
|
|
155 sep = "\t", col.names = TRUE, row.names = FALSE, append=TRUE)
|
|
156 species_counts_formated = apply(
|
|
157 species_counts[, prefix_codes$prefix, drop = FALSE],
|
|
158 1, function(x) paste(prefix_codes$prefix, ":", x, "\n",sep='', collapse=""))
|
|
159 dfraw$species_counts = species_counts_formated[1:nrow(dfraw)]
|
|
160 selected_cols = c(selected_cols, "species_counts")
|
|
161 selected_cols_main = c(selected_cols_main, "species_counts")
|
|
162 }
|
|
163
|
|
164
|
|
165 df_report = dfraw[,selected_cols]
|
|
166 ## describe tandem ranks:
|
|
167 df_report$TAREAN_annotation = RANKS_TANDEM[as.character(df_report$tandem_rank)]
|
|
168 ## remove Cluster_similarity_hits
|
|
169 df_report_csv = reformat_df_report(df_report)
|
|
170 df_report_csv = df_report_csv[,!colnames(df_report_csv) %in% "Cluster_similarity_hits"]
|
|
171 df_report_csv$Final_annotation=""
|
|
172
|
|
173 ## make table for profrep classification
|
|
174 write.table(
|
|
175 reformat_df_to_profrep_classification(df_report), file = profrep_classification_csv,
|
|
176 sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
|
|
177
|
|
178 df_report$"kmer_analysis" = ifelse(dfraw$putative_tandem, hwrite("report", link = dfraw$html_tarean), "N/A")
|
|
179 df_report$"Graph_layout" = hwriteImage(dfraw$image_file_tmb, link = dfraw$image_file, table = FALSE)
|
|
180 df_report$Cluster = paste0("CL", df_report$index)
|
|
181 df_report$"Proportion[%]" = signif (100 * df_report$size_real / N_clustering, 2)
|
|
182 df_report$"Proportion_Adjusted[%]" = signif (100 * df_report$size_adjusted / N_clustering, 2)
|
|
183 if (!tarean_mode){
|
|
184
|
|
185 df_report$Cluster=sapply(
|
|
186 df_report$index,
|
|
187 function(x) hwrite(x, link = sprintf("%s/dir_CL%04d/index.html", clusters_dir, x)))
|
|
188
|
|
189 df_report$supercluster = sapply(
|
|
190 df_report$supercluster,
|
|
191 function(x) hwrite(x, link = sprintf("%s/dir_SC%04d/index.html", superclusters_dir, x)))
|
|
192 }
|
|
193 ## TAREAN report
|
|
194 ## copy tarean output data help to place nad make link to it
|
|
195 file.copy(paste0(WD,"/style1.css"), dirname(htmlfile))
|
|
196 file.copy(paste0(WD,"/documentation.html"), dirname(htmlfile))
|
|
197
|
|
198 tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN)
|
|
199 tarean_html("Tandem Repeat Analyzer", HTML.title, HR=1)
|
|
200 tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN)
|
|
201 tarean_html('Run statistics:', HTML.title, HR=2)
|
|
202 tarean_html(paste("Number of input reads:", N_input ))
|
|
203 tarean_html(paste("Number of analyzed reads:", N_clustering))
|
|
204 tarean_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes")))
|
|
205
|
|
206 ## export links to consensus sequecnes in fasta files
|
|
207 tarean_html("Consensus files - fasta format:", HTML.title, HR=2)
|
|
208 for (i in TANDEM_RANKS[TANDEM_RANKS != 0]){ ## no consensus for rank 0
|
|
209 if (!is.null (consensus_files[[i]])){
|
|
210 N = sum(dfraw$tandem_rank == TANDEM_RANKS[i])
|
|
211 name_string = paste(names(TANDEM_RANKS)[i]," - total ", N, "found" )
|
|
212 tarean_html(paste("<p>",
|
|
213 hwrite(name_string, download = name_string,
|
|
214 link = basename(consensus_files[[i]][[1]])),
|
|
215 "<br>\n"))
|
|
216
|
|
217 }
|
|
218 }
|
|
219 ## print link to documentation ##
|
|
220 tarean_html("Documentation", HTML.title, HR=2)
|
|
221 tarean_html(paste('<p> For the explanation of TAREAN output see',
|
|
222 ' <a href="documentation.html#tra" > the help section </a> <p>'))
|
|
223 ## HOW TO CITE section)
|
|
224
|
|
225 ## PRINT TABLES WITH CLUSTERS
|
|
226 for (n in names(TANDEM_RANKS)){
|
|
227 tarean_html(n, HTML.title, HR=2)
|
|
228 inc <- dfraw$tandem_rank == TANDEM_RANKS[n]
|
|
229 if (sum(inc > 0)){
|
|
230 tarean_html(reformat4html(
|
|
231 reformat_header(
|
|
232 df_report[inc, selected_cols_tarean ,drop=FALSE]
|
|
233 )
|
|
234 ),
|
|
235 align = "left", digits = 3)
|
|
236 }else{
|
|
237 tarean_html("not found")
|
|
238 }
|
|
239 }
|
|
240
|
|
241 ## export table with all cluster
|
|
242 cat("",file = html_report_dt)
|
|
243
|
|
244 DT_instance = df_report[,selected_cols_main, drop = FALSE] %>%
|
|
245 reformat_header %>% reformat4html %>% datatable(escape = FALSE, options = DT_OPTIONS) %>%
|
|
246 formatStyle(columns = seq_along(selected_cols), "font-size" = "12px") %>%
|
|
247 formatStyle(columns = "Similarity<br>hits<br>[above 0.1%]", "min-width" = "500px")
|
|
248
|
|
249 saveWidget(DT_instance, file = normalizePath(html_report_dt),
|
|
250 libdir=normalizePath(libdir) , selfcontained = FALSE)
|
|
251
|
|
252 add_preamble(normalizePath(html_report_dt),
|
|
253 preamble='<h2>Cluster annotation</h2> <p><a href="documentation.html#clust"> For table legend see documentation. <a> </p>')
|
|
254
|
|
255 ## Main page - Clustering info - global information about clustering
|
|
256 top_clusters_prop = sum(df_report$size_real)/N_clustering
|
|
257 clustering_info = summary_histogram(fn = paths[["summary_histogram"]], N_clustering, N_omit, df_report$size_adjusted,
|
|
258 top_clusters_prop)
|
|
259 index_html = start_html(main_report, HTMLHEADER_INDEX)
|
|
260 index_html("Clustering Summary", HTML.title, HR = 1)
|
|
261
|
|
262 index_html(paste0('<a href="',
|
|
263 paths[['summary_histogram__relative']],
|
|
264 '"> <img src="', paths[['summary_histogram__relative']],
|
|
265 '" width="700" border="1" >',
|
|
266 ' </a>'), cat)
|
|
267 index_html('<p> <b> Graphical summary of the clustering results. </b> Bars represent superclusters, with their heights and widths corresponding to the numbers of reads in the superclusters (y-axis) and to their proportions in all analyzed reads (x-axis), respectively. Rectangles inside the supercluster bars represent individual clusters. If the filtering of abundant satellites was performed, the affected clusters are shown in green, and their sizes correspond to the adjusted values. Blue and pink background panels show proportions of reads that were clustered and remained single, respectively. Top clusters are on the left of the dotted line. </p><hr><br><br>',cat)
|
|
268
|
|
269 index_html('Run information:', HTML.title, HR = 2)
|
|
270 index_html(paste("Number of input reads:", N_input ))
|
|
271 index_html(paste("Number of analyzed reads:", N_clustering))
|
|
272 if (N_omit != 0){
|
|
273 index_html(paste("Number of reads removed by automatic filtering of abundant putative satellites:", N_omit))
|
|
274 index_html(paste("Number of remaining reads after filtering of abundant satellites:", N_clustering - N_omit ))
|
|
275 }
|
|
276
|
|
277 index_html(
|
|
278 paste(
|
|
279 "Proportion of reads in top clusters :",
|
|
280 signif(100 * sum(df_report$size_real)/N_clustering,2),
|
|
281 "%"
|
|
282 ))
|
|
283 index_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes")))
|
|
284 index_html(paste("Paired-end reads:",ifelse(paired, "Yes", "No")))
|
|
285 index_html("Available analyses:", HTML.title, HR=2)
|
|
286 index_html(paste("<p>",hwrite("Tandem repeat analysis", link = HTML_LINKS$INDEX_TO_TAREAN),"</p>"),cat)
|
|
287
|
|
288 if (!tarean_mode){
|
|
289 index_html(paste("<p>", hwrite("Cluster annotation", link = HTML_LINKS$INDEX_TO_CLUSTER_REPORT),"</p>"),cat)
|
|
290 index_html(paste("<p>", hwrite("Supercluster annotation",
|
|
291 link = HTML_LINKS$INDEX_TO_SUPERCLUSTER_REPORT),"</p>"),cat)
|
|
292 index_html(paste("<p>", hwrite("Repeat annotation summary", link = HTML_LINKS$INDEX_TO_SUMMARIZED_ANNOTATION),"</p>"),cat)
|
|
293 }
|
|
294
|
|
295 if (is_comparative()) {
|
|
296 tryCatch({
|
|
297 imagemap = plot_rect_map(
|
|
298 read_counts = paths[['comparative_analysis_counts_csv']],
|
|
299 cluster_annotation = paths[['profrep_classification_csv']],
|
|
300 output_file = paths[['comparative_summary_map']]
|
|
301 )},
|
|
302 error = function(err){
|
|
303 print(paste("error while plotting ", err))
|
|
304 }
|
|
305 )
|
|
306
|
|
307 HTML.title("Comparative analysis - Total number of reads in clustering analysis", file = main_report)
|
|
308 index_html(df2html(
|
|
309 prefix_codes,
|
|
310 header = c("Code", "Total read count"), rounding_function = round),
|
|
311 cat
|
|
312 )
|
|
313 HTML.title("Comparative analysis - Number of reads in individual clusters", file = main_report)
|
|
314
|
|
315 index_html(paste0('<img src="', paths[['comparative_summary_map__relative']],
|
|
316 '" usemap ="#clustermap" border="2">'), cat)
|
|
317
|
|
318 index_html(
|
|
319 "Bar plot on top shows the size of individual clusters. Size of the rectangles in lower panel is proportional to the number of reads in a cluster for each species. Clusters and species were sorted using hierarchical clustering. Bars and rectangles in the plot are hyperlinked to the individual cluster reports.")
|
|
320 index_html(imagemap)
|
|
321 }
|
|
322
|
|
323 how2cite = readLines(paths[["how_to_cite"]])
|
|
324
|
|
325 index_html(how2cite, cat, sep="\n")
|
|
326 index_html("<br><hr>", cat)
|
|
327 index_html('Details:', HTML.title, HR = 3)
|
|
328 index_html(pipeline_version_info %>% preformatted, cat)
|
|
329 index_html(paste0("Minimal number of reads in cluster to be considered top cluster : ", mincln))
|
|
330 index_html(paste0("Reserved Memory : ", round(max_memory/(1024*1024)), "G"))
|
|
331 index_html(paste0("Maximum number of processable reads with the reserved memory : ", max_number_reads_for_clustering))
|
|
332
|
|
333
|
|
334 ## export to csv
|
|
335 clustering_info$Number_of_analyzed_reads = N_clustering
|
|
336 write.table(t(as.data.frame(clustering_info)),
|
|
337 file = clusters_summary_csv, sep="\t", col.names = FALSE)
|
|
338 cat("\n", file = clusters_summary_csv, append = TRUE)
|
|
339 write.table(
|
|
340 df_report_csv, file = clusters_summary_csv,
|
|
341 sep = "\t", col.names = TRUE, row.names = FALSE, quote = TRUE, append=TRUE)
|
|
342 }
|
|
343
|
|
344 dummy_function = function(){
|
|
345 print("dummy function")
|
|
346 }
|
|
347
|
|
348 reformat_df_report = function(df_report){
|
|
349 # for printing to csv - this should be consise
|
|
350 df_report$TR_consensus = gsub("(<pre>)|(</pre>)","",df_report$TR_consensus)
|
|
351 df_report$tandem_rank = NULL
|
|
352 ## make suitable order and rename
|
|
353 if ("annotations_summary_custom" %in% colnames(df_report)){
|
|
354 custom = "annotations_summary_custom"
|
|
355 }else{
|
|
356 custom=character()
|
|
357 }
|
|
358 df_out = df_report[,c('index',
|
|
359 'supercluster',
|
|
360 'size_real',
|
|
361 'size_adjusted',
|
|
362 'supercluster_best_hit',
|
|
363 'TAREAN_annotation',
|
|
364 'annotations_summary',
|
|
365 custom)
|
|
366 ]
|
|
367
|
|
368 colnames(df_out) = c('Cluster',
|
|
369 'Supercluster',
|
|
370 'Size',
|
|
371 'Size_adjusted',
|
|
372 'Automatic_annotation',
|
|
373 'TAREAN_annotation',
|
|
374 'Cluster_similarity_hits',
|
|
375 custom)
|
|
376 return(df_out)
|
|
377 }
|
|
378
|
|
379 reformat_df_to_profrep_classification = function(df_report){
|
|
380 CL = df_report$index
|
|
381 best_hit = df_report$supercluster_best_hit
|
|
382 ## format conversion(in order):
|
|
383 replacement = list(
|
|
384 c("/", "|"),
|
|
385 c("Ty1_copia", "Ty1/copia"),
|
|
386 c("Ty3_gypsy", "Ty3/gypsy"),
|
|
387 c("TatIV_Ogre", "TatIV/Ogre"),
|
|
388 c("Ogre_Tat", "Ogre/Tat"),
|
|
389 c("EnSpm_CACTA", "EnSpm/CACTA"),
|
|
390 c("MuDR_Mutator", "MuDR/Mutator"),
|
|
391 c("PIF_Harbinger", "PIF/Harbinger"),
|
|
392 c("Tc1/Mariner", "Tc1/Mariner"),
|
|
393 c("All|", "")
|
|
394 )
|
|
395 for (i in replacement){
|
|
396 best_hit = gsub(i[1], i[2], best_hit, fixed = TRUE)
|
|
397 }
|
|
398 best_hit = gsub("^All", "", best_hit, fixed = FALSE)
|
|
399 best_hit = ifelse(best_hit == "", paste0("unknown_CL", CL), best_hit)
|
|
400 output = data.frame(Cluster = CL, classification = best_hit, stringsAsFactors = FALSE)
|
|
401 return(output)
|
|
402 }
|