Mercurial > repos > petr-novak > repeatrxplorer

#!/usr/bin/env Rscript
library(R2HTML)
library(hwriter)
library(DT)
library(tools)

source("htmlheader.R")
source("config.R")  # load TANDEM_RANKS
source("utils.R")
DT_OPTIONS = list(pageLength = 1000, lengthMenu = c(10, 50, 100, 1000, 5000, 10000))
HTMLHEADER_TAREAN = gsub("PAGE_TITLE","TAREAN summary", htmlheader)
HTMLHEADER_INDEX = gsub("PAGE_TITLE","Clustering summary", htmlheader)

WD = getwd()  # to get script directory when run from Rserve

reformat_header = function(df){
    H = colnames(df)
    H[H=="TR_score"] = "TAREAN k-mer_coverage"
    H[H=="vcount"] = "|V|"
    H[H=="ecount"] = "|E|"
    H[H=="Genome_Proportion[%]"] = "Proportion[%]"
    H[H=="Proportion_Adjusted[%]"] = "Proportion adjusted[%]"
    H[H=="supercluster"] = "Super_cluster"
    H[H=="size_real"] = "Number of reads"
    H[H=="TR_monomer_length"] = "Consensus_length"
    H[H=="TR_consensus"] = "Consensus"
    H[H=="pbs_score"] = "PBS score"
    H[H=="ltr_detection"] = "LTR detection"
    H[H=="kmer_analysis"] = "TAREAN k-mer analysis"

   # H[H=="annotations_summary"] = "Similarity_hits"
    H[H=="annotations_summary"] = "Similarity_hits_[above 0.1%]"
    H[H=="annotations_summary_custom"] = "Similarity_hits_to_custom_database"
    H[H=="loop_index"] = "connected_component_index C"
    H[H=="pair_completeness"] = "pair_completeness_index_P"
    H = gsub("_","<br>",H)
    H=gsub("TR_","",H)
    H = capitalize(H)
    colnames(df) = H
    return(df)
}

reformat4html=function(df){
    for (n in colnames(df)){
        if (class(df[,n]) == 'character'){
            df[,n] = gsub("\n","<br>", df[,n])
        }
        if (class(df[,n]) == 'numeric'){
            df[,n] = signif(df[,n],3)
        }
    }
    return(df)
}

capitalize = function(s){
    paste(toupper(substring(s, 1, 1)),
          substring(s, 2),
          sep="")
}


create_main_reports = function(paths, N_clustering, N_input,N_omit, merge_threshold,
                               paired, consensus_files, custom_db, tarean_mode,
                               HTML_LINKS, pipeline_version_info, max_memory,
                               max_number_reads_for_clustering, mincln){
    ## this create main html index and also tarean report ##
    ## index and tarean html reports are created always
    ## extract all paths and directories
    HTML_LINKS = nested2named_list(HTML_LINKS)
    paths = nested2named_list(paths)
    csvfile = paths[['clusters_info']]
    clusters_summary_csv = paths[['clusters_summary_csv']]
    profrep_classification_csv = paths[['profrep_classification_csv']]
    htmlfile = paths[["tarean_report_html"]]
    html_report_dt = paths[["cluster_report_html"]]
    main_report = paths[["main_report_html"]]
    summarized_annnotation_html = paths[["summarized_annotation_html"]]
    libdir = paths[['libdir']]
    clusters_dir = paths[["clusters__relative"]]
    superclusters_dir = paths[['superclusters__relative']]
    seqdb = paths[['sequences_db']]
    hitsortdb = paths[['hitsort_db']]
    connect_to_databases(seqdb, hitsortdb)
    dfraw = read.table(csvfile, as.is=TRUE, header=TRUE, sep="\t", na.strings = c('None','NA'))
    # table must be updated
    dfraw$supercluster_best_hit = dbGetQuery(HITSORTDB, "SELECT supercluster_best_hit FROM cluster_info")[, 1]
    ## columns to use
    selected_cols = c("index", "size_real","size_adjusted", "vcount","ecount",
                     "loop_index", "pair_completeness",
                    'satellite_probability','satellite',
                    'TR_score','pbs_score','ltr_detection', 'TR_monomer_length',
                    'TR_consensus', "annotations_summary", "supercluster", 'tandem_rank',
                    'supercluster_best_hit')

    ## some columns are added (like Graph_layout, clusters,...)
    ## columns for html report
    selected_cols_tarean = c(
        "Cluster",
        "Proportion[%]",
        "Proportion_Adjusted[%]",
        "size_real",
        'satellite_probability',
        'TR_monomer_length',
        'TR_consensus',
        'Graph_layout',
        'kmer_analysis',
        "loop_index",
        "pair_completeness",
        'TR_score',
        "vcount",
        "ecount",
        'pbs_score',
        "annotations_summary"
    )
    selected_cols_main = c(
        "Cluster",
        "supercluster",
        "Proportion[%]",
        "Proportion_Adjusted[%]",
        "size_real",
        'Graph_layout',
        "annotations_summary",
        'ltr_detection',
        'satellite_probability',
        'TAREAN_annotation',
        'TR_monomer_length',
        'TR_consensus',
        'kmer_analysis',
        "loop_index",
        "pair_completeness",
        'TR_score',
        "ecount",
        "vcount"
    )

    if (custom_db){
        selected_cols_main = c(selected_cols_main, "annotations_summary_custom")
        selected_cols_tarean = c(selected_cols_tarean, "annotations_summary_custom")
        selected_cols = c(selected_cols, "annotations_summary_custom")
    }
    if (is_comparative()){
        prefix_codes = dbGetQuery(SEQDB, "SELECT * FROM prefix_codes")
        species_counts = dbGetQuery(HITSORTDB, "SELECT * FROM comparative_counts")
        superclusters = dbGetQuery(HITSORTDB,paste(
                                                 "SELECT supercluster, cluster FROM superclusters WHERE cluster <=",
                                                 nrow(species_counts))
                                   )
        species_counts = merge(superclusters, species_counts, by.x = "cluster", by.y = "clusterindex")
        ## include commented header with total counts:
        cat("# Total counts:\t\t", paste(prefix_codes$N, collapse="\t"),"\n#\n",
            sep="",
            file = paths[['comparative_analysis_counts_csv']])

        write.table(species_counts, file = paths[['comparative_analysis_counts_csv']],
                    sep = "\t", col.names = TRUE, row.names = FALSE, append=TRUE)
        species_counts_formated = apply(
            species_counts[, prefix_codes$prefix, drop = FALSE],
            1, function(x) paste(prefix_codes$prefix, ":", x, "\n",sep='', collapse=""))
        dfraw$species_counts = species_counts_formated[1:nrow(dfraw)]
        selected_cols = c(selected_cols, "species_counts")
        selected_cols_main = c(selected_cols_main, "species_counts")
    }


    df_report = dfraw[,selected_cols]
    ## describe tandem ranks:
    df_report$TAREAN_annotation = RANKS_TANDEM[as.character(df_report$tandem_rank)]
    ## remove Cluster_similarity_hits
    df_report_csv = reformat_df_report(df_report)
    df_report_csv = df_report_csv[,!colnames(df_report_csv) %in% "Cluster_similarity_hits"]
    df_report_csv$Final_annotation=""

    ## make table for profrep classification
    write.table(
        reformat_df_to_profrep_classification(df_report), file = profrep_classification_csv,
        sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)

    df_report$"kmer_analysis" = ifelse(dfraw$putative_tandem, hwrite("report", link = dfraw$html_tarean), "N/A")
    df_report$"Graph_layout" = hwriteImage(dfraw$image_file_tmb, link = dfraw$image_file, table = FALSE)
    df_report$Cluster = paste0("CL", df_report$index)
    df_report$"Proportion[%]" = signif (100 * df_report$size_real / N_clustering, 2)
    df_report$"Proportion_Adjusted[%]" = signif (100 * df_report$size_adjusted / N_clustering, 2)
    if (!tarean_mode){

        df_report$Cluster=sapply(
            df_report$index,
            function(x) hwrite(x, link = sprintf("%s/dir_CL%04d/index.html", clusters_dir, x)))

        df_report$supercluster = sapply(
            df_report$supercluster,
            function(x) hwrite(x, link = sprintf("%s/dir_SC%04d/index.html", superclusters_dir, x)))
    }
    ## TAREAN report
    ## copy tarean output data help to place nad make link to it
    file.copy(paste0(WD,"/style1.css"), dirname(htmlfile))
    file.copy(paste0(WD,"/documentation.html"), dirname(htmlfile))

    tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN)
    tarean_html("Tandem Repeat Analyzer", HTML.title, HR=1)
    tarean_html = start_html(htmlfile, HTMLHEADER_TAREAN)
    tarean_html('Run statistics:', HTML.title, HR=2)
    tarean_html(paste("Number of input reads:", N_input ))
    tarean_html(paste("Number of analyzed reads:", N_clustering))
    tarean_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes")))

    ## export links to consensus sequecnes in fasta files
    tarean_html("Consensus files - fasta format:", HTML.title, HR=2)
    for (i in TANDEM_RANKS[TANDEM_RANKS != 0]){ ## no consensus for rank 0
        if (!is.null (consensus_files[[i]])){
            N = sum(dfraw$tandem_rank == TANDEM_RANKS[i])
            name_string = paste(names(TANDEM_RANKS)[i]," - total ", N, "found" )
            tarean_html(paste("<p>",
                  hwrite(name_string, download = name_string,
                         link = basename(consensus_files[[i]][[1]])),
                  "<br>\n"))

        }
    }
    ## print link to documentation ##
    tarean_html("Documentation", HTML.title, HR=2)
    tarean_html(paste('<p> For the explanation of TAREAN output see',
        ' <a href="documentation.html#tra" > the help section </a> <p>'))
    ## HOW TO CITE section)

    ## PRINT TABLES WITH CLUSTERS
    for (n in names(TANDEM_RANKS)){
      tarean_html(n, HTML.title, HR=2)
      inc <- dfraw$tandem_rank == TANDEM_RANKS[n]
      if (sum(inc > 0)){
          tarean_html(reformat4html(
              reformat_header(
                  df_report[inc, selected_cols_tarean ,drop=FALSE]
              )
          ),
          align = "left", digits = 3)
      }else{
        tarean_html("not found")
      }
    }

    ## export table with all cluster
    cat("",file = html_report_dt)

    DT_instance =  df_report[,selected_cols_main, drop = FALSE] %>%
        reformat_header %>% reformat4html %>% datatable(escape = FALSE, options = DT_OPTIONS) %>%
        formatStyle(columns = seq_along(selected_cols), "font-size" = "12px") %>%
        formatStyle(columns = "Similarity<br>hits<br>[above 0.1%]", "min-width" = "500px")

    saveWidget(DT_instance, file = normalizePath(html_report_dt),
               libdir=normalizePath(libdir) , selfcontained = FALSE)

    add_preamble(normalizePath(html_report_dt),
                 preamble='<h2>Cluster annotation</h2> <p><a href="documentation.html#clust"> For table legend see documentation. <a> </p>')

    ## Main page - Clustering info - global information about clustering
    top_clusters_prop = sum(df_report$size_real)/N_clustering
    clustering_info = summary_histogram(fn = paths[["summary_histogram"]], N_clustering, N_omit, df_report$size_adjusted,
                    top_clusters_prop)
    index_html = start_html(main_report, HTMLHEADER_INDEX)
    index_html("Clustering Summary", HTML.title, HR = 1)

    index_html(paste0('<a href="',
                      paths[['summary_histogram__relative']],
                      '"> <img src="', paths[['summary_histogram__relative']],
                      '" width="700" border="1" >',
                      ' </a>'), cat)
  index_html('<p> <b> Graphical summary of the clustering results. </b> Bars represent superclusters, with their heights and widths corresponding to the numbers of reads in the superclusters (y-axis) and to their proportions in all analyzed reads (x-axis), respectively. Rectangles inside the supercluster bars represent individual clusters. If the filtering of abundant satellites was performed, the affected clusters are shown in green, and their sizes correspond to the adjusted values. Blue and pink background panels show proportions of reads that were clustered and remained single, respectively. Top clusters are on the left of the dotted line. </p><hr><br><br>',cat)

    index_html('Run information:', HTML.title, HR = 2)
    index_html(paste("Number of input reads:", N_input ))
    index_html(paste("Number of analyzed reads:", N_clustering))
    if (N_omit != 0){
      index_html(paste("Number of reads removed by automatic filtering of abundant putative satellites:", N_omit))
      index_html(paste("Number of remaining reads after filtering of abundant satellites:", N_clustering - N_omit ))
    }

    index_html(
        paste(
            "Proportion of reads in top clusters :",
            signif(100 * sum(df_report$size_real)/N_clustering,2),
            "%"
        ))
    index_html(paste("Cluster merging:",ifelse(merge_threshold == 0,"No", "Yes")))
    index_html(paste("Paired-end reads:",ifelse(paired, "Yes", "No")))
    index_html("Available analyses:", HTML.title, HR=2)
    index_html(paste("<p>",hwrite("Tandem repeat analysis", link = HTML_LINKS$INDEX_TO_TAREAN),"</p>"),cat)

    if (!tarean_mode){
        index_html(paste("<p>", hwrite("Cluster annotation", link = HTML_LINKS$INDEX_TO_CLUSTER_REPORT),"</p>"),cat)
        index_html(paste("<p>", hwrite("Supercluster annotation",
                                       link = HTML_LINKS$INDEX_TO_SUPERCLUSTER_REPORT),"</p>"),cat)
        index_html(paste("<p>", hwrite("Repeat annotation summary", link = HTML_LINKS$INDEX_TO_SUMMARIZED_ANNOTATION),"</p>"),cat)
    }

  if (is_comparative()) {
    tryCatch({
      imagemap = plot_rect_map(
        read_counts = paths[['comparative_analysis_counts_csv']],
        cluster_annotation = paths[['profrep_classification_csv']],
        output_file = paths[['comparative_summary_map']]
      )},
      error = function(err){
        print(paste("error while plotting ", err))
      }
    )

    HTML.title("Comparative analysis - Total number of reads in clustering analysis", file = main_report)
    index_html(df2html(
      prefix_codes,
      header = c("Code", "Total read count"), rounding_function = round),
      cat
      )
    HTML.title("Comparative analysis - Number of reads in individual clusters", file = main_report)

    index_html(paste0('<img src="', paths[['comparative_summary_map__relative']],
                      '" usemap ="#clustermap" border="2">'), cat)

    index_html(
      "Bar plot on top shows the size of individual clusters. Size of the rectangles in lower panel is proportional to the number of reads in a cluster for each species. Clusters and species were sorted using hierarchical clustering. Bars and rectangles in the plot are hyperlinked to the individual cluster reports.")
    index_html(imagemap)
  }

  how2cite = readLines(paths[["how_to_cite"]])

  index_html(how2cite, cat, sep="\n")
  index_html("<br><hr>", cat)
  index_html('Details:', HTML.title, HR = 3)
  index_html(pipeline_version_info %>% preformatted, cat)
  index_html(paste0("Minimal number of reads in cluster to be considered top cluster : ", mincln))
  index_html(paste0("Reserved Memory : ", round(max_memory/(1024*1024)), "G"))
  index_html(paste0("Maximum number of processable reads with the reserved memory : ", max_number_reads_for_clustering))


  ## export to csv
  clustering_info$Number_of_analyzed_reads = N_clustering
  write.table(t(as.data.frame(clustering_info)),
              file = clusters_summary_csv, sep="\t", col.names = FALSE)
  cat("\n", file = clusters_summary_csv, append = TRUE)
  write.table(
    df_report_csv, file = clusters_summary_csv,
    sep = "\t", col.names = TRUE, row.names = FALSE, quote = TRUE, append=TRUE)
}

dummy_function = function(){
    print("dummy function")
}

reformat_df_report = function(df_report){
    # for printing to csv  - this should be consise
    df_report$TR_consensus = gsub("(<pre>)|(</pre>)","",df_report$TR_consensus)
    df_report$tandem_rank = NULL
    ## make suitable order and rename
    if ("annotations_summary_custom" %in% colnames(df_report)){
        custom = "annotations_summary_custom"
    }else{
        custom=character()
    }
    df_out = df_report[,c('index',
                          'supercluster',
                          'size_real',
                          'size_adjusted',
                          'supercluster_best_hit',
                          'TAREAN_annotation',
                          'annotations_summary',
                          custom)
                       ]

    colnames(df_out) = c('Cluster',
                         'Supercluster',
                         'Size',
                         'Size_adjusted',
                         'Automatic_annotation',
                         'TAREAN_annotation',
                         'Cluster_similarity_hits',
                         custom)
    return(df_out)
}

reformat_df_to_profrep_classification = function(df_report){
    CL = df_report$index
    best_hit = df_report$supercluster_best_hit
    ## format conversion(in order):
    replacement = list(
        c("/", "|"),
        c("Ty1_copia", "Ty1/copia"),
        c("Ty3_gypsy", "Ty3/gypsy"),
        c("TatIV_Ogre", "TatIV/Ogre"),
        c("Ogre_Tat", "Ogre/Tat"),
        c("EnSpm_CACTA", "EnSpm/CACTA"),
        c("MuDR_Mutator", "MuDR/Mutator"),
        c("PIF_Harbinger", "PIF/Harbinger"),
        c("Tc1/Mariner", "Tc1/Mariner"),
        c("All|", "")
    )
    for (i in replacement){
        best_hit = gsub(i[1], i[2], best_hit, fixed = TRUE)
    }
    best_hit = gsub("^All", "", best_hit, fixed = FALSE)
    best_hit = ifelse(best_hit == "", paste0("unknown_CL", CL), best_hit)
    output = data.frame(Cluster = CL, classification = best_hit, stringsAsFactors = FALSE)
    return(output)
}
author	petr-novak
date	Thu, 19 Dec 2019 10:24:45 -0500
parents
children