Mercurial > repos > artbio > mutational_patterns
diff mutational_patterns.R @ 3:e332cf9dfa06 draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mutational_patterns commit 0f7593f703ba0dfd12aea1c0460e371f08b57d2f"
author | artbio |
---|---|
date | Thu, 24 Sep 2020 01:32:52 +0000 |
parents | aea952be68cb |
children | 7ba08c826888 |
line wrap: on
line diff
--- a/mutational_patterns.R Wed Sep 16 22:58:28 2020 +0000 +++ b/mutational_patterns.R Thu Sep 24 01:32:52 2020 +0000 @@ -29,6 +29,12 @@ help = "path to the tab separated file describing the levels in function of datasets" ), make_option( + "--cosmic_version", + default = "v2", + type = 'character', + help = "Version of the Cosmic Signature set to be used to express mutational patterns" + ), + make_option( "--signum", default = 2, type = 'integer', @@ -52,7 +58,6 @@ type = 'integer', help = "Number of new signatures to be captured" ), - make_option( "--output_spectrum", default = NA, @@ -66,11 +71,22 @@ help = "path to output dataset" ), make_option( + "--sigmatrix", + default = NA, + type = 'character', + help = "path to signature matrix" + ), + make_option( "--output_cosmic", default = NA, type = 'character', help = "path to output dataset" - ) + ), + make_option( + c("-r", "--rdata"), + type="character", + default=NULL, + help="Path to RData output file") ) opt = parse_args(OptionParser(option_list = option_list), @@ -81,28 +97,27 @@ parser <- newJSONParser() parser$addData(json_dict) fileslist <- parser$getObject() -vcf_files <- attr(fileslist, "names") -sample_names <- unname(unlist(fileslist)) +vcf_paths <- attr(fileslist, "names") +element_identifiers <- unname(unlist(fileslist)) ref_genome <- opt$genome -vcf_table <- data.frame(sample_name=sample_names, path=vcf_files) +vcf_table <- data.frame(element_identifier=element_identifiers, path=vcf_paths) library(MutationalPatterns) library(ref_genome, character.only = TRUE) +library(ggplot2) # Load the VCF files into a GRangesList: -vcfs <- read_vcfs_as_granges(vcf_files, sample_names, ref_genome) +vcfs <- read_vcfs_as_granges(vcf_paths, element_identifiers, ref_genome) if (!is.na(opt$levels)[1]) { # manage levels if there are - levels_table <- read.delim(opt$levels, header=FALSE, col.names=c("sample_name","level")) - library(dplyr) - metadata_table <- left_join(vcf_table, levels_table, by = "sample_name") -# detach("package:dplyr", unload=TRUE) - tissue <- metadata_table$level + levels_table <- read.delim(opt$levels, header=FALSE, col.names=c("element_identifier","level")) + library(plyr) + metadata_table <- join(vcf_table, levels_table, by = "element_identifier") + tissue <- as.vector(metadata_table$level) } ##### This is done for any section ###### mut_mat <- mut_matrix(vcf_list = vcfs, ref_genome = ref_genome) - ###### Section 1 Mutation characteristics and spectrums ############# if (!is.na(opt$output_spectrum)[1]) { pdf(opt$output_spectrum, paper = "special", width = 11.69, height = 11.69) @@ -114,34 +129,41 @@ p1 <- plot_spectrum(type_occurrences, CT = TRUE, legend=TRUE) plot(p1) } else { - p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE, legend=TRUE) # by sample + p2 <- plot_spectrum(type_occurrences, by = tissue, CT=TRUE) # by levels p3 <- plot_spectrum(type_occurrences, CT=TRUE, legend=TRUE) # total grid.arrange(p2, p3, ncol=2, widths=c(4,2.3), heights=c(4,1)) - } + } plot_96_profile(mut_mat, condensed = TRUE) dev.off() } ###### Section 2: De novo mutational signature extraction using NMF ####### if (!is.na(opt$output_denovo)[1]) { - mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix + # opt$rank cannot be higher than the number of samples + if (opt$rank > length(element_identifiers)) {opt$rank <-length(element_identifiers)} + # likewise, opt$signum cannot be higher thant the number of samples + if (opt$signum > length(element_identifiers)) {opt$signum <-length(element_identifiers)} + pseudo_mut_mat <- mut_mat + 0.0001 # First add a small pseudocount to the mutation count matrix # Use the NMF package to generate an estimate rank plot library("NMF") - estimate <- nmf(mut_mat, rank=1:opt$rank+1, method="brunet", nrun=opt$nrun, seed=123456) + estimate <- nmf(pseudo_mut_mat, rank=1:opt$rank, method="brunet", nrun=opt$nrun, seed=123456) # And plot it pdf(opt$output_denovo, paper = "special", width = 11.69, height = 11.69) - p.trans <- plot(estimate) - grid.arrange(p.trans) + p4 <- plot(estimate) + grid.arrange(p4) # Extract 4 (PARAMETIZE) mutational signatures from the mutation count matrix with extract_signatures # (For larger datasets it is wise to perform more iterations by changing the nrun parameter # to achieve stability and avoid local minima) - nmf_res <- extract_signatures(mut_mat, rank=opt$rank, nrun=opt$nrun) + nmf_res <- extract_signatures(pseudo_mut_mat, rank=opt$newsignum, nrun=opt$nrun) # Assign signature names - colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$rank) - rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$rank) + colnames(nmf_res$signatures) <- paste0("NewSig_", 1:opt$newsignum) + rownames(nmf_res$contribution) <- paste0("NewSig_", 1:opt$newsignum) # Plot the 96-profile of the signatures: - p.trans <- plot_96_profile(nmf_res$signatures, condensed = TRUE) - grid.arrange(p.trans) + p5 <- plot_96_profile(nmf_res$signatures, condensed = TRUE) + new_sig_matrix <- reshape2::dcast(p5$data, substitution + context ~ variable, value.var = "value") + new_sig_matrix = format(new_sig_matrix, scientific=TRUE) + write.table(new_sig_matrix, file=opt$sigmatrix, quote = FALSE, row.names = FALSE, sep="\t") + grid.arrange(p5) # Visualize the contribution of the signatures in a barplot pc1 <- plot_contribution(nmf_res$contribution, nmf_res$signature, mode="relative", coord_flip = TRUE) # Visualize the contribution of the signatures in absolute number of mutations @@ -155,14 +177,14 @@ # can be plotted in a user-specified order. # Plot signature contribution as a heatmap with sample clustering dendrogram and a specified signature order: pch1 <- plot_contribution_heatmap(nmf_res$contribution, - sig_order = paste0("NewSig_", 1:opt$rank)) + sig_order = paste0("NewSig_", 1:opt$newsignum)) # Plot signature contribution as a heatmap without sample clustering: pch2 <- plot_contribution_heatmap(nmf_res$contribution, cluster_samples=FALSE) #Combine the plots into one figure: grid.arrange(pch1, pch2, ncol = 2, widths = c(2,1.6)) # Compare the reconstructed mutational profile with the original mutational profile: - plot_compare_profiles(mut_mat[,1], + plot_compare_profiles(pseudo_mut_mat[,1], nmf_res$reconstructed[,1], profile_names = c("Original", "Reconstructed"), condensed = TRUE) @@ -172,18 +194,38 @@ if (!is.na(opt$output_cosmic)[1]) { pdf(opt$output_cosmic, paper = "special", width = 11.69, height = 11.69) - sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "") - cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) - mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix - new_order = match(row.names(mut_mat), cancer_signatures$Somatic.Mutation.Type) - cancer_signatures = cancer_signatures[as.vector(new_order),] - row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type - cancer_signatures = as.matrix(cancer_signatures[,4:33]) + pseudo_mut_mat <- mut_mat + 0.0001 # First add a small psuedocount to the mutation count matrix + if (opt$cosmic_version == "v2") { + sp_url <- paste("https://cancer.sanger.ac.uk/cancergenome/assets/", "signatures_probabilities.txt", sep = "") + cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) + new_order = match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type) + cancer_signatures = cancer_signatures[as.vector(new_order),] + row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type + cancer_signatures = as.matrix(cancer_signatures[,4:33]) + colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels + cosmic_tag <- "Signatures (Cosmic v2, March 2015)" + } else { + sp_url <- "https://raw.githubusercontent.com/ARTbio/startbio/master/sigProfiler_SBS_signatures_2019_05_22.tsv" + cancer_signatures = read.table(sp_url, sep = "\t", header = TRUE) + new_order = match(row.names(pseudo_mut_mat), cancer_signatures$Somatic.Mutation.Type) + cancer_signatures = cancer_signatures[as.vector(new_order),] + row.names(cancer_signatures) = cancer_signatures$Somatic.Mutation.Type + cancer_signatures = as.matrix(cancer_signatures[,4:70]) + colnames(cancer_signatures) <- gsub("SBS", "", colnames(cancer_signatures)) # shorten signature labels + cosmic_tag <- "Signatures (Cosmic v3, May 2019)" + } # Plot mutational profiles of the COSMIC signatures - colnames(cancer_signatures) <- gsub("Signature.", "", colnames(cancer_signatures)) # shorten signature labels - p.trans <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3) - grid.arrange(p.trans, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3))) + if (opt$cosmic_version == "v2") { + p6 <- plot_96_profile(cancer_signatures, condensed = TRUE, ymax = 0.3) + grid.arrange(p6, top = textGrob("COSMIC signature profiles",gp=gpar(fontsize=12,font=3))) + } else { + print(length(cancer_signatures)) + p6 <- plot_96_profile(cancer_signatures[,1:33], condensed = TRUE, ymax = 0.3) + p6bis <- plot_96_profile(cancer_signatures[,34:67], condensed = TRUE, ymax = 0.3) + grid.arrange(p6, top = textGrob("COSMIC signature profiles (on two pages)",gp=gpar(fontsize=12,font=3))) + grid.arrange(p6bis, top = textGrob("COSMIC signature profiles (continued)",gp=gpar(fontsize=12,font=3))) + } # Hierarchically cluster the COSMIC signatures based on their similarity with average linkage # hclust_cosmic = cluster_signatures(cancer_signatures, method = "average") @@ -211,16 +253,37 @@ # grid.arrange(p.trans) # Find optimal contribution of COSMIC signatures to reconstruct 96 mutational profiles - fit_res <- fit_to_signatures(mut_mat, cancer_signatures) + fit_res <- fit_to_signatures(pseudo_mut_mat, cancer_signatures) # Select signatures with some contribution (above a threshold) threshold <- tail(sort(unlist(rowSums(fit_res$contribution), use.names = FALSE)), opt$signum)[1] select <- which(rowSums(fit_res$contribution) >= threshold) # ensure opt$signum best signatures in samples are retained, the others discarded # Plot contribution barplots - pc1 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute") - pc2 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative") + pc3 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "absolute") + pc4 <- plot_contribution(fit_res$contribution[select,], cancer_signatures[,select], coord_flip = T, mode = "relative") + ##### + # ggplot2 alternative + if (!is.na(opt$levels)[1]) { # if there are levels to display in graphs + pc3_data <- pc3$data + pc3_data <- merge (pc3_data, metadata_table[,c(1,3)], by.x="Sample", by.y="element_identifier") + pc3 <- ggplot(pc3_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) + + geom_bar(stat="identity", position='stack') + + scale_fill_discrete(name="Cosmic\nSignature") + + labs(x = "Samples", y = "Absolute contribution") + theme_bw() + + theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank()) + + facet_grid(~level, scales = "free_x") + pc4_data <- pc4$data + pc4_data <- merge (pc4_data, metadata_table[,c(1,3)], by.x="Sample", by.y="element_identifier") + pc4 <- ggplot(pc4_data, aes(x=Sample, y=Contribution, fill=as.factor(Signature))) + + geom_bar(stat="identity", position='fill') + + scale_fill_discrete(name="Cosmic\nSignature") + + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + + labs(x = "Samples", y = "Relative contribution") + theme_bw() + + theme(panel.grid.minor.x = element_blank(), panel.grid.major.x = element_blank()) + + facet_grid(~level, scales = "free_x") + } # Combine the two plots: - grid.arrange(pc1, pc2) + grid.arrange(pc3, pc4, top = textGrob("Absolute and Relative Contributions of Cosmic signatures to mutational patterns",gp=gpar(fontsize=12,font=3))) ## pie charts of comic signatures contributions in samples sig_data_pie <- as.data.frame(t(head(fit_res$contribution[select,]))) @@ -231,20 +294,19 @@ melted_sig_data_pie_percents <-melt(data=sig_data_pie_percents) melted_sig_data_pie_percents$label <- sub("Sig.", "", melted_sig_data_pie_percents$variable) melted_sig_data_pie_percents$pos <- cumsum(melted_sig_data_pie_percents$value) - melted_sig_data_pie_percents$value/2 - library(ggplot2) - p.trans <- ggplot(melted_sig_data_pie_percents, aes(x="", y=value, group=variable, fill=variable)) + + p7 <- ggplot(melted_sig_data_pie_percents, aes(x="", y=value, group=variable, fill=variable)) + geom_bar(width = 1, stat = "identity") + geom_text(aes(label = label), position = position_stack(vjust = 0.5), color="black", size=3) + coord_polar("y", start=0) + facet_wrap(~ sample) + - labs(x="", y="Samples", fill = "Signatures (Cosmic_v2,March 2015)") + + labs(x="", y="Samples", fill = cosmic_tag) + theme(axis.text = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank()) - grid.arrange(p.trans) + grid.arrange(p7) # Plot relative contribution of the cancer signatures in each sample as a heatmap with sample clustering - p.trans <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete") - grid.arrange(p.trans) + p8 <- plot_contribution_heatmap(fit_res$contribution, cluster_samples = TRUE, method = "complete") + grid.arrange(p8) # Compare the reconstructed mutational profile of sample 1 with its original mutational profile # plot_compare_profiles(mut_mat[,1], fit_res$reconstructed[,1], @@ -255,7 +317,7 @@ # `cos_sim_matrix` # calculate all pairwise cosine similarities - cos_sim_ori_rec <- cos_sim_matrix(mut_mat, fit_res$reconstructed) + cos_sim_ori_rec <- cos_sim_matrix(pseudo_mut_mat, fit_res$reconstructed) # extract cosine similarities per sample between original and reconstructed cos_sim_ori_rec <- as.data.frame(diag(cos_sim_ori_rec)) @@ -270,7 +332,7 @@ colnames(cos_sim_ori_rec) = "cos_sim" cos_sim_ori_rec$sample = row.names(cos_sim_ori_rec) # Make barplot - p.trans <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) + + p9 <- ggplot(cos_sim_ori_rec, aes(y=cos_sim, x=sample)) + geom_bar(stat="identity", fill = "skyblue4") + coord_cartesian(ylim=c(0.8, 1)) + # coord_flip(ylim=c(0.8,1)) + @@ -283,7 +345,12 @@ panel.grid.major.y=element_blank()) + # Add cut.off line geom_hline(aes(yintercept=.95)) - grid.arrange(p.trans, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3))) - + grid.arrange(p9, top = textGrob("Similarity between true and reconstructed profiles (with all Cosmic sig.)",gp=gpar(fontsize=12,font=3))) dev.off() } + + +# Output RData file +if (!is.null(opt$rdata)) { + save.image(file=opt$rdata) +}