comparison signature_score.R @ 5:bd810b6d8a3b draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_signature_score commit ae36c63e1e5c0c285593d427f0ffd348a3d89e3f
author artbio
date Thu, 07 Nov 2024 22:43:58 +0000
parents 31da579595c5
children
comparison
equal deleted inserted replaced
4:31da579595c5 5:bd810b6d8a3b
1 # Signature score 1 # Signature score
2 # Compute the signature score based on the geometric mean of the target gene expression 2 # Compute the signature score based on the geometric mean of the target gene expression
3 # and split cells in 2 groups (high/low) using this signature score. 3 # and split cells in 2 groups (high/low) using this signature score.
4 4
5 options(show.error.messages = FALSE, 5 options(
6 error = function() { 6 show.error.messages = FALSE,
7 cat(geterrmessage(), file = stderr()) 7 error = function() {
8 q("no", 1, FALSE) 8 cat(geterrmessage(), file = stderr())
9 } 9 q("no", 1, FALSE)
10 }
10 ) 11 )
11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 12 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
12 warnings() 13 warnings()
13 14
14 library(optparse) 15 library(optparse)
16 library(ggplot2) 17 library(ggplot2)
17 library(gridExtra) 18 library(gridExtra)
18 19
19 # Arguments 20 # Arguments
20 option_list <- list( 21 option_list <- list(
21 make_option( 22 make_option(
22 "--input", 23 "--input",
23 default = NA, 24 default = NA,
24 type = "character", 25 type = "character",
25 help = "Input file that contains log2(CPM +1) values" 26 help = "Input file that contains log2(CPM +1) values"
26 ), 27 ),
27 make_option( 28 make_option(
28 "--sep", 29 "--sep",
29 default = "\t", 30 default = "\t",
30 type = "character", 31 type = "character",
31 help = "File separator [default : '%default' ]" 32 help = "File separator [default : '%default' ]"
32 ), 33 ),
33 make_option( 34 make_option(
34 "--colnames", 35 "--colnames",
35 default = TRUE, 36 default = TRUE,
36 type = "logical", 37 type = "logical",
37 help = "Consider first line as header ? [default : '%default' ]" 38 help = "Consider first line as header ? [default : '%default' ]"
38 ), 39 ),
39 make_option( 40 make_option(
40 "--genes", 41 "--genes",
41 default = NA, 42 default = NA,
42 type = "character", 43 type = "character",
43 help = "List of genes comma separated" 44 help = "List of genes comma separated"
44 ), 45 ),
45 make_option( 46 make_option(
46 "--percentile_threshold", 47 "--percentile_threshold",
47 default = 20, 48 default = 20,
48 type = "integer", 49 type = "integer",
49 help = "detection threshold to keep a gene in signature set [default : '%default' ]" 50 help = "detection threshold to keep a gene in signature set [default : '%default' ]"
50 ), 51 ),
51 make_option( 52 make_option(
52 "--output", 53 "--output",
53 default = "./output.tab", 54 default = "./output.tab",
54 type = "character", 55 type = "character",
55 help = "Output path [default : '%default' ]" 56 help = "Output path [default : '%default' ]"
56 ), 57 ),
57 make_option( 58 make_option(
58 "--stats", 59 "--stats",
59 default = "./statistics.tab", 60 default = "./statistics.tab",
60 type = "character", 61 type = "character",
61 help = "statistics path [default : '%default' ]" 62 help = "statistics path [default : '%default' ]"
62 ), 63 ),
63 make_option( 64 make_option(
64 "--correlations", 65 "--correlations",
65 default = "./correlations.tab", 66 default = "./correlations.tab",
66 type = "character", 67 type = "character",
67 help = "Correlations between signature genes [default : '%default' ]" 68 help = "Correlations between signature genes [default : '%default' ]"
68 ), 69 ),
69 make_option( 70 make_option(
70 "--covariances", 71 "--covariances",
71 default = "./statistics.tab", 72 default = "./statistics.tab",
72 type = "character", 73 type = "character",
73 help = "Covariances between signature genes [default : '%default' ]" 74 help = "Covariances between signature genes [default : '%default' ]"
74 ), 75 ),
75 make_option( 76 make_option(
76 "--pdf", 77 "--pdf",
77 default = "~/output.pdf", 78 default = "~/output.pdf",
78 type = "character", 79 type = "character",
79 help = "pdf path [default : '%default' ]" 80 help = "pdf path [default : '%default' ]"
80 ) 81 )
81 ) 82 )
82 83
83 opt <- parse_args(OptionParser(option_list = option_list), 84 opt <- parse_args(OptionParser(option_list = option_list),
84 args = commandArgs(trailingOnly = TRUE)) 85 args = commandArgs(trailingOnly = TRUE)
86 )
85 87
86 if (opt$sep == "tab") { 88 if (opt$sep == "tab") {
87 opt$sep <- "\t" 89 opt$sep <- "\t"
88 } 90 }
89 if (opt$sep == "comma") { 91 if (opt$sep == "comma") {
90 opt$sep <- "," 92 opt$sep <- ","
91 } 93 }
92 94
93 # Take input data 95 # Take input data
94 data.counts <- read.table( 96 data.counts <- read.table(
95 opt$input, 97 opt$input,
96 h = opt$colnames, 98 h = opt$colnames,
97 row.names = 1, 99 row.names = 1,
98 sep = opt$sep, 100 sep = opt$sep,
99 check.names = FALSE 101 check.names = FALSE
100 ) 102 )
101 103
102 # Get vector of target genes 104 # Get vector of target genes
103 genes <- unlist(strsplit(opt$genes, ",")) 105 genes <- unlist(strsplit(opt$genes, ","))
104 106
105 if (length(unique(genes %in% rownames(data.counts))) == 1) { 107 if (length(unique(genes %in% rownames(data.counts))) == 1) {
106 if (unique(genes %in% rownames(data.counts)) == FALSE) 108 if (unique(genes %in% rownames(data.counts)) == FALSE) {
107 stop("None of these genes are in your dataset: ", opt$genes) 109 stop("None of these genes are in your dataset: ", opt$genes)
110 }
108 } 111 }
109 112
110 logical_genes <- rownames(data.counts) %in% genes 113 logical_genes <- rownames(data.counts) %in% genes
111 114
112 # Retrieve target genes in counts data 115 # Retrieve target genes in counts data
114 117
115 # compute covariance 118 # compute covariance
116 signature.covariances <- as.data.frame(cov(t(signature.counts))) 119 signature.covariances <- as.data.frame(cov(t(signature.counts)))
117 signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) 120 signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances)
118 write.table(signature.covariances, 121 write.table(signature.covariances,
119 file = opt$covariances, 122 file = opt$covariances,
120 quote = FALSE, 123 quote = FALSE,
121 row.names = FALSE, 124 row.names = FALSE,
122 sep = "\t") 125 sep = "\t"
126 )
123 127
124 # compute signature.correlations 128 # compute signature.correlations
125 signature.correlations <- as.data.frame(cor(t(signature.counts))) 129 signature.correlations <- as.data.frame(cor(t(signature.counts)))
126 signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations) 130 signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations)
127 write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t") 131 write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t")
128 132
129 ## Descriptive Statistics Function 133 ## Descriptive Statistics Function
130 descriptive_stats <- function(InputData) { 134 descriptive_stats <- function(InputData) {
131 SummaryData <- data.frame( 135 SummaryData <- data.frame(
132 mean = rowMeans(InputData), 136 mean = rowMeans(InputData),
133 SD = apply(InputData, 1, sd), 137 SD = apply(InputData, 1, sd),
134 Variance = apply(InputData, 1, var), 138 Variance = apply(InputData, 1, var),
135 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { 139 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) {
136 (sum(x != 0) / ncol(y)) * 100 140 (sum(x != 0) / ncol(y)) * 100
137 }) 141 })
138 ) 142 )
139 return(SummaryData) 143 return(SummaryData)
140 } 144 }
141 145
142 signature_stats <- descriptive_stats(signature.counts) 146 signature_stats <- descriptive_stats(signature.counts)
143 147
144 # Find poorly detected genes from the signature 148 # Find poorly detected genes from the signature
145 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold 149 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold
146 150
147 # Add warnings 151 # Add warnings
148 if (length(unique(kept_genes)) > 1) { 152 if (length(unique(kept_genes)) > 1) {
149 cat( 153 cat(
150 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", 154 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :",
151 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), 155 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "),
152 "\n" 156 "\n"
153 ) 157 )
154 } else { 158 } else {
155 if (unique(kept_genes) == FALSE) { 159 if (unique(kept_genes) == FALSE) {
156 stop( 160 stop(
157 "None of these genes are detected in ", 161 "None of these genes are detected in ",
158 opt$percent, 162 opt$percent,
159 "% of your cells: ", 163 "% of your cells: ",
160 paste(rownames(signature_stats), collapse = ", "), 164 paste(rownames(signature_stats), collapse = ", "),
161 ". You can be less stringent thanks to --percent parameter." 165 ". You can be less stringent thanks to --percent parameter."
162 ) 166 )
163 } 167 }
164 } 168 }
165 169
166 # Remove genes poorly detected in the dataset 170 # Remove genes poorly detected in the dataset
167 signature.counts <- signature.counts[kept_genes, ] 171 signature.counts <- signature.counts[kept_genes, ]
168 172
171 175
172 # Geometric mean by cell 176 # Geometric mean by cell
173 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych 177 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych
174 178
175 # Add results in signature_output 179 # Add results in signature_output
176 signature_output <- data.frame(cell = names(score), 180 signature_output <- data.frame(
177 score = score, 181 cell = names(score),
178 rate = ifelse(score > mean(score), "HIGH", "LOW"), 182 score = score,
179 nGenes = colSums(data.counts != 0), 183 rate = ifelse(score > mean(score), "HIGH", "LOW"),
180 total_counts = colSums(data.counts)) 184 nGenes = colSums(data.counts != 0),
185 total_counts = colSums(data.counts)
186 )
181 187
182 # statistics of input genes, signature genes first lines 188 # statistics of input genes, signature genes first lines
183 statistics.counts <- rbind(subset(data.counts, logical_genes), 189 statistics.counts <- rbind(
184 subset(data.counts, !logical_genes)) 190 subset(data.counts, logical_genes),
191 subset(data.counts, !logical_genes)
192 )
185 statistics <- descriptive_stats(statistics.counts) 193 statistics <- descriptive_stats(statistics.counts)
186 statistics <- cbind(gene = rownames(statistics), statistics) 194 statistics <- cbind(gene = rownames(statistics), statistics)
187 195
188 196
189 197
190 # Re-arrange score matrix for plots 198 # Re-arrange score matrix for plots
191 score <- data.frame(score = score, 199 score <- data.frame(
192 order = rank(score, ties.method = "first"), 200 score = score,
193 signature = signature_output$rate, 201 order = rank(score, ties.method = "first"),
194 stringsAsFactors = FALSE) 202 signature = signature_output$rate,
203 stringsAsFactors = FALSE
204 )
195 205
196 pdf(file = opt$pdf) 206 pdf(file = opt$pdf)
197 myplot <- ggplot(signature_output, aes(x = rate, y = score)) + 207 myplot <- ggplot(signature_output, aes(x = rate, y = score)) +
198 geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + 208 geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) +
199 geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + 209 geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") +
200 scale_fill_manual(values = c("#ff0000", "#08661e")) + 210 scale_fill_manual(values = c("#ff0000", "#08661e")) +
201 geom_jitter(size = 0.2) + labs(y = "Score", x = "Rate") + 211 geom_jitter(size = 0.2) +
202 annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, 212 labs(y = "Score", x = "Rate") +
203 color = "black", label = mean(score$score), parse = TRUE) + 213 annotate("text",
204 labs(title = "Violin plots of Cell signature scores") 214 x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5,
215 color = "black", label = mean(score$score), parse = TRUE
216 ) +
217 labs(title = "Violin plots of Cell signature scores")
205 218
206 print(myplot) 219 print(myplot)
207 dev.off() 220 dev.off()
208 221
209 # Save file 222 # Save file
210 write.table( 223 write.table(
211 signature_output, 224 signature_output,
212 opt$output, 225 opt$output,
213 sep = "\t", 226 sep = "\t",
214 quote = FALSE, 227 quote = FALSE,
215 col.names = TRUE, 228 col.names = TRUE,
216 row.names = FALSE 229 row.names = FALSE
217 ) 230 )
218 231
219 write.table( 232 write.table(
220 statistics, 233 statistics,
221 opt$stats, 234 opt$stats,
222 sep = "\t", 235 sep = "\t",
223 quote = FALSE, 236 quote = FALSE,
224 col.names = TRUE, 237 col.names = TRUE,
225 row.names = FALSE 238 row.names = FALSE
226 ) 239 )