Mercurial > repos > artbio > gsc_signature_score
comparison signature_score.R @ 5:bd810b6d8a3b draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_signature_score commit ae36c63e1e5c0c285593d427f0ffd348a3d89e3f
author | artbio |
---|---|
date | Thu, 07 Nov 2024 22:43:58 +0000 |
parents | 31da579595c5 |
children |
comparison
equal
deleted
inserted
replaced
4:31da579595c5 | 5:bd810b6d8a3b |
---|---|
1 # Signature score | 1 # Signature score |
2 # Compute the signature score based on the geometric mean of the target gene expression | 2 # Compute the signature score based on the geometric mean of the target gene expression |
3 # and split cells in 2 groups (high/low) using this signature score. | 3 # and split cells in 2 groups (high/low) using this signature score. |
4 | 4 |
5 options(show.error.messages = FALSE, | 5 options( |
6 error = function() { | 6 show.error.messages = FALSE, |
7 cat(geterrmessage(), file = stderr()) | 7 error = function() { |
8 q("no", 1, FALSE) | 8 cat(geterrmessage(), file = stderr()) |
9 } | 9 q("no", 1, FALSE) |
10 } | |
10 ) | 11 ) |
11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 12 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
12 warnings() | 13 warnings() |
13 | 14 |
14 library(optparse) | 15 library(optparse) |
16 library(ggplot2) | 17 library(ggplot2) |
17 library(gridExtra) | 18 library(gridExtra) |
18 | 19 |
19 # Arguments | 20 # Arguments |
20 option_list <- list( | 21 option_list <- list( |
21 make_option( | 22 make_option( |
22 "--input", | 23 "--input", |
23 default = NA, | 24 default = NA, |
24 type = "character", | 25 type = "character", |
25 help = "Input file that contains log2(CPM +1) values" | 26 help = "Input file that contains log2(CPM +1) values" |
26 ), | 27 ), |
27 make_option( | 28 make_option( |
28 "--sep", | 29 "--sep", |
29 default = "\t", | 30 default = "\t", |
30 type = "character", | 31 type = "character", |
31 help = "File separator [default : '%default' ]" | 32 help = "File separator [default : '%default' ]" |
32 ), | 33 ), |
33 make_option( | 34 make_option( |
34 "--colnames", | 35 "--colnames", |
35 default = TRUE, | 36 default = TRUE, |
36 type = "logical", | 37 type = "logical", |
37 help = "Consider first line as header ? [default : '%default' ]" | 38 help = "Consider first line as header ? [default : '%default' ]" |
38 ), | 39 ), |
39 make_option( | 40 make_option( |
40 "--genes", | 41 "--genes", |
41 default = NA, | 42 default = NA, |
42 type = "character", | 43 type = "character", |
43 help = "List of genes comma separated" | 44 help = "List of genes comma separated" |
44 ), | 45 ), |
45 make_option( | 46 make_option( |
46 "--percentile_threshold", | 47 "--percentile_threshold", |
47 default = 20, | 48 default = 20, |
48 type = "integer", | 49 type = "integer", |
49 help = "detection threshold to keep a gene in signature set [default : '%default' ]" | 50 help = "detection threshold to keep a gene in signature set [default : '%default' ]" |
50 ), | 51 ), |
51 make_option( | 52 make_option( |
52 "--output", | 53 "--output", |
53 default = "./output.tab", | 54 default = "./output.tab", |
54 type = "character", | 55 type = "character", |
55 help = "Output path [default : '%default' ]" | 56 help = "Output path [default : '%default' ]" |
56 ), | 57 ), |
57 make_option( | 58 make_option( |
58 "--stats", | 59 "--stats", |
59 default = "./statistics.tab", | 60 default = "./statistics.tab", |
60 type = "character", | 61 type = "character", |
61 help = "statistics path [default : '%default' ]" | 62 help = "statistics path [default : '%default' ]" |
62 ), | 63 ), |
63 make_option( | 64 make_option( |
64 "--correlations", | 65 "--correlations", |
65 default = "./correlations.tab", | 66 default = "./correlations.tab", |
66 type = "character", | 67 type = "character", |
67 help = "Correlations between signature genes [default : '%default' ]" | 68 help = "Correlations between signature genes [default : '%default' ]" |
68 ), | 69 ), |
69 make_option( | 70 make_option( |
70 "--covariances", | 71 "--covariances", |
71 default = "./statistics.tab", | 72 default = "./statistics.tab", |
72 type = "character", | 73 type = "character", |
73 help = "Covariances between signature genes [default : '%default' ]" | 74 help = "Covariances between signature genes [default : '%default' ]" |
74 ), | 75 ), |
75 make_option( | 76 make_option( |
76 "--pdf", | 77 "--pdf", |
77 default = "~/output.pdf", | 78 default = "~/output.pdf", |
78 type = "character", | 79 type = "character", |
79 help = "pdf path [default : '%default' ]" | 80 help = "pdf path [default : '%default' ]" |
80 ) | 81 ) |
81 ) | 82 ) |
82 | 83 |
83 opt <- parse_args(OptionParser(option_list = option_list), | 84 opt <- parse_args(OptionParser(option_list = option_list), |
84 args = commandArgs(trailingOnly = TRUE)) | 85 args = commandArgs(trailingOnly = TRUE) |
86 ) | |
85 | 87 |
86 if (opt$sep == "tab") { | 88 if (opt$sep == "tab") { |
87 opt$sep <- "\t" | 89 opt$sep <- "\t" |
88 } | 90 } |
89 if (opt$sep == "comma") { | 91 if (opt$sep == "comma") { |
90 opt$sep <- "," | 92 opt$sep <- "," |
91 } | 93 } |
92 | 94 |
93 # Take input data | 95 # Take input data |
94 data.counts <- read.table( | 96 data.counts <- read.table( |
95 opt$input, | 97 opt$input, |
96 h = opt$colnames, | 98 h = opt$colnames, |
97 row.names = 1, | 99 row.names = 1, |
98 sep = opt$sep, | 100 sep = opt$sep, |
99 check.names = FALSE | 101 check.names = FALSE |
100 ) | 102 ) |
101 | 103 |
102 # Get vector of target genes | 104 # Get vector of target genes |
103 genes <- unlist(strsplit(opt$genes, ",")) | 105 genes <- unlist(strsplit(opt$genes, ",")) |
104 | 106 |
105 if (length(unique(genes %in% rownames(data.counts))) == 1) { | 107 if (length(unique(genes %in% rownames(data.counts))) == 1) { |
106 if (unique(genes %in% rownames(data.counts)) == FALSE) | 108 if (unique(genes %in% rownames(data.counts)) == FALSE) { |
107 stop("None of these genes are in your dataset: ", opt$genes) | 109 stop("None of these genes are in your dataset: ", opt$genes) |
110 } | |
108 } | 111 } |
109 | 112 |
110 logical_genes <- rownames(data.counts) %in% genes | 113 logical_genes <- rownames(data.counts) %in% genes |
111 | 114 |
112 # Retrieve target genes in counts data | 115 # Retrieve target genes in counts data |
114 | 117 |
115 # compute covariance | 118 # compute covariance |
116 signature.covariances <- as.data.frame(cov(t(signature.counts))) | 119 signature.covariances <- as.data.frame(cov(t(signature.counts))) |
117 signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) | 120 signature.covariances <- cbind(gene = rownames(signature.covariances), signature.covariances) |
118 write.table(signature.covariances, | 121 write.table(signature.covariances, |
119 file = opt$covariances, | 122 file = opt$covariances, |
120 quote = FALSE, | 123 quote = FALSE, |
121 row.names = FALSE, | 124 row.names = FALSE, |
122 sep = "\t") | 125 sep = "\t" |
126 ) | |
123 | 127 |
124 # compute signature.correlations | 128 # compute signature.correlations |
125 signature.correlations <- as.data.frame(cor(t(signature.counts))) | 129 signature.correlations <- as.data.frame(cor(t(signature.counts))) |
126 signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations) | 130 signature.correlations <- cbind(gene = rownames(signature.correlations), signature.correlations) |
127 write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t") | 131 write.table(signature.correlations, file = opt$correlations, quote = FALSE, row.names = FALSE, sep = "\t") |
128 | 132 |
129 ## Descriptive Statistics Function | 133 ## Descriptive Statistics Function |
130 descriptive_stats <- function(InputData) { | 134 descriptive_stats <- function(InputData) { |
131 SummaryData <- data.frame( | 135 SummaryData <- data.frame( |
132 mean = rowMeans(InputData), | 136 mean = rowMeans(InputData), |
133 SD = apply(InputData, 1, sd), | 137 SD = apply(InputData, 1, sd), |
134 Variance = apply(InputData, 1, var), | 138 Variance = apply(InputData, 1, var), |
135 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { | 139 Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { |
136 (sum(x != 0) / ncol(y)) * 100 | 140 (sum(x != 0) / ncol(y)) * 100 |
137 }) | 141 }) |
138 ) | 142 ) |
139 return(SummaryData) | 143 return(SummaryData) |
140 } | 144 } |
141 | 145 |
142 signature_stats <- descriptive_stats(signature.counts) | 146 signature_stats <- descriptive_stats(signature.counts) |
143 | 147 |
144 # Find poorly detected genes from the signature | 148 # Find poorly detected genes from the signature |
145 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold | 149 kept_genes <- signature_stats$Percentage_Detection >= opt$percentile_threshold |
146 | 150 |
147 # Add warnings | 151 # Add warnings |
148 if (length(unique(kept_genes)) > 1) { | 152 if (length(unique(kept_genes)) > 1) { |
149 cat( | 153 cat( |
150 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", | 154 "WARNINGS ! Following genes were removed from further analysis due to low gene expression :", |
151 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), | 155 paste(paste(rownames(signature.counts)[!kept_genes], round(signature_stats$Percentage_Detection[!kept_genes], 2), sep = " : "), collapse = ", "), |
152 "\n" | 156 "\n" |
153 ) | 157 ) |
154 } else { | 158 } else { |
155 if (unique(kept_genes) == FALSE) { | 159 if (unique(kept_genes) == FALSE) { |
156 stop( | 160 stop( |
157 "None of these genes are detected in ", | 161 "None of these genes are detected in ", |
158 opt$percent, | 162 opt$percent, |
159 "% of your cells: ", | 163 "% of your cells: ", |
160 paste(rownames(signature_stats), collapse = ", "), | 164 paste(rownames(signature_stats), collapse = ", "), |
161 ". You can be less stringent thanks to --percent parameter." | 165 ". You can be less stringent thanks to --percent parameter." |
162 ) | 166 ) |
163 } | 167 } |
164 } | 168 } |
165 | 169 |
166 # Remove genes poorly detected in the dataset | 170 # Remove genes poorly detected in the dataset |
167 signature.counts <- signature.counts[kept_genes, ] | 171 signature.counts <- signature.counts[kept_genes, ] |
168 | 172 |
171 | 175 |
172 # Geometric mean by cell | 176 # Geometric mean by cell |
173 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych | 177 score <- apply(signature.counts, 2, geometric.mean) # geometric.mean requires psych |
174 | 178 |
175 # Add results in signature_output | 179 # Add results in signature_output |
176 signature_output <- data.frame(cell = names(score), | 180 signature_output <- data.frame( |
177 score = score, | 181 cell = names(score), |
178 rate = ifelse(score > mean(score), "HIGH", "LOW"), | 182 score = score, |
179 nGenes = colSums(data.counts != 0), | 183 rate = ifelse(score > mean(score), "HIGH", "LOW"), |
180 total_counts = colSums(data.counts)) | 184 nGenes = colSums(data.counts != 0), |
185 total_counts = colSums(data.counts) | |
186 ) | |
181 | 187 |
182 # statistics of input genes, signature genes first lines | 188 # statistics of input genes, signature genes first lines |
183 statistics.counts <- rbind(subset(data.counts, logical_genes), | 189 statistics.counts <- rbind( |
184 subset(data.counts, !logical_genes)) | 190 subset(data.counts, logical_genes), |
191 subset(data.counts, !logical_genes) | |
192 ) | |
185 statistics <- descriptive_stats(statistics.counts) | 193 statistics <- descriptive_stats(statistics.counts) |
186 statistics <- cbind(gene = rownames(statistics), statistics) | 194 statistics <- cbind(gene = rownames(statistics), statistics) |
187 | 195 |
188 | 196 |
189 | 197 |
190 # Re-arrange score matrix for plots | 198 # Re-arrange score matrix for plots |
191 score <- data.frame(score = score, | 199 score <- data.frame( |
192 order = rank(score, ties.method = "first"), | 200 score = score, |
193 signature = signature_output$rate, | 201 order = rank(score, ties.method = "first"), |
194 stringsAsFactors = FALSE) | 202 signature = signature_output$rate, |
203 stringsAsFactors = FALSE | |
204 ) | |
195 | 205 |
196 pdf(file = opt$pdf) | 206 pdf(file = opt$pdf) |
197 myplot <- ggplot(signature_output, aes(x = rate, y = score)) + | 207 myplot <- ggplot(signature_output, aes(x = rate, y = score)) + |
198 geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + | 208 geom_violin(aes(fill = rate), alpha = .5, trim = FALSE, show.legend = FALSE, cex = 0.5) + |
199 geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + | 209 geom_abline(slope = 0, intercept = mean(score$score), lwd = 0.5, color = "red") + |
200 scale_fill_manual(values = c("#ff0000", "#08661e")) + | 210 scale_fill_manual(values = c("#ff0000", "#08661e")) + |
201 geom_jitter(size = 0.2) + labs(y = "Score", x = "Rate") + | 211 geom_jitter(size = 0.2) + |
202 annotate("text", x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, | 212 labs(y = "Score", x = "Rate") + |
203 color = "black", label = mean(score$score), parse = TRUE) + | 213 annotate("text", |
204 labs(title = "Violin plots of Cell signature scores") | 214 x = 0.55, y = mean(score$score), cex = 3, vjust = 1.5, |
215 color = "black", label = mean(score$score), parse = TRUE | |
216 ) + | |
217 labs(title = "Violin plots of Cell signature scores") | |
205 | 218 |
206 print(myplot) | 219 print(myplot) |
207 dev.off() | 220 dev.off() |
208 | 221 |
209 # Save file | 222 # Save file |
210 write.table( | 223 write.table( |
211 signature_output, | 224 signature_output, |
212 opt$output, | 225 opt$output, |
213 sep = "\t", | 226 sep = "\t", |
214 quote = FALSE, | 227 quote = FALSE, |
215 col.names = TRUE, | 228 col.names = TRUE, |
216 row.names = FALSE | 229 row.names = FALSE |
217 ) | 230 ) |
218 | 231 |
219 write.table( | 232 write.table( |
220 statistics, | 233 statistics, |
221 opt$stats, | 234 opt$stats, |
222 sep = "\t", | 235 sep = "\t", |
223 quote = FALSE, | 236 quote = FALSE, |
224 col.names = TRUE, | 237 col.names = TRUE, |
225 row.names = FALSE | 238 row.names = FALSE |
226 ) | 239 ) |