Mercurial > repos > artbio > cpm_tpm_rpk
comparison cpm_tpm_rpk.R @ 7:f00c1c34565e draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/cpm_tpm_rpk commit 8cca5f05212cd25113955fb9c11801f4047c12b4
author | artbio |
---|---|
date | Wed, 03 Jul 2024 14:39:26 +0000 |
parents | bcff1eb6fdb5 |
children |
comparison
equal
deleted
inserted
replaced
6:d2575928b824 | 7:f00c1c34565e |
---|---|
1 if (length(commandArgs(TRUE)) == 0) { | 1 if (length(commandArgs(TRUE)) == 0) { |
2 system("Rscript cpm_tpm_rpk.R -h", intern = FALSE) | 2 system("Rscript cpm_tpm_rpk.R -h", intern = FALSE) |
3 q("no") | 3 q("no") |
4 } | 4 } |
5 | 5 |
6 | 6 |
7 # load packages that are provided in the conda env | 7 # load packages that are provided in the conda env |
8 options(show.error.messages = FALSE, | 8 options(show.error.messages = FALSE, |
9 error = function() { | 9 error = function() { |
10 cat(geterrmessage(), file = stderr()) | 10 cat(geterrmessage(), file = stderr()) |
11 q("no", 1, FALSE) | 11 q("no", 1, FALSE) |
12 } | 12 } |
13 ) | 13 ) |
14 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint | 14 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint |
15 warnings() | 15 warnings() |
16 library(optparse) | 16 library(optparse) |
17 library(ggplot2) | 17 library(ggplot2) |
18 library(reshape2) | 18 library(reshape2) |
19 library(Rtsne) # nolint | 19 library(Rtsne) # nolint |
20 library(ggfortify) | 20 library(ggfortify) |
21 | 21 |
22 | 22 |
23 | |
24 #Arguments | 23 #Arguments |
25 option_list <- list( | 24 option_list <- list( |
26 make_option( | 25 make_option( |
27 c("-d", "--data"), | 26 c("-d", "--data"), |
28 default = NA, | 27 default = NA, |
29 type = "character", | 28 type = "character", |
30 help = "Input file that contains count values to transform" | 29 help = "Input file that contains count values to transform" |
31 ), | 30 ), |
32 make_option( | 31 make_option( |
33 c("-t", "--type"), | 32 c("-t", "--type"), |
34 default = "cpm", | 33 default = "cpm", |
35 type = "character", | 34 type = "character", |
36 help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]" | 35 help = "Transformation type, either cpm, tpm, rpkm or none[default : '%default' ]" |
37 ), | 36 ), |
38 make_option( | 37 make_option( |
39 c("-s", "--sep"), | 38 c("-s", "--sep"), |
40 default = "\t", | 39 default = "\t", |
41 type = "character", | 40 type = "character", |
42 help = "File separator [default : '%default' ]" | 41 help = "File separator [default : '%default' ]" |
43 ), | 42 ), |
44 make_option( | 43 make_option( |
45 c("-c", "--colnames"), | 44 c("-c", "--colnames"), |
46 default = TRUE, | 45 default = TRUE, |
47 type = "logical", | 46 type = "logical", |
48 help = "Consider first line as header ? [default : '%default' ]" | 47 help = "Consider first line as header ? [default : '%default' ]" |
49 ), | 48 ), |
50 make_option( | 49 make_option( |
51 c("-f", "--gene"), | 50 c("-f", "--gene"), |
52 default = NA, | 51 default = NA, |
53 type = "character", | 52 type = "character", |
54 help = "Two column of gene length file" | 53 help = "Two column of gene length file" |
55 ), | 54 ), |
56 make_option( | 55 make_option( |
57 c("-a", "--gene_sep"), | 56 c("-a", "--gene_sep"), |
58 default = "\t", | 57 default = "\t", |
59 type = "character", | 58 type = "character", |
60 help = "Gene length file separator [default : '%default' ]" | 59 help = "Gene length file separator [default : '%default' ]" |
61 ), | 60 ), |
62 make_option( | 61 make_option( |
63 c("-b", "--gene_header"), | 62 c("-b", "--gene_header"), |
64 default = TRUE, | 63 default = TRUE, |
65 type = "logical", | 64 type = "logical", |
66 help = "Consider first line of gene length as header ? [default : '%default' ]" | 65 help = "Consider first line of gene length as header ? [default : '%default' ]" |
67 ), | 66 ), |
68 make_option( | 67 make_option( |
69 c("-l", "--log"), | 68 c("-l", "--log"), |
70 default = FALSE, | 69 default = FALSE, |
71 type = "logical", | 70 type = "logical", |
72 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" | 71 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" |
73 ), | 72 ), |
74 make_option( | 73 make_option( |
75 c("-o", "--out"), | 74 c("-o", "--out"), |
76 default = "res.tab", | 75 default = "res.tab", |
77 type = "character", | 76 type = "character", |
78 help = "Output name [default : '%default' ]" | 77 help = "Output name [default : '%default' ]" |
79 ), | 78 ), |
80 make_option( | 79 make_option( |
81 "--visu", | 80 "--visu", |
82 default = FALSE, | 81 default = FALSE, |
83 type = "logical", | 82 type = "logical", |
84 help = "performs T-SNE [default : '%default' ]" | 83 help = "performs T-SNE [default : '%default' ]" |
85 ), | 84 ), |
86 make_option( | 85 make_option( |
87 "--tsne_labels", | 86 "--tsne_labels", |
88 default = FALSE, | 87 default = FALSE, |
89 type = "logical", | 88 type = "logical", |
90 help = "add labels to t-SNE plot [default : '%default' ]" | 89 help = "add labels to t-SNE plot [default : '%default' ]" |
91 ), | 90 ), |
92 make_option( | 91 make_option( |
93 "--seed", | 92 "--seed", |
94 default = 42, | 93 default = 42, |
95 type = "integer", | 94 type = "integer", |
96 help = "Seed value for reproducibility [default : '%default' ]" | 95 help = "Seed value for reproducibility [default : '%default' ]" |
97 ), | 96 ), |
98 make_option( | 97 make_option( |
99 "--perp", | 98 "--perp", |
100 default = 5.0, | 99 default = 5.0, |
101 type = "numeric", | 100 type = "numeric", |
102 help = "perplexity [default : '%default' ]" | 101 help = "perplexity [default : '%default' ]" |
103 ), | 102 ), |
104 make_option( | 103 make_option( |
105 "--theta", | 104 "--theta", |
106 default = 1.0, | 105 default = 1.0, |
107 type = "numeric", | 106 type = "numeric", |
108 help = "theta [default : '%default' ]" | 107 help = "theta [default : '%default' ]" |
109 ), | 108 ), |
110 make_option( | 109 make_option( |
111 c("-D", "--tsne_out"), | 110 c("-D", "--tsne_out"), |
112 default = "tsne.pdf", | 111 default = "tsne.pdf", |
113 type = "character", | 112 type = "character", |
114 help = "T-SNE pdf [default : '%default' ]" | 113 help = "T-SNE pdf [default : '%default' ]" |
115 ), | 114 ), |
116 make_option( | 115 make_option( |
117 "--pca_out", | 116 "--pca_out", |
118 default = "pca.pdf", | 117 default = "pca.pdf", |
119 type = "character", | 118 type = "character", |
120 help = "PCA pdf [default : '%default' ]" | 119 help = "PCA pdf [default : '%default' ]" |
121 ) | 120 ) |
122 | 121 |
123 ) | 122 ) |
124 | 123 |
125 opt <- parse_args(OptionParser(option_list = option_list), | 124 opt <- parse_args(OptionParser(option_list = option_list), |
126 args = commandArgs(trailingOnly = TRUE)) | 125 args = commandArgs(trailingOnly = TRUE)) |
127 | 126 |
128 if (opt$data == "" && !(opt$help)) { | 127 if (opt$data == "" && !(opt$help)) { |
129 stop("At least one argument must be supplied (count data).\n", | 128 stop("At least one argument must be supplied (count data).\n", |
130 call. = FALSE) | 129 call. = FALSE) |
131 } else if ((opt$type == "tpm" || opt$type == "rpk") && opt$gene == "") { | 130 } else if ((opt$type == "tpm" || opt$type == "rpkm") && opt$gene == "") { |
132 stop("At least two arguments must be supplied (count data and gene length file).\n", | 131 stop("At least two arguments must be supplied (count data and gene length file).\n", |
133 call. = FALSE) | 132 call. = FALSE) |
134 } else if (opt$type != "tpm" && opt$type != "rpk" && opt$type != "cpm" && opt$type != "none") { | 133 } else if (opt$type != "tpm" && opt$type != "rpkm" && opt$type != "cpm" && opt$type != "none") { |
135 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n", | 134 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpkm.\n", |
136 call. = FALSE) | 135 call. = FALSE) |
137 } | 136 } |
138 | 137 |
139 if (opt$sep == "tab") { | 138 if (opt$sep == "tab") { |
140 opt$sep <- "\t" | 139 opt$sep <- "\t" |
141 } | 140 } |
142 if (opt$gene_sep == "tab") { | 141 if (opt$gene_sep == "tab") { |
143 opt$gene_sep <- "\t" | 142 opt$gene_sep <- "\t" |
144 } | 143 } |
145 | 144 |
146 cpm <- function(count) { | 145 cpm <- function(count) { |
147 (count / colSums(count)) * 1000000 | 146 (count / colSums(count)) * 1000000 |
148 } | 147 } |
149 | 148 |
150 | 149 |
151 rpk <- function(count, length) { | 150 rpk <- function(count, length) { |
152 count / (length / 1000) | 151 count / (length / 1000) |
153 } | 152 } |
154 | 153 |
155 tpm <- function(count, length) { | 154 tpm <- function(count, length) { |
156 rpk <- rpk(count, length) | 155 rpk <- rpk(count, length) |
157 per_million_factor <- colSums(rpk) / 1000000 | 156 per_million_factor <- colSums(rpk) / 1000000 |
158 tpm <- rpk / per_million_factor | 157 tpm <- rpk / per_million_factor |
159 return(tpm) | 158 return(tpm) |
159 } | |
160 | |
161 rpkm <- function(count, length) { | |
162 rpk <- rpk(count, length) | |
163 per_million_factor <- colSums(as.data.frame(count)) / 1000000 | |
164 rpkm <- rpk / per_million_factor | |
165 return(rpkm) | |
160 } | 166 } |
161 | 167 |
162 #### running code #### | 168 #### running code #### |
163 | 169 |
164 data <- read.delim( | 170 data <- read.delim( |
165 opt$data, | 171 opt$data, |
166 check.names = FALSE, | 172 check.names = FALSE, |
167 header = opt$colnames, | 173 header = opt$colnames, |
168 row.names = 1, | 174 row.names = 1, |
169 sep = opt$sep | 175 sep = opt$sep |
170 ) | 176 ) |
171 | 177 |
172 if (opt$type == "tpm" || opt$type == "rpk") { | 178 if (opt$type == "tpm" || opt$type == "rpkm") { |
173 gene_length <- as.data.frame( | 179 gene_length <- as.data.frame( |
174 read.delim( | 180 read.delim( |
175 opt$gene, | 181 opt$gene, |
176 header = opt$gene_header, | 182 header = opt$gene_header, |
177 row.names = 1, | 183 row.names = 1, |
178 sep = opt$gene_sep | 184 sep = opt$gene_sep |
185 ) | |
179 ) | 186 ) |
180 ) | 187 gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) |
181 gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) | 188 } |
182 } | 189 |
183 | 190 |
184 | 191 if (opt$type == "cpm") { |
185 if (opt$type == "cpm") | 192 res <- cpm(data) |
186 res <- cpm(data) | 193 } |
187 if (opt$type == "tpm") | 194 if (opt$type == "tpm") { |
188 res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) | 195 res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) |
189 if (opt$type == "rpk") | 196 } |
190 res <- as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) | 197 if (opt$type == "rpkm") { |
191 if (opt$type == "none") | 198 res <- as.data.frame(apply(data, 2, rpkm, length = gene_length), row.names = rownames(data)) |
192 res <- data | 199 } |
200 if (opt$type == "none") { | |
201 res <- data | |
202 } | |
193 colnames(res) <- colnames(data) | 203 colnames(res) <- colnames(data) |
194 | 204 |
195 if (opt$log == TRUE) { | 205 if (opt$log == TRUE) { |
196 res <- log2(res + 1) | 206 res <- log2(res + 1) |
197 } | 207 } |
198 | 208 |
199 if (opt$visu == TRUE) { | 209 if (opt$visu == TRUE) { |
200 df <- res | 210 df <- res |
201 # filter and transpose df for tsne and pca | 211 # filter and transpose df for tsne and pca |
202 df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts) | 212 df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts) |
203 tdf <- t(df) | 213 tdf <- t(df) |
204 # make tsne and plot results | 214 # make tsne and plot results |
205 set.seed(opt$seed) ## Sets seed for reproducibility | 215 set.seed(opt$seed) ## Sets seed for reproducibility |
206 tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta) | 216 tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta) |
207 embedding <- as.data.frame(tsne_out$Y) | 217 embedding <- as.data.frame(tsne_out$Y) |
208 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) | 218 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) |
209 gg_legend <- theme(legend.position = "none") | 219 gg_legend <- theme(legend.position = "none") |
210 ggplot(embedding, aes(x = V1, y = V2)) + | 220 ggplot(embedding, aes(x = V1, y = V2)) + |
211 geom_point(size = 1, color = "red") + | 221 geom_point(size = 1, color = "red") + |
212 gg_legend + | 222 gg_legend + |
213 xlab("") + | 223 xlab("") + |
214 ylab("") + | 224 ylab("") + |
215 ggtitle("t-SNE") + | 225 ggtitle("t-SNE") + |
226 if (opt$tsne_labels == TRUE) { | |
227 geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue") | |
228 } | |
229 ggsave(file = opt$tsne_out, device = "pdf") | |
230 # make PCA and plot result with ggfortify (autoplot) | |
231 tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE) | |
216 if (opt$tsne_labels == TRUE) { | 232 if (opt$tsne_labels == TRUE) { |
217 geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue") | 233 autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2, |
234 label.hjust = 1.2, | |
235 colour = "darkblue") + | |
236 geom_point(size = 1, color = "red") + | |
237 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + | |
238 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + | |
239 ggtitle("PCA") | |
240 ggsave(file = opt$pca_out, device = "pdf") | |
241 } else { | |
242 autoplot(tdf_pca, shape = TRUE, colour = "darkblue") + | |
243 geom_point(size = 1, color = "red") + | |
244 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + | |
245 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + | |
246 ggtitle("PCA") | |
247 ggsave(file = opt$pca_out, device = "pdf") | |
218 } | 248 } |
219 ggsave(file = opt$tsne_out, device = "pdf") | |
220 # make PCA and plot result with ggfortify (autoplot) | |
221 tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE) | |
222 if (opt$tsne_labels == TRUE) { | |
223 autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2, | |
224 label.hjust = 1.2, | |
225 colour = "darkblue") + | |
226 geom_point(size = 1, color = "red") + | |
227 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + | |
228 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + | |
229 ggtitle("PCA") | |
230 ggsave(file = opt$pca_out, device = "pdf") | |
231 } else { | |
232 autoplot(tdf_pca, shape = TRUE, colour = "darkblue") + | |
233 geom_point(size = 1, color = "red") + | |
234 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + | |
235 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + | |
236 ggtitle("PCA") | |
237 ggsave(file = opt$pca_out, device = "pdf") | |
238 } | |
239 } | 249 } |
240 | 250 |
241 # at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output | 251 # at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output |
242 is_num <- sapply(res, is.numeric) | 252 is_num <- sapply(res, is.numeric) |
243 res[is_num] <- lapply(res[is_num], round, 8) | 253 res[is_num] <- lapply(res[is_num], round, 8) |
244 | 254 |
245 write.table( | 255 write.table( |
246 cbind(Features = rownames(res), res), | 256 cbind(Features = rownames(res), res), |
247 opt$out, | 257 opt$out, |
248 col.names = opt$colnames, | 258 col.names = opt$colnames, |
249 row.names = FALSE, | 259 row.names = FALSE, |
250 quote = FALSE, | 260 quote = FALSE, |
251 sep = "\t" | 261 sep = "\t" |
252 ) | 262 ) |