Mercurial > repos > artbio > cpm_tpm_rpk
comparison cpm_tpm_rpk.R @ 4:be358a1ebf67 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cpm_tpm_rpk commit a8486c89b7ddabfb1e814c2c42f6c04d3896904c
author | artbio |
---|---|
date | Thu, 05 Oct 2023 13:50:22 +0000 |
parents | 8b1020c25f0f |
children | bcff1eb6fdb5 |
comparison
equal
deleted
inserted
replaced
3:8b1020c25f0f | 4:be358a1ebf67 |
---|---|
1 if (length(commandArgs(TRUE)) == 0) { | 1 if (length(commandArgs(TRUE)) == 0) { |
2 system("Rscript cpm_tpm_rpk.R -h", intern = F) | 2 system("Rscript cpm_tpm_rpk.R -h", intern = FALSE) |
3 q("no") | 3 q("no") |
4 } | 4 } |
5 | 5 |
6 | 6 |
7 # load packages that are provided in the conda env | 7 # load packages that are provided in the conda env |
8 options( show.error.messages=F, | 8 options(show.error.messages = FALSE, |
9 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 9 error = function() { |
10 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 10 cat(geterrmessage(), file = stderr()) |
11 q("no", 1, FALSE) | |
12 } | |
13 ) | |
14 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint | |
11 warnings() | 15 warnings() |
12 library(optparse) | 16 library(optparse) |
13 library(ggplot2) | 17 library(ggplot2) |
14 library(reshape2) | 18 library(reshape2) |
15 library(Rtsne) | 19 library(Rtsne) # nolint |
16 library(ggfortify) | 20 library(ggfortify) |
17 | 21 |
18 | 22 |
19 | 23 |
20 #Arguments | 24 #Arguments |
21 option_list = list( | 25 option_list <- list( |
22 make_option( | 26 make_option( |
23 c("-d", "--data"), | 27 c("-d", "--data"), |
24 default = NA, | 28 default = NA, |
25 type = 'character', | 29 type = "character", |
26 help = "Input file that contains count values to transform" | 30 help = "Input file that contains count values to transform" |
27 ), | 31 ), |
28 make_option( | 32 make_option( |
29 c("-t", "--type"), | 33 c("-t", "--type"), |
30 default = 'cpm', | 34 default = "cpm", |
31 type = 'character', | 35 type = "character", |
32 help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]" | 36 help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]" |
33 ), | 37 ), |
34 make_option( | 38 make_option( |
35 c("-s", "--sep"), | 39 c("-s", "--sep"), |
36 default = '\t', | 40 default = "\t", |
37 type = 'character', | 41 type = "character", |
38 help = "File separator [default : '%default' ]" | 42 help = "File separator [default : '%default' ]" |
39 ), | 43 ), |
40 make_option( | 44 make_option( |
41 c("-c", "--colnames"), | 45 c("-c", "--colnames"), |
42 default = TRUE, | 46 default = TRUE, |
43 type = 'logical', | 47 type = "logical", |
44 help = "Consider first line as header ? [default : '%default' ]" | 48 help = "Consider first line as header ? [default : '%default' ]" |
45 ), | 49 ), |
46 make_option( | 50 make_option( |
47 c("-f", "--gene"), | 51 c("-f", "--gene"), |
48 default = NA, | 52 default = NA, |
49 type = 'character', | 53 type = "character", |
50 help = "Two column of gene length file" | 54 help = "Two column of gene length file" |
51 ), | 55 ), |
52 make_option( | 56 make_option( |
53 c("-a", "--gene_sep"), | 57 c("-a", "--gene_sep"), |
54 default = '\t', | 58 default = "\t", |
55 type = 'character', | 59 type = "character", |
56 help = "Gene length file separator [default : '%default' ]" | 60 help = "Gene length file separator [default : '%default' ]" |
57 ), | 61 ), |
58 make_option( | 62 make_option( |
59 c("-b", "--gene_header"), | 63 c("-b", "--gene_header"), |
60 default = TRUE, | 64 default = TRUE, |
61 type = 'logical', | 65 type = "logical", |
62 help = "Consider first line of gene length as header ? [default : '%default' ]" | 66 help = "Consider first line of gene length as header ? [default : '%default' ]" |
63 ), | 67 ), |
64 make_option( | 68 make_option( |
65 c("-l", "--log"), | 69 c("-l", "--log"), |
66 default = FALSE, | 70 default = FALSE, |
67 type = 'logical', | 71 type = "logical", |
68 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" | 72 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" |
69 ), | 73 ), |
70 make_option( | 74 make_option( |
71 c("-o", "--out"), | 75 c("-o", "--out"), |
72 default = "res.tab", | 76 default = "res.tab", |
73 type = 'character', | 77 type = "character", |
74 help = "Output name [default : '%default' ]" | 78 help = "Output name [default : '%default' ]" |
75 ), | 79 ), |
76 make_option( | 80 make_option( |
77 "--visu", | 81 "--visu", |
78 default = FALSE, | 82 default = FALSE, |
79 type = 'logical', | 83 type = "logical", |
80 help = "performs T-SNE [default : '%default' ]" | 84 help = "performs T-SNE [default : '%default' ]" |
81 ), | 85 ), |
82 make_option( | 86 make_option( |
83 "--tsne_labels", | 87 "--tsne_labels", |
84 default = FALSE, | 88 default = FALSE, |
85 type = 'logical', | 89 type = "logical", |
86 help = "add labels to t-SNE plot [default : '%default' ]" | 90 help = "add labels to t-SNE plot [default : '%default' ]" |
87 ), | 91 ), |
88 make_option( | 92 make_option( |
89 "--seed", | 93 "--seed", |
90 default = 42, | 94 default = 42, |
91 type = 'integer', | 95 type = "integer", |
92 help = "Seed value for reproducibility [default : '%default' ]" | 96 help = "Seed value for reproducibility [default : '%default' ]" |
93 ), | 97 ), |
94 make_option( | 98 make_option( |
95 "--perp", | 99 "--perp", |
96 default = 5.0, | 100 default = 5.0, |
97 type = 'numeric', | 101 type = "numeric", |
98 help = "perplexity [default : '%default' ]" | 102 help = "perplexity [default : '%default' ]" |
99 ), | 103 ), |
100 make_option( | 104 make_option( |
101 "--theta", | 105 "--theta", |
102 default = 1.0, | 106 default = 1.0, |
103 type = 'numeric', | 107 type = "numeric", |
104 help = "theta [default : '%default' ]" | 108 help = "theta [default : '%default' ]" |
105 ), | 109 ), |
106 make_option( | 110 make_option( |
107 c("-D", "--tsne_out"), | 111 c("-D", "--tsne_out"), |
108 default = "tsne.pdf", | 112 default = "tsne.pdf", |
109 type = 'character', | 113 type = "character", |
110 help = "T-SNE pdf [default : '%default' ]" | 114 help = "T-SNE pdf [default : '%default' ]" |
111 ), | 115 ), |
112 make_option( | 116 make_option( |
113 "--pca_out", | 117 "--pca_out", |
114 default = "pca.pdf", | 118 default = "pca.pdf", |
115 type = 'character', | 119 type = "character", |
116 help = "PCA pdf [default : '%default' ]" | 120 help = "PCA pdf [default : '%default' ]" |
117 ) | 121 ) |
118 | 122 |
119 ) | 123 ) |
120 | 124 |
121 opt = parse_args(OptionParser(option_list = option_list), | 125 opt <- parse_args(OptionParser(option_list = option_list), |
122 args = commandArgs(trailingOnly = TRUE)) | 126 args = commandArgs(trailingOnly = TRUE)) |
123 | 127 |
124 if (opt$data == "" & !(opt$help)) { | 128 if (opt$data == "" && !(opt$help)) { |
125 stop("At least one argument must be supplied (count data).\n", | 129 stop("At least one argument must be supplied (count data).\n", |
126 call. = FALSE) | 130 call. = FALSE) |
127 } else if ((opt$type == "tpm" | opt$type == "rpk") & opt$gene == "") { | 131 } else if ((opt$type == "tpm" || opt$type == "rpk") && opt$gene == "") { |
128 stop("At least two arguments must be supplied (count data and gene length file).\n", | 132 stop("At least two arguments must be supplied (count data and gene length file).\n", |
129 call. = FALSE) | 133 call. = FALSE) |
130 } else if (opt$type != "tpm" & opt$type != "rpk" & opt$type != "cpm" & opt$type != "none") { | 134 } else if (opt$type != "tpm" && opt$type != "rpk" && opt$type != "cpm" && opt$type != "none") { |
131 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n", | 135 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n", |
132 call. = FALSE) | 136 call. = FALSE) |
133 } | 137 } |
134 | 138 |
135 if (opt$sep == "tab") {opt$sep = "\t"} | 139 if (opt$sep == "tab") { |
136 if (opt$gene_sep == "tab") {opt$gene_sep = "\t"} | 140 opt$sep <- "\t" |
141 } | |
142 if (opt$gene_sep == "tab") { | |
143 opt$gene_sep <- "\t" | |
144 } | |
137 | 145 |
138 cpm <- function(count) { | 146 cpm <- function(count) { |
139 t(t(count) / colSums(count)) * 1000000 | 147 (count / colSums(count)) * 1000000 |
140 } | 148 } |
141 | 149 |
142 | 150 |
143 rpk <- function(count, length) { | 151 rpk <- function(count, length) { |
144 count / (length / 1000) | 152 count / (length / 1000) |
145 } | 153 } |
146 | 154 |
147 tpm <- function(count, length) { | 155 tpm <- function(count, length) { |
148 RPK = rpk(count, length) | 156 rpk <- rpk(count, length) |
149 perMillion_factor = colSums(RPK) / 1000000 | 157 per_million_factor <- colSums(rpk) / 1000000 |
150 TPM = RPK / perMillion_factor | 158 tpm <- rpk / per_million_factor |
151 return(TPM) | 159 return(tpm) |
152 } | 160 } |
153 | 161 |
154 data = read.table( | 162 #### running code #### |
163 | |
164 data <- read.table( | |
155 opt$data, | 165 opt$data, |
156 check.names = FALSE, | 166 check.names = FALSE, |
157 header = opt$colnames, | 167 header = opt$colnames, |
158 row.names = 1, | 168 row.names = 1, |
159 sep = opt$sep | 169 sep = opt$sep |
160 ) | 170 ) |
161 | 171 |
162 if (opt$type == "tpm" | opt$type == "rpk") { | 172 if (opt$type == "tpm" || opt$type == "rpk") { |
163 gene_length = as.data.frame( | 173 gene_length <- as.data.frame( |
164 read.table( | 174 read.table( |
165 opt$gene, | 175 opt$gene, |
166 h = opt$gene_header, | 176 header = opt$gene_header, |
167 row.names = 1, | 177 row.names = 1, |
168 sep = opt$gene_sep | 178 sep = opt$gene_sep |
169 ) | 179 ) |
170 ) | 180 ) |
171 gene_length = as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) | 181 gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) |
172 } | 182 } |
173 | 183 |
174 | 184 |
175 if (opt$type == "cpm") | 185 if (opt$type == "cpm") |
176 res = cpm(data) | 186 res <- cpm(data) |
177 if (opt$type == "tpm") | 187 if (opt$type == "tpm") |
178 res = as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) | 188 res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) |
179 if (opt$type == "rpk") | 189 if (opt$type == "rpk") |
180 res = as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) | 190 res <- as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) |
181 if (opt$type == "none") | 191 if (opt$type == "none") |
182 res = data | 192 res <- data |
183 colnames(res) = colnames(data) | 193 colnames(res) <- colnames(data) |
184 | |
185 | 194 |
186 if (opt$log == TRUE) { | 195 if (opt$log == TRUE) { |
187 res = log2(res + 1) | 196 res <- log2(res + 1) |
188 } | 197 } |
198 | |
199 if (opt$visu == TRUE) { | |
200 df <- res | |
201 # filter and transpose df for tsne and pca | |
202 df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts) | |
203 tdf <- t(df) | |
204 # make tsne and plot results | |
205 set.seed(opt$seed) ## Sets seed for reproducibility | |
206 tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta) | |
207 embedding <- as.data.frame(tsne_out$Y) | |
208 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) | |
209 gg_legend <- theme(legend.position = "none") | |
210 ggplot(embedding, aes(x = V1, y = V2)) + | |
211 geom_point(size = 1, color = "red") + | |
212 gg_legend + | |
213 xlab("") + | |
214 ylab("") + | |
215 ggtitle("t-SNE") + | |
216 if (opt$tsne_labels == TRUE) { | |
217 geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue") | |
218 } | |
219 ggsave(file = opt$tsne_out, device = "pdf") | |
220 # make PCA and plot result with ggfortify (autoplot) | |
221 tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE) | |
222 if (opt$tsne_labels == TRUE) { | |
223 autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2, | |
224 label.hjust = 1.2, | |
225 colour = "darkblue") + | |
226 geom_point(size = 1, color = "red") + | |
227 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + | |
228 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + | |
229 ggtitle("PCA") | |
230 ggsave(file = opt$pca_out, device = "pdf") | |
231 } else { | |
232 autoplot(tdf_pca, shape = TRUE, colour = "darkblue") + | |
233 geom_point(size = 1, color = "red") + | |
234 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + | |
235 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + | |
236 ggtitle("PCA") | |
237 ggsave(file = opt$pca_out, device = "pdf") | |
238 } | |
239 } | |
240 | |
241 # at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output | |
242 is_num <- sapply(res, is.numeric) | |
243 res[is_num] <- lapply(res[is_num], round, 8) | |
189 | 244 |
190 write.table( | 245 write.table( |
191 cbind(Features = rownames(res), res), | 246 cbind(Features = rownames(res), res), |
192 opt$out, | 247 opt$out, |
193 col.names = opt$colnames, | 248 col.names = opt$colnames, |
194 row.names = F, | 249 row.names = FALSE, |
195 quote = F, | 250 quote = FALSE, |
196 sep = "\t" | 251 sep = "\t" |
197 ) | 252 ) |
198 | |
199 ## | |
200 if (opt$visu == TRUE) { | |
201 df = res | |
202 # filter and transpose df for tsne and pca | |
203 df = df[rowSums(df) != 0,] # remove lines without information (with only 0 counts) | |
204 tdf = t(df) | |
205 # make tsne and plot results | |
206 set.seed(opt$seed) ## Sets seed for reproducibility | |
207 tsne_out <- Rtsne(tdf, perplexity=opt$perp, theta=opt$theta) # | |
208 embedding <- as.data.frame(tsne_out$Y) | |
209 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) | |
210 gg_legend = theme(legend.position="none") | |
211 ggplot(embedding, aes(x=V1, y=V2)) + | |
212 geom_point(size=1, color='red') + | |
213 gg_legend + | |
214 xlab("") + | |
215 ylab("") + | |
216 ggtitle('t-SNE') + | |
217 if (opt$tsne_labels == TRUE) { | |
218 geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=2.5, color='darkblue') | |
219 } | |
220 ggsave(file=opt$tsne_out, device="pdf") | |
221 # make PCA and plot result with ggfortify (autoplot) | |
222 tdf.pca <- prcomp(tdf, center = TRUE, scale. = T) | |
223 if (opt$tsne_labels == TRUE) { | |
224 autoplot(tdf.pca, shape=F, label=T, label.size=2.5, label.vjust=1.2, | |
225 label.hjust=1.2, | |
226 colour="darkblue") + | |
227 geom_point(size=1, color='red') + | |
228 xlab(paste("PC1",summary(tdf.pca)$importance[2,1]*100, "%")) + | |
229 ylab(paste("PC2",summary(tdf.pca)$importance[2,2]*100, "%")) + | |
230 ggtitle('PCA') | |
231 ggsave(file=opt$pca_out, device="pdf") | |
232 } else { | |
233 autoplot(tdf.pca, shape=T, colour="darkblue") + | |
234 geom_point(size=1, color='red') + | |
235 xlab(paste("PC1",summary(tdf.pca)$importance[2,1]*100, "%")) + | |
236 ylab(paste("PC2",summary(tdf.pca)$importance[2,2]*100, "%")) + | |
237 ggtitle('PCA') | |
238 ggsave(file=opt$pca_out, device="pdf") | |
239 } | |
240 } | |
241 | |
242 | |
243 | |
244 | |
245 | |
246 | |
247 | |
248 | |
249 | |
250 | |
251 | |
252 |