comparison cpm_tpm_rpk.R @ 7:f00c1c34565e draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/cpm_tpm_rpk commit 8cca5f05212cd25113955fb9c11801f4047c12b4
author artbio
date Wed, 03 Jul 2024 14:39:26 +0000
parents bcff1eb6fdb5
children
comparison
equal deleted inserted replaced
6:d2575928b824 7:f00c1c34565e
1 if (length(commandArgs(TRUE)) == 0) { 1 if (length(commandArgs(TRUE)) == 0) {
2 system("Rscript cpm_tpm_rpk.R -h", intern = FALSE) 2 system("Rscript cpm_tpm_rpk.R -h", intern = FALSE)
3 q("no") 3 q("no")
4 } 4 }
5 5
6 6
7 # load packages that are provided in the conda env 7 # load packages that are provided in the conda env
8 options(show.error.messages = FALSE, 8 options(show.error.messages = FALSE,
9 error = function() { 9 error = function() {
10 cat(geterrmessage(), file = stderr()) 10 cat(geterrmessage(), file = stderr())
11 q("no", 1, FALSE) 11 q("no", 1, FALSE)
12 } 12 }
13 ) 13 )
14 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint⁠ 14 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint⁠
15 warnings() 15 warnings()
16 library(optparse) 16 library(optparse)
17 library(ggplot2) 17 library(ggplot2)
18 library(reshape2) 18 library(reshape2)
19 library(Rtsne) # nolint⁠ 19 library(Rtsne) # nolint⁠
20 library(ggfortify) 20 library(ggfortify)
21 21
22 22
23
24 #Arguments 23 #Arguments
25 option_list <- list( 24 option_list <- list(
26 make_option( 25 make_option(
27 c("-d", "--data"), 26 c("-d", "--data"),
28 default = NA, 27 default = NA,
29 type = "character", 28 type = "character",
30 help = "Input file that contains count values to transform" 29 help = "Input file that contains count values to transform"
31 ), 30 ),
32 make_option( 31 make_option(
33 c("-t", "--type"), 32 c("-t", "--type"),
34 default = "cpm", 33 default = "cpm",
35 type = "character", 34 type = "character",
36 help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]" 35 help = "Transformation type, either cpm, tpm, rpkm or none[default : '%default' ]"
37 ), 36 ),
38 make_option( 37 make_option(
39 c("-s", "--sep"), 38 c("-s", "--sep"),
40 default = "\t", 39 default = "\t",
41 type = "character", 40 type = "character",
42 help = "File separator [default : '%default' ]" 41 help = "File separator [default : '%default' ]"
43 ), 42 ),
44 make_option( 43 make_option(
45 c("-c", "--colnames"), 44 c("-c", "--colnames"),
46 default = TRUE, 45 default = TRUE,
47 type = "logical", 46 type = "logical",
48 help = "Consider first line as header ? [default : '%default' ]" 47 help = "Consider first line as header ? [default : '%default' ]"
49 ), 48 ),
50 make_option( 49 make_option(
51 c("-f", "--gene"), 50 c("-f", "--gene"),
52 default = NA, 51 default = NA,
53 type = "character", 52 type = "character",
54 help = "Two column of gene length file" 53 help = "Two column of gene length file"
55 ), 54 ),
56 make_option( 55 make_option(
57 c("-a", "--gene_sep"), 56 c("-a", "--gene_sep"),
58 default = "\t", 57 default = "\t",
59 type = "character", 58 type = "character",
60 help = "Gene length file separator [default : '%default' ]" 59 help = "Gene length file separator [default : '%default' ]"
61 ), 60 ),
62 make_option( 61 make_option(
63 c("-b", "--gene_header"), 62 c("-b", "--gene_header"),
64 default = TRUE, 63 default = TRUE,
65 type = "logical", 64 type = "logical",
66 help = "Consider first line of gene length as header ? [default : '%default' ]" 65 help = "Consider first line of gene length as header ? [default : '%default' ]"
67 ), 66 ),
68 make_option( 67 make_option(
69 c("-l", "--log"), 68 c("-l", "--log"),
70 default = FALSE, 69 default = FALSE,
71 type = "logical", 70 type = "logical",
72 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" 71 help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]"
73 ), 72 ),
74 make_option( 73 make_option(
75 c("-o", "--out"), 74 c("-o", "--out"),
76 default = "res.tab", 75 default = "res.tab",
77 type = "character", 76 type = "character",
78 help = "Output name [default : '%default' ]" 77 help = "Output name [default : '%default' ]"
79 ), 78 ),
80 make_option( 79 make_option(
81 "--visu", 80 "--visu",
82 default = FALSE, 81 default = FALSE,
83 type = "logical", 82 type = "logical",
84 help = "performs T-SNE [default : '%default' ]" 83 help = "performs T-SNE [default : '%default' ]"
85 ), 84 ),
86 make_option( 85 make_option(
87 "--tsne_labels", 86 "--tsne_labels",
88 default = FALSE, 87 default = FALSE,
89 type = "logical", 88 type = "logical",
90 help = "add labels to t-SNE plot [default : '%default' ]" 89 help = "add labels to t-SNE plot [default : '%default' ]"
91 ), 90 ),
92 make_option( 91 make_option(
93 "--seed", 92 "--seed",
94 default = 42, 93 default = 42,
95 type = "integer", 94 type = "integer",
96 help = "Seed value for reproducibility [default : '%default' ]" 95 help = "Seed value for reproducibility [default : '%default' ]"
97 ), 96 ),
98 make_option( 97 make_option(
99 "--perp", 98 "--perp",
100 default = 5.0, 99 default = 5.0,
101 type = "numeric", 100 type = "numeric",
102 help = "perplexity [default : '%default' ]" 101 help = "perplexity [default : '%default' ]"
103 ), 102 ),
104 make_option( 103 make_option(
105 "--theta", 104 "--theta",
106 default = 1.0, 105 default = 1.0,
107 type = "numeric", 106 type = "numeric",
108 help = "theta [default : '%default' ]" 107 help = "theta [default : '%default' ]"
109 ), 108 ),
110 make_option( 109 make_option(
111 c("-D", "--tsne_out"), 110 c("-D", "--tsne_out"),
112 default = "tsne.pdf", 111 default = "tsne.pdf",
113 type = "character", 112 type = "character",
114 help = "T-SNE pdf [default : '%default' ]" 113 help = "T-SNE pdf [default : '%default' ]"
115 ), 114 ),
116 make_option( 115 make_option(
117 "--pca_out", 116 "--pca_out",
118 default = "pca.pdf", 117 default = "pca.pdf",
119 type = "character", 118 type = "character",
120 help = "PCA pdf [default : '%default' ]" 119 help = "PCA pdf [default : '%default' ]"
121 ) 120 )
122 121
123 ) 122 )
124 123
125 opt <- parse_args(OptionParser(option_list = option_list), 124 opt <- parse_args(OptionParser(option_list = option_list),
126 args = commandArgs(trailingOnly = TRUE)) 125 args = commandArgs(trailingOnly = TRUE))
127 126
128 if (opt$data == "" && !(opt$help)) { 127 if (opt$data == "" && !(opt$help)) {
129 stop("At least one argument must be supplied (count data).\n", 128 stop("At least one argument must be supplied (count data).\n",
130 call. = FALSE) 129 call. = FALSE)
131 } else if ((opt$type == "tpm" || opt$type == "rpk") && opt$gene == "") { 130 } else if ((opt$type == "tpm" || opt$type == "rpkm") && opt$gene == "") {
132 stop("At least two arguments must be supplied (count data and gene length file).\n", 131 stop("At least two arguments must be supplied (count data and gene length file).\n",
133 call. = FALSE) 132 call. = FALSE)
134 } else if (opt$type != "tpm" && opt$type != "rpk" && opt$type != "cpm" && opt$type != "none") { 133 } else if (opt$type != "tpm" && opt$type != "rpkm" && opt$type != "cpm" && opt$type != "none") {
135 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n", 134 stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpkm.\n",
136 call. = FALSE) 135 call. = FALSE)
137 } 136 }
138 137
139 if (opt$sep == "tab") { 138 if (opt$sep == "tab") {
140 opt$sep <- "\t" 139 opt$sep <- "\t"
141 } 140 }
142 if (opt$gene_sep == "tab") { 141 if (opt$gene_sep == "tab") {
143 opt$gene_sep <- "\t" 142 opt$gene_sep <- "\t"
144 } 143 }
145 144
146 cpm <- function(count) { 145 cpm <- function(count) {
147 (count / colSums(count)) * 1000000 146 (count / colSums(count)) * 1000000
148 } 147 }
149 148
150 149
151 rpk <- function(count, length) { 150 rpk <- function(count, length) {
152 count / (length / 1000) 151 count / (length / 1000)
153 } 152 }
154 153
155 tpm <- function(count, length) { 154 tpm <- function(count, length) {
156 rpk <- rpk(count, length) 155 rpk <- rpk(count, length)
157 per_million_factor <- colSums(rpk) / 1000000 156 per_million_factor <- colSums(rpk) / 1000000
158 tpm <- rpk / per_million_factor 157 tpm <- rpk / per_million_factor
159 return(tpm) 158 return(tpm)
159 }
160
161 rpkm <- function(count, length) {
162 rpk <- rpk(count, length)
163 per_million_factor <- colSums(as.data.frame(count)) / 1000000
164 rpkm <- rpk / per_million_factor
165 return(rpkm)
160 } 166 }
161 167
162 #### running code #### 168 #### running code ####
163 169
164 data <- read.delim( 170 data <- read.delim(
165 opt$data, 171 opt$data,
166 check.names = FALSE, 172 check.names = FALSE,
167 header = opt$colnames, 173 header = opt$colnames,
168 row.names = 1, 174 row.names = 1,
169 sep = opt$sep 175 sep = opt$sep
170 ) 176 )
171 177
172 if (opt$type == "tpm" || opt$type == "rpk") { 178 if (opt$type == "tpm" || opt$type == "rpkm") {
173 gene_length <- as.data.frame( 179 gene_length <- as.data.frame(
174 read.delim( 180 read.delim(
175 opt$gene, 181 opt$gene,
176 header = opt$gene_header, 182 header = opt$gene_header,
177 row.names = 1, 183 row.names = 1,
178 sep = opt$gene_sep 184 sep = opt$gene_sep
185 )
179 ) 186 )
180 ) 187 gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data))
181 gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) 188 }
182 } 189
183 190
184 191 if (opt$type == "cpm") {
185 if (opt$type == "cpm") 192 res <- cpm(data)
186 res <- cpm(data) 193 }
187 if (opt$type == "tpm") 194 if (opt$type == "tpm") {
188 res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) 195 res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data))
189 if (opt$type == "rpk") 196 }
190 res <- as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) 197 if (opt$type == "rpkm") {
191 if (opt$type == "none") 198 res <- as.data.frame(apply(data, 2, rpkm, length = gene_length), row.names = rownames(data))
192 res <- data 199 }
200 if (opt$type == "none") {
201 res <- data
202 }
193 colnames(res) <- colnames(data) 203 colnames(res) <- colnames(data)
194 204
195 if (opt$log == TRUE) { 205 if (opt$log == TRUE) {
196 res <- log2(res + 1) 206 res <- log2(res + 1)
197 } 207 }
198 208
199 if (opt$visu == TRUE) { 209 if (opt$visu == TRUE) {
200 df <- res 210 df <- res
201 # filter and transpose df for tsne and pca 211 # filter and transpose df for tsne and pca
202 df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts) 212 df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts)
203 tdf <- t(df) 213 tdf <- t(df)
204 # make tsne and plot results 214 # make tsne and plot results
205 set.seed(opt$seed) ## Sets seed for reproducibility 215 set.seed(opt$seed) ## Sets seed for reproducibility
206 tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta) 216 tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta)
207 embedding <- as.data.frame(tsne_out$Y) 217 embedding <- as.data.frame(tsne_out$Y)
208 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) 218 embedding$Class <- as.factor(sub("Class_", "", rownames(tdf)))
209 gg_legend <- theme(legend.position = "none") 219 gg_legend <- theme(legend.position = "none")
210 ggplot(embedding, aes(x = V1, y = V2)) + 220 ggplot(embedding, aes(x = V1, y = V2)) +
211 geom_point(size = 1, color = "red") + 221 geom_point(size = 1, color = "red") +
212 gg_legend + 222 gg_legend +
213 xlab("") + 223 xlab("") +
214 ylab("") + 224 ylab("") +
215 ggtitle("t-SNE") + 225 ggtitle("t-SNE") +
226 if (opt$tsne_labels == TRUE) {
227 geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue")
228 }
229 ggsave(file = opt$tsne_out, device = "pdf")
230 # make PCA and plot result with ggfortify (autoplot)
231 tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE)
216 if (opt$tsne_labels == TRUE) { 232 if (opt$tsne_labels == TRUE) {
217 geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue") 233 autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2,
234 label.hjust = 1.2,
235 colour = "darkblue") +
236 geom_point(size = 1, color = "red") +
237 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
238 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
239 ggtitle("PCA")
240 ggsave(file = opt$pca_out, device = "pdf")
241 } else {
242 autoplot(tdf_pca, shape = TRUE, colour = "darkblue") +
243 geom_point(size = 1, color = "red") +
244 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
245 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
246 ggtitle("PCA")
247 ggsave(file = opt$pca_out, device = "pdf")
218 } 248 }
219 ggsave(file = opt$tsne_out, device = "pdf")
220 # make PCA and plot result with ggfortify (autoplot)
221 tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE)
222 if (opt$tsne_labels == TRUE) {
223 autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2,
224 label.hjust = 1.2,
225 colour = "darkblue") +
226 geom_point(size = 1, color = "red") +
227 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
228 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
229 ggtitle("PCA")
230 ggsave(file = opt$pca_out, device = "pdf")
231 } else {
232 autoplot(tdf_pca, shape = TRUE, colour = "darkblue") +
233 geom_point(size = 1, color = "red") +
234 xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
235 ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
236 ggtitle("PCA")
237 ggsave(file = opt$pca_out, device = "pdf")
238 }
239 } 249 }
240 250
241 # at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output 251 # at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output
242 is_num <- sapply(res, is.numeric) 252 is_num <- sapply(res, is.numeric)
243 res[is_num] <- lapply(res[is_num], round, 8) 253 res[is_num] <- lapply(res[is_num], round, 8)
244 254
245 write.table( 255 write.table(
246 cbind(Features = rownames(res), res), 256 cbind(Features = rownames(res), res),
247 opt$out, 257 opt$out,
248 col.names = opt$colnames, 258 col.names = opt$colnames,
249 row.names = FALSE, 259 row.names = FALSE,
250 quote = FALSE, 260 quote = FALSE,
251 sep = "\t" 261 sep = "\t"
252 ) 262 )