annotate MT2MQ.R @ 5:c3a418f7cf7f draft

"planemo upload commit 499df83651e020e9106d70ec658edf86355f49d9"
author galaxyp
date Wed, 21 Oct 2020 16:39:25 +0000
parents 9c8e7137d331
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
1 # MT2MQ: prepares metatranscriptomic outputs from ASaiM (HUMAnN2 and MetaPhlAn) for metaQuantome
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
2
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
3 # Load libraries
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
4 suppressPackageStartupMessages(library(tidyverse))
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
5 suppressPackageStartupMessages(library(taxize))
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
6
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
7 # Set parameters from arguments
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
8 args <- commandArgs(trailingOnly = TRUE)
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
9 data <- args[1]
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
10 # data: full path to file or directory:
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
11 # - if in function ("f") or function-taxonomy ("ft") mode, should be a TSV file of HUMAnN2 gene families, after regrouping and renaming to GO, joining samples, and renormalizing to CPM.
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
12 # - if in taxonomy ("t") mode, should be a directory of TSV files of MetaPhlAn genus-level results (each with "genus" and "abundance" columns)
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
13 mode <- args[2]
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
14 # mode:
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
15 # -"f": function
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
16 # -"t": taxonomy
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
17 # -"ft": function-taxonomy
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
18 ontology <- unlist(strsplit(args[3], split = ","))
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
19 # ontology: only used in function ("f") or function-taxonomy ("ft") mode. A comma-separated string of the GO namespace(s) to include.
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
20 # ex: to include all: "molecular_function,biological_process,cellular_component"
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
21
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
22 int_file <- args[4]
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
23 # int_file: full path (including file name and extension) of the intensity file to write
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
24
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
25 func_file <- args[5]
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
26 # func_file: full path (including file name and extension) of the func file to write
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
27
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
28 tax_file <- args[6]
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
29 # tax_file: full path (including file name and extension) of the tax file to write
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
30
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
31
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
32 # Functional mode
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
33 if (mode == "f") {
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
34 int <- read.delim(file = data, header = TRUE, sep = "\t") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
35 filter(!grepl(".+g__.+", X..Gene.Family)) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
36 separate(col = X..Gene.Family, into = c("id", "Extra"), sep = ": ", fill = "left") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
37 separate(col = Extra, into = c("namespace", "name"), sep = " ", fill = "left", extra = "merge") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
38 mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
39 filter(namespace %in% ontology) %>%
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
40 select(id, name, namespace, 4:ncol(.))
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
41 func <- int %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
42 select(id) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
43 mutate(gos = id)
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
44 write.table(x = int, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE)
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
45 write.table(x = func, file = func_file, quote = FALSE, sep = "\t", row.names = FALSE)
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
46 }
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
47
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
48 # Taxonomic mode
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
49 if (mode == "t") {
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
50 files <- dir(path = data)
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
51 int <- tibble(filename = files) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
52 mutate(file_contents = map(filename, ~read.delim(file = file.path(data, .), header = TRUE, sep = "\t"))) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
53 unnest(cols = c(file_contents)) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
54 rename(sample = filename) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
55 separate(col = sample, into = c("sample", NA), sep = ".tsv") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
56 pivot_wider(names_from = sample, values_from = abundance) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
57 mutate(rank = "genus") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
58 rename(name = genus) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
59 mutate(name = as.character(name)) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
60 mutate(id = get_uid(name, key = NULL, messages = FALSE)) %>%
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
61 select(id, name, rank, 2:ncol(.))
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
62 tax <- int %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
63 select(id) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
64 mutate(tax = id)
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
65 write.table(x = int, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE)
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
66 write.table(x = tax, file = tax_file, quote = FALSE, sep = "\t", row.names = FALSE)
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
67 }
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
68
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
69 # Function-taxonomy mode
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
70 if (mode == "ft") {
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
71 ft <- read.delim(file = data, header = TRUE, sep = "\t") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
72 filter(grepl(".+g__.+", X..Gene.Family)) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
73 separate(col = X..Gene.Family, into = c("id", "Extra"), sep = ": ", fill = "left") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
74 separate(col = Extra, into = c("namespace", "name"), sep = " ", fill = "left", extra = "merge") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
75 separate(col = name, into = c("name", "taxa"), sep = "\\|", extra = "merge") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
76 separate(col = taxa, into = c("Extra", "genus", "species"), sep = "__") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
77 select(-"Extra") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
78 mutate_if(is.character, str_replace_all, pattern = "\\.s", replacement = "") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
79 mutate_at(c("species"), str_replace_all, pattern = "_", replacement = " ") %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
80 mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>%
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
81 filter(namespace %in% ontology) %>%
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
82 select(id, name, namespace, 4:ncol(.))
2
9c8e7137d331 "planemo upload commit 59afcdaf7afdf574c475f0faae73127f0e563328"
galaxyp
parents: 1
diff changeset
83 write.table(x = ft, file = int_file, quote = FALSE, sep = "\t", row.names = FALSE)
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
84 }