0
|
1 # MT for iMQ: prepares metatranscriptomic outputs from ASaiM (HUMAnN2 and metaphlan) for metaquantome
|
|
2
|
|
3 # Load libraries
|
|
4 library(tidyverse)
|
|
5 library(readr)
|
|
6
|
|
7 # Set parameters from arguments
|
|
8 args = commandArgs(trailingOnly = TRUE)
|
|
9 data <- args[1]
|
|
10 # data: full path to file or directory:
|
|
11 # - if in functional or f-t mode, should be a tsv file of HUMAnN2 gene families, after joining, renaming to GO, and renormalizing to CPM.
|
|
12 # - if in taxonomic mode, should be a directory of tsv files of metaphlan genus-level results
|
|
13 mode <- args[2]
|
|
14 # mode:
|
|
15 # -"f": function
|
|
16 # -"t": taxonomy
|
|
17 # -"ft": function-taxonomy
|
|
18 ontology <- unlist(strsplit(args[3], split = ","))
|
|
19 # ontology: only for function or f-t mode. A string of the GO namespace(s) to include, separated by spaces.
|
|
20 # ex: to include all: "molecular_function biological_process cellular_component"
|
|
21 outfile <- args[4]
|
|
22 # outfile: full path with pathname and extension for output
|
|
23
|
|
24 # Functional mode
|
|
25 if (mode == "f"){
|
|
26 out <- read_delim(data, "\t", escape_double = FALSE, trim_ws = TRUE) %>%
|
|
27 filter(!grepl(".+g__.+",`# Gene Family`)) %>%
|
|
28 separate(col=`# Gene Family`, into=c("id", "Extra"), sep=": ", fill="left") %>%
|
|
29 separate(col=Extra, into = c("namespace", "name"), sep = " ", fill="left", extra="merge") %>%
|
|
30 mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>%
|
|
31 filter(namespace %in% ontology) %>%
|
|
32 select(id, name, namespace, 4:ncol(.))
|
|
33 }
|
|
34
|
|
35 # Taxonomic mode
|
|
36 if (mode == "t"){
|
|
37 files <- dir(path = data)
|
|
38 out <- data_frame(filename = files) %>%
|
|
39 mutate(file_contents= map(filename, ~read_delim(file.path(data, .),delim = "\t"))) %>%
|
|
40 unnest %>%
|
|
41 rename(sample = filename) %>%
|
|
42 separate(col = sample, into = c("sample",NA), sep=".tsv") %>%
|
|
43 pivot_wider(names_from = sample, values_from = abundance) %>%
|
|
44 mutate(rank = "genus") %>%
|
|
45 rename(name = genus) %>%
|
|
46 mutate(id = row_number(name)) %>% # filler for taxon id but should eventually find a way to get id from database
|
|
47 select(id, name, rank, 2:ncol(.))
|
|
48 }
|
|
49
|
|
50 # Function-taxonomy mode
|
|
51 if (mode == "ft"){
|
|
52 out <- read_delim(data,"\t", escape_double = FALSE, trim_ws = TRUE) %>%
|
|
53 filter(grepl(".+g__.+",`# Gene Family`)) %>%
|
|
54 separate(col=`# Gene Family`, into=c("id", "Extra"), sep=": ", fill="left") %>%
|
|
55 separate(col=Extra, into = c("namespace", "name"), sep = " ", fill="left", extra="merge") %>%
|
|
56 separate(col = name, into = c("name", "taxa"), sep="\\|", extra = "merge") %>%
|
|
57 separate(col = taxa, into = c("Extra", "genus", "species"), sep = "__") %>% select(-"Extra") %>%
|
|
58 mutate_if(is.character, str_replace_all, pattern = "\\.s", replacement = "") %>%
|
|
59 mutate_at(c("species"), str_replace_all, pattern = "_", replacement = " ") %>%
|
|
60 mutate(namespace = if_else(namespace == "[MF]", true = "molecular_function", false = if_else(namespace == "[BP]", true = "biological_process", false = "cellular_component"))) %>%
|
|
61 filter(namespace %in% ontology) %>%
|
|
62 select(id, name, namespace, 4:ncol(.))
|
|
63 }
|
|
64
|
|
65 # Write file
|
|
66 write_delim(x = out, path = outfile, delim = "\t", quote_escape = FALSE)
|