annotate MT2MQ.R @ 1:e50ec3a9a3f9 draft

"planemo upload commit c29eec0f6f72e212c6afa82bd0bc052b8f26f1ab"
author galaxyp
date Fri, 26 Jun 2020 11:15:17 -0400
parents 6bee94458567
children 9c8e7137d331
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
1 # MT2MQ: prepares metatranscriptomic outputs from ASaiM (HUMAnN2 and metaphlan) for metaquantome
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
2
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
3 # Load libraries
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
4 suppressPackageStartupMessages(library(tidyverse))
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
5 #default_locale()
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
6
6bee94458567 "planemo upload commit 53bcf55b73cb251446150026242b4d47d49d3469"
galaxyp
parents:
diff changeset
# Set parameters from arguments
#
# Usage: Rscript MT2MQ.R <data> <mode> <ontology> <outfile>
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message instead of letting NA values propagate
# into the mode checks and the final write.
if (length(args) < 4) {
  stop(
    "Expected 4 arguments (data, mode, ontology, outfile) but received ",
    length(args),
    call. = FALSE
  )
}

# data: full path to file or directory:
#   - if in functional or f-t mode, should be a tsv file of HUMAnN2 gene
#     families, after regrouping and renaming to GO, joining samples, and
#     renormalizing to CPM.
#   - if in taxonomic mode, should be a directory of tsv files of metaphlan
#     genus-level results
data <- args[1]

# mode:
#   - "f": function
#   - "t": taxonomy
#   - "ft": function-taxonomy
mode <- args[2]
if (!mode %in% c("f", "t", "ft")) {
  # Without this check no branch below would run and `out` would be undefined
  # when the result is written.
  stop(
    "Unknown mode '", mode, "': expected \"f\", \"t\" or \"ft\"",
    call. = FALSE
  )
}

# ontology: only for function or f-t mode. A string of the GO namespace(s) to
# include, separated by commas.
#   ex: to include all: "molecular_function,biological_process,cellular_component"
ontology <- unlist(strsplit(args[3], split = ","))

# outfile: full path with pathname and extension for output
outfile <- args[4]

# Functional mode
#
# Reads the table at `data`, keeps only community-level (unstratified) rows,
# splits the "# Gene Family" column ("<id>: [NS] <name>") into id / namespace /
# name, keeps the namespaces listed in `ontology`, and stores the result in
# `out` with columns: id, name, namespace, then every remaining column.
if (mode == "f") {
  out <- read.delim(file = data, header = TRUE, sep = "\t") %>%
    # Every stratified row carries a "|<taxon>" suffix. Dropping on "|" also
    # removes the "|unclassified" strata, which a "g__"-only test lets through
    # as duplicate ids.
    filter(!grepl("|", X..Gene.Family, fixed = TRUE)) %>%
    # extra = "merge": split on the first ": " only, so a name that itself
    # contains ": " is kept whole instead of being truncated.
    separate(
      col = X..Gene.Family,
      into = c("id", "Extra"),
      sep = ": ",
      fill = "left",
      extra = "merge"
    ) %>%
    separate(
      col = Extra,
      into = c("namespace", "name"),
      sep = " ",
      fill = "left",
      extra = "merge"
    ) %>%
    # Map each tag explicitly; anything unrecognised becomes NA and is removed
    # by the ontology filter rather than being labelled cellular_component.
    mutate(namespace = case_when(
      namespace == "[MF]" ~ "molecular_function",
      namespace == "[BP]" ~ "biological_process",
      namespace == "[CC]" ~ "cellular_component",
      TRUE ~ NA_character_
    )) %>%
    filter(namespace %in% ontology) %>%
    select(id, name, namespace, everything())
}

# Taxonomic mode
#
# Reads every file in the directory `data` as a tab-separated table, stacks
# them with the file name (minus a trailing ".tsv") as the sample label, and
# spreads the `abundance` column into one column per sample. The result is
# stored in `out` with columns: id, name, rank, then one column per sample.
if (mode == "t") {
  files <- dir(path = data)
  if (length(files) == 0) {
    stop("No input files found in directory: ", data, call. = FALSE)
  }

  out <- tibble(filename = files) %>%
    mutate(
      file_contents = map(
        filename,
        ~ read.delim(file = file.path(data, .), header = TRUE, sep = "\t")
      )
    ) %>%
    unnest(cols = c(file_contents)) %>%
    rename(sample = filename) %>%
    # Strip only a literal ".tsv" extension at the end of the name; an
    # unescaped ".tsv" pattern would also match e.g. "_tsv" inside the name.
    mutate(sample = sub("\\.tsv$", "", sample)) %>%
    pivot_wider(names_from = sample, values_from = abundance) %>%
    mutate(rank = "genus") %>%
    rename(name = genus) %>%
    # filler for taxon id but should eventually find a way to get id from
    # ncbi database
    mutate(id = row_number(name)) %>%
    select(id, name, rank, everything())
}

# Function-taxonomy mode
#
# Reads the table at `data`, keeps only rows stratified by a genus ("g__"),
# and splits the "# Gene Family" column
# ("<id>: [NS] <name>|g__<genus>.s__<species>") into id / namespace / name /
# genus / species. Rows whose namespace is listed in `ontology` are stored in
# `out` with columns: id, name, namespace, then every remaining column (genus,
# species and the columns read from the file).
if (mode == "ft") {
  out <- read.delim(file = data, header = TRUE, sep = "\t") %>%
    filter(grepl(".+g__.+", X..Gene.Family)) %>%
    # extra = "merge": split on the first ": " only, so a name that itself
    # contains ": " is kept whole instead of being truncated.
    separate(
      col = X..Gene.Family,
      into = c("id", "Extra"),
      sep = ": ",
      fill = "left",
      extra = "merge"
    ) %>%
    separate(
      col = Extra,
      into = c("namespace", "name"),
      sep = " ",
      fill = "left",
      extra = "merge"
    ) %>%
    separate(
      col = name,
      into = c("name", "taxa"),
      sep = "\\|",
      extra = "merge"
    ) %>%
    # "g__<genus>.s__<species>" splits on "__" into "g", "<genus>.s", "<species>"
    separate(col = taxa, into = c("Extra", "genus", "species"), sep = "__") %>%
    select(-"Extra") %>%
    mutate(
      # Remove only the ".s" left at the end of the genus by the split above;
      # other character columns (id, name, species) are left untouched.
      genus = str_replace(genus, "\\.s$", ""),
      species = str_replace_all(species, "_", " "),
      # Map each tag explicitly; anything unrecognised becomes NA and is
      # removed by the ontology filter rather than being labelled
      # cellular_component.
      namespace = case_when(
        namespace == "[MF]" ~ "molecular_function",
        namespace == "[BP]" ~ "biological_process",
        namespace == "[CC]" ~ "cellular_component",
        TRUE ~ NA_character_
      )
    ) %>%
    filter(namespace %in% ontology) %>%
    select(id, name, namespace, everything())
}

# Write file
# Emit `out` to `outfile` as tab-separated text, without quoting and without
# row names.
write.table(
  out,
  file = outfile,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)