annotate text_to_wordmatrix.R @ 0:02e46a96e98a draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author iuc
date Wed, 24 Mar 2021 08:34:22 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
1 #!/usr/bin/env Rscript
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
2 # tool: text_to_wordmatrix
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
3 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
4 #The tool extracts the most frequent words per entity (per row). Text of columns starting with "ABSTRACT" or "TEXT" are considered.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
5 #All extracted terms are used to generate a word matrix with rows = entities and columns = extracted words.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
6 #The resulting matrix is binary with 0= word not present in abstracts of entity and 1= word present in abstracts of entity.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
7 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
8 #Input: Output of "pubmed_by_queries" or "abstracts_by_pmids", or tab-delimited table with entities in column called “ID_<name>”,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
9 #e.g. “ID_genes” and text in columns starting with "ABSTRACT" or "TEXT".
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
10 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
11 #Output: Binary matrix with rows = entities and columns = extracted words.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
12 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
13 #usage: text_to_wordmatrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] [-r] [-l] [-w] [-s] [-p]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
14 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
15 # optional arguments:
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
16 # -h, --help show help message
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
17 # -i INPUT, --input INPUT input file name. add path if file is not in working directory
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
18 # -o OUTPUT, --output OUTPUT output file name. [default "text_to_wordmatrix_output"]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
19 # -n NUMBER, --number NUMBER number of most frequent words that should be extracted [default "50"]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
20 # -r, --remove_num remove any numbers in text
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
21 # -l, --lower_case by default all characters are translated to lower case. otherwise use -l
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
22 # -w, --remove_stopwords by default a set of english stopwords (e.g., "the" or "not") are removed. otherwise use -w
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
23 # -s, --stemDoc apply Porter"s stemming algorithm: collapsing words to a common root to aid comparison of vocabulary
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
24 # -p, --plurals by default words in plural and singular are merged to the singular form. otherwise use -p
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
25
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
26 if ("--install_packages" %in% commandArgs()) {
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
27 print("Installing packages")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
28 if (!require("argparse")) install.packages("argparse", repo = "http://cran.rstudio.com/");
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
29 if (!require("PubMedWordcloud")) install.packages("PubMedWordcloud", repo = "http://cran.rstudio.com/");
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
30 if (!require("SnowballC")) install.packages("SnowballC", repo = "http://cran.rstudio.com/");
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
31 if (!require("textclean")) install.packages("textclean", repo = "http://cran.rstudio.com/");
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
32 if (!require("SemNetCleaner")) install.packages("SemNetCleaner", repo = "http://cran.rstudio.com/");
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
33 if (!require("stringi")) install.packages("stringi", repo = "http://cran.rstudio.com/");
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
34 if (!require("stringr")) install.packages("stringr", repo = "http://cran.rstudio.com/");
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
35 }
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
36
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
37 suppressPackageStartupMessages(library("argparse"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
38 suppressPackageStartupMessages(library("PubMedWordcloud"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
39 suppressPackageStartupMessages(library("SnowballC"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
40 suppressPackageStartupMessages(library("SemNetCleaner"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
41 suppressPackageStartupMessages(library("textclean"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
42 suppressPackageStartupMessages(library("stringi"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
43 suppressPackageStartupMessages(library("stringr"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
44
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
45 parser <- ArgumentParser()
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
46 parser$add_argument("-i", "--input",
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
47 help = "input fie name. add path if file is not in workind directory")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
48 parser$add_argument("-o", "--output", default = "text_to_wordmatrix_output",
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
49 help = "output file name. [default \"%(default)s\"]")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
50 parser$add_argument("-n", "--number", type = "integer", default = 50, choices = seq(1, 500), metavar = "{0..500}",
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
51 help = "number of most frequent words used per ID in word matrix [default \"%(default)s\"]")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
52 parser$add_argument("-r", "--remove_num", action = "store_true", default = FALSE,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
53 help = "remove any numbers in text")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
54 parser$add_argument("-l", "--lower_case", action = "store_false", default = TRUE,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
55 help = "by default all characters are translated to lower case. otherwise use -l")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
56 parser$add_argument("-w", "--remove_stopwords", action = "store_false", default = TRUE,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
57 help = "by default a set of English stopwords (e.g., 'the' or 'not') are removed. otherwise use -s")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
58 parser$add_argument("-s", "--stemDoc", action = "store_true", default = FALSE,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
59 help = "apply Porter's stemming algorithm: collapsing words to a common root to aid comparison of vocabulary")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
60 parser$add_argument("-p", "--plurals", action = "store_false", default = TRUE,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
61 help = "by default words in plural and singular are merged to the singular form. otherwise use -p")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
62 parser$add_argument("--install_packages", action = "store_true", default = FALSE,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
63 help = "If you want to auto install missing required packages.")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
64
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
65 args <- parser$parse_args()
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
66
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
67
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
68 data <- read.delim(args$input, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
69 word_matrix <- data.frame()
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
70
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
71 text_cols_index <- grep(c("ABSTRACT|TEXT"), names(data))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
72
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
73 for (row in seq(nrow(data))) {
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
74 top_words <- cleanAbstracts(abstracts = data[row, text_cols_index],
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
75 rmNum = args$remove_num,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
76 tolw = args$lower_case,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
77 rmWords = args$remove_stopwords,
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
78 stemDoc = args$stemDoc)
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
79
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
80 top_words$word <- as.character(top_words$word)
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
81
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
82 cat("Most frequent words for row", row, " are extracted.", "\n")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
83
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
84 if (args$plurals == TRUE) {
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
85 top_words$word <- sapply(top_words$word, function(x) {
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
86 singularize(x)
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
87 })
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
88 top_words <- aggregate(freq~word, top_words, sum)
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
89 }
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
90
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
91 top_words <- top_words[order(top_words$freq, decreasing = TRUE), ]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
92 top_words$word <- as.character(top_words$word)
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
93
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
94 number_extract <- min(args$number, nrow(top_words))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
95 word_matrix[row, sapply(1:number_extract, function(x) {
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
96 paste0(top_words$word[x])
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
97 })] <- top_words$freq[1:number_extract]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
98 }
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
99
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
100 word_matrix <- as.matrix(word_matrix)
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
101 word_matrix[is.na(word_matrix)] <- 0
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
102 word_matrix <- (word_matrix > 0) * 1 #binary matrix
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
103
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
104 cat("A matrix with ", nrow(word_matrix), " rows and ", ncol(word_matrix), "columns is generated.", "\n")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
105
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
106 write.table(word_matrix, args$output, row.names = FALSE, sep = "\t", quote = FALSE)