annotate pmids_to_pubtator_matrix.R @ 0:02e46a96e98a draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
author iuc
date Wed, 24 Mar 2021 08:34:22 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
1 #!/usr/bin/env Rscript
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
2 #tool: pmids_to_pubtator_matrix
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
3 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
7 #present in abstracts of that row. The user can decide if the extracted scientific terms should be extracted and used as
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID).
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
9 #Also, by default all terms are extracted; otherwise the user can specify a number of most frequent words to be extracted per row.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
10 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
13 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
14 #Output: Binary matrix in which each column represents one of the extracted terms.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
15 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
17 # [-c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
18 #
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
19 # optional arguments:
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
20 # -h, --help show help message
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
21 # -i INPUT, --input INPUT input file name. add path if file is not in working directory
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"]
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
26
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# Optional bootstrap: when the script is started with --install_packages,
# install every required package that is not yet available. The packages
# are attached afterwards by the regular library() calls.
if ("--install_packages" %in% commandArgs()) {
  print("Installing packages")
  for (pkg in c("argparse", "stringr", "RCurl", "stringi")) {
    # requireNamespace() only probes for availability; it neither attaches
    # the package nor warns when it is missing.
    if (!requireNamespace(pkg, quietly = TRUE)) {
      # `repos` is spelled out in full instead of relying on partial
      # argument matching of `repo`.
      install.packages(pkg, repos = "https://cran.rstudio.com/")
    }
  }
}
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
34
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
35 suppressPackageStartupMessages(library("argparse"))
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
36 library("stringr")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
37 library("RCurl")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
38 library("stringi")
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
39
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# Command-line interface. `args` is the parsed argument list used by the
# rest of the script (args$input, args$output, args$categories, args$byid,
# args$number, args$install_packages).
parser <- ArgumentParser()

parser$add_argument("-i", "--input",
                    help = "input file name. add path if file is not in working directory")
parser$add_argument("-o", "--output", default = "pmids_to_pubtator_matrix_output",
                    help = "output file name. [default \"%(default)s\"]")
parser$add_argument("-c", "--categories",
                    choices = c("Gene", "Disease", "Mutation", "Chemical", "Species"),
                    nargs = "+",
                    default = c("Gene", "Disease", "Mutation", "Chemical"),
                    help = "Pubtator categories that should be considered. [default \"%(default)s\"]")
parser$add_argument("-b", "--byid", action = "store_true", default = FALSE,
                    help = "If you want to find common gene IDs / mesh IDs instead of scientific terms.")
parser$add_argument("-n", "--number", default = NULL, type = "integer",
                    help = "Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.")
# The flag itself is acted upon before parsing (via commandArgs()); it is
# declared here so the parser accepts it.
parser$add_argument("--install_packages", action = "store_true", default = FALSE,
                    help = "If you want to auto install missing required packages.")

args <- parser$parse_args()
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
57
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
58
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# Load the tab-separated input table; the first row supplies column names.
data <- read.delim(
  args$input,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Positions of every column whose name contains "PMID".
pmid_cols_index <- which(grepl("PMID", names(data)))

# Script-level state initialised empty / zero here.
# NOTE(review): where these are filled is outside this part of the file.
word_matrix <- data.frame()
dict_table <- data.frame()
pmids_count <- 0

# Chunk size used by get_pubtator_terms() when splitting PMIDs into
# separate PubTator requests.
pubtator_max_ids <- 100
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
66
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
67
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# Parse one raw PubTator export and append its annotation rows to `table`.
#
# out_data: character string with the response text; lines are separated
#           by "\n" and annotation fields by "\t".
# table:    matrix of rows collected so far, or NULL if there are none yet.
#
# Returns `table` extended by one 6-column character row per annotation
# line. Lines with exactly 5 fields get NA appended as 6th field; lines with
# any other field count are ignored. `table` is returned unchanged when the
# response contains no usable line.
merge_pubtator_table <- function(out_data, table) {
  lines <- unlist(strsplit(out_data, "\n", fixed = TRUE))
  # The first two lines are never parsed (presumably the title and abstract
  # lines of the export). Filtering by position instead of iterating over
  # 3:length(lines) avoids counting backwards on responses shorter than
  # three lines.
  lines <- lines[seq_along(lines) > 2]

  fields <- strsplit(lines, "\t", fixed = TRUE)
  n_fields <- lengths(fields)
  rows <- lapply(fields[n_fields == 5 | n_fields == 6], function(x) {
    if (length(x) == 5) c(x, NA) else x
  })

  if (length(rows) == 0) {
    return(table)
  }
  # Bind all new rows in a single step rather than growing the matrix
  # row by row.
  rbind(table, do.call(rbind, rows))
}
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
81
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
82
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# Download PubTator annotations for a vector of PMIDs.
#
# pmids: vector of PubMed IDs. They are requested in chunks of at most
#        `pubtator_max_ids` IDs per call.
#
# Returns the annotation matrix built by merge_pubtator_table() over all
# chunks, or NULL if nothing was collected. If a single chunk cannot be
# fetched within 3 minutes, a message is emitted and whatever has been
# collected so far is returned.
get_pubtator_terms <- function(pmids) {
  base_url <- "https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids="
  table <- NULL
  chunks <- split(pmids, ceiling(seq_along(pmids) / pubtator_max_ids))
  for (pmid_split in chunks) {
    out_data <- NULL
    try_num <- 1
    t_0 <- Sys.time()
    # Retry until the request yields a response or the time budget is spent.
    while (is.null(out_data)) {
      if (try_num > 1) {
        cat("Connection problem. Please wait. Try number:", try_num, "\n")
        # Wait longer after each failed attempt.
        Sys.sleep(time = 2 * try_num)
      }
      try_num <- try_num + 1
      # Timing check: give up on this chunk after 3 minutes.
      if (as.numeric(difftime(Sys.time(), t_0, units = "mins")) > 3) {
        message("Killing the request! Something is not working. Please, try again later", "\n")
        return(table)
      }
      out_data <- tryCatch(
        getURL(paste0(base_url, paste(pmid_split, collapse = ","))),
        error = function(e) {
          print(e)
          # Signal failure through the return value. Calling `next` here
          # would run inside the handler function, which has no loop of
          # its own, and abort instead of retrying.
          NULL
        }
      )
    }
    table <- merge_pubtator_table(out_data, table)
  }
  table
}
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
118
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# Reduce a PubTator annotation table to the rows of the requested categories.
#
# table:      6-column matrix or data frame as produced by
#             get_pubtator_terms(), or NULL. Column 5 is compared against
#             `categories`; columns 4 and 6 are kept (presumably the
#             mention text and its gene/MeSH identifier -- confirm against
#             the PubTator export format).
# categories: vector or list of category names to keep.
#
# Returns a two-column data frame (original columns 4 and 6) ordered by
# category in the order given, without rows where either value is missing
# or the literal string "NA". Returns NULL when `table` is NULL or does not
# have exactly 6 columns.
extract_category_terms <- function(table, categories) {
  # ncol(NULL) is NULL, which cannot be used as an `if` condition, so the
  # NULL case is handled explicitly.
  if (is.null(table) || ncol(table) != 6) {
    return(NULL)
  }
  categories <- as.character(unlist(categories))
  types <- as.character(table[, 5])
  # Row indices grouped per category, preserving the order of `categories`.
  index_categories <- unlist(lapply(categories, function(category) {
    which(types == category)
  }))

  table <- as.data.frame(table, stringsAsFactors = FALSE)
  table <- table[index_categories, c(4, 6)]
  # %in% never yields NA, so the filter cannot introduce all-NA rows.
  keep <- !(table[, 2] %in% c(NA, "NA")) & !(table[, 1] %in% c(NA, "NA"))
  table[keep, ]
}
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
138
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# Reduce a PubTator term table to its unique (or most frequent) entries.
#
# Arguments:
#   table  - data frame with the term text in column 1 and the term ID in
#            column 2, or NULL when no annotations were found.
#   byid   - if TRUE, entries are counted / de-duplicated by ID (column 2);
#            otherwise by lower-cased term text (column 1).
#            Defaults to the command line option args$byid.
#   number - optional maximum number of most frequent IDs / terms to keep;
#            NULL keeps all of them. Defaults to args$number.
#
# Returns:
#   NULL if `table` is NULL. With byid = TRUE, a subset of the rows of `table`
#   with one row per kept ID. With byid = FALSE and `number` set, a two-column
#   frequency table (term, Freq) sorted by decreasing frequency. With
#   byid = FALSE and no `number`, `table` with lower-cased, unique terms.
extract_frequent_ids_or_terms <- function(table,
                                          byid = args$byid,
                                          number = args$number) {
  if (is.null(table)) {
    return(NULL)
  }
  # Nothing to count or de-duplicate; returning early also avoids indexing
  # with 1:0, which would create a spurious NA row.
  if (nrow(table) == 0) {
    return(table)
  }

  if (byid) {
    if (!is.null(number)) {
      # Count how often each ID occurs and keep the `number` most frequent
      # ones. The frequency table (not the annotation table) has to be
      # sorted and truncated here.
      id_counts <- as.data.frame(table(table[, 2]))
      colnames(id_counts)[1] <- "mesh_id"
      id_counts <- id_counts[order(id_counts$Freq, decreasing = TRUE), ]
      id_counts <- id_counts[seq_len(min(number, nrow(id_counts))), ]
      top_ids <- as.character(id_counts$mesh_id)
      # subset the annotation table to the top IDs
      table <- table[as.character(table[, 2]) %in% top_ids, ]
    }
    # keep one representative row (first occurrence) per ID
    table <- table[!duplicated(table[, 2]), ]
  } else {
    # terms are compared case-insensitively
    table[, 1] <- tolower(as.character(table[, 1]))
    if (!is.null(number)) {
      # replace the annotation table by a (term, Freq) table of the
      # `number` most frequent terms
      table <- as.data.frame(table(table[, 1]))
      colnames(table)[1] <- "term"
      table <- table[order(table$Freq, decreasing = TRUE), ]
      table <- table[seq_len(min(number, nrow(table))), ]
      table$term <- as.character(table$term)
    } else {
      table <- table[!duplicated(table[, 1]), ]
    }
  }
  table
}
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
173
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
174
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
# For all PMIDs of a row get the PubTator terms and add them to the matrix.
# Row i of `word_matrix` corresponds to row i of `data`; a cell is set to 1
# when the term (or ID, with --byid) was found for any PMID of that row.
# seq_len() is used so that an input without rows is skipped instead of
# iterating over c(1, 0).
for (i in seq_len(nrow(data))) {
  pmids <- as.character(data[i, pmid_cols_index])
  # Drop missing PMIDs: both real NA values (single PMID column) and the
  # string "NA" produced when a multi-column row is converted to character.
  pmids <- pmids[!is.na(pmids) & pmids != "NA"]
  # pause after more than 10000 requested PMIDs before sending more requests
  if (pmids_count > 10000) {
    cat("Break (10s) to avoid killing of requests. Please wait.", "\n")
    Sys.sleep(10)
    pmids_count <- 0
  }
  pmids_count <- pmids_count + length(pmids)
  # get PubTator terms and process them with functions
  if (length(pmids) > 0) {
    table <- get_pubtator_terms(pmids)
    table <- extract_category_terms(table, args$categories)
    table <- extract_frequent_ids_or_terms(table)
    if (!is.null(table)) {
      colnames(table) <- c("term", "mesh_id")
      # add data in binary matrix
      if (args$byid) {
        mesh_ids <- as.character(table$mesh_id)
        if (length(mesh_ids) > 0) {
          word_matrix[i, mesh_ids] <- 1
          cat(length(mesh_ids), " IDs for PMIDs of row", i, " were added", "\n")
          # add data in dictionary (one entry per ID, first term seen wins)
          dict_table <- rbind(dict_table, table)
          dict_table <- dict_table[!duplicated(as.character(dict_table[, 2])), ]
        }
      } else {
        terms <- as.character(table[, 1])
        if (length(terms) > 0) {
          word_matrix[i, terms] <- 1
          cat(length(terms), " terms for PMIDs of row", i, " were added.", "\n")
        }
      }
    }
  } else {
    cat("No terms for PMIDs of row", i, " were found.", "\n")
  }
}
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
214
02e46a96e98a "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/simtext commit 63a5e13cf89cdd209d20749c582ec5b8dde4e208"
iuc
parents:
diff changeset
if (args$byid) {
  # change column names of matrix: exchange mesh ids/ids with term
  index_names <- match(names(word_matrix), as.character(dict_table[[2]]))
  names(word_matrix) <- dict_table[index_names, 1]
}

# remove non-printable characters and double quotes from the column names
colnames(word_matrix) <- gsub("[^[:print:]]", "", colnames(word_matrix))
colnames(word_matrix) <- gsub('\"', "", colnames(word_matrix), fixed = TRUE)

# Cells that were never set are NA. Turn them into 0 before merging, because
# colSums() would otherwise return NA for a merged cell as soon as one of the
# duplicated columns is NA, and a present term would be lost.
word_matrix[is.na(word_matrix)] <- 0

# merge duplicated columns (columns that ended up with the same name)
word_matrix <- as.data.frame(do.call(cbind, by(t(word_matrix), INDICES = names(word_matrix), FUN = colSums)))

# save binary matrix
word_matrix <- as.matrix(word_matrix)
word_matrix[is.na(word_matrix)] <- 0
# a term present in several merged columns still counts as "present" (1)
word_matrix[word_matrix > 1] <- 1
cat("Matrix with ", nrow(word_matrix), " rows and ", ncol(word_matrix), " columns generated.", "\n")
write.table(word_matrix, args$output, row.names = FALSE, sep = "\t", quote = FALSE)