Mercurial > repos > dlalgroup > pmids_to_pubtator_matrix
annotate pmids_to_pubtator_matrix.R @ 2:ada4c7b3fc39 draft default tip
"planemo upload for repository https://github.com/dlal-group/simtext commit 63f67ee02be2eb4323a5ba5dcdd33d1fd0b7c24e"
author | dlalgroup |
---|---|
date | Thu, 08 Oct 2020 05:39:58 +0000 |
parents | 3f4adc85ba5d |
children |
rev | line source |
---|---|
0
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
1 #!/usr/bin/env Rscript |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
2 #tool: pmids_to_pubtator_matrix |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
3 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
4 #The tool uses all PMIDs per row and extracts "Gene", "Disease", "Mutation", "Chemical" and "Species" terms of the |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
5 #corresponding abstracts, using PubTator annotations. The user can choose from which categories terms should be extracted. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
6 #The extracted terms are united in one large binary matrix, with 0= term not present in abstracts of that row and 1= term |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
7 #present in abstracts of that row. The user can decide if the extracted scientific terms should be extracted and used as |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
8 #they are or if they should be grouped by their geneIDs/ meshIDs (several terms can often be grouped into one ID). |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
9 #Also, by default all terms are extracted; otherwise the user can specify a number of most frequent words to be extracted per row. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
10 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
11 #Input: Output of abstracts_by_pmids or tab-delimited table with columns containing PMIDs. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
12 #The names of the PMID columns should start with "PMID", e.g. "PMID_1", "PMID_2" etc. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
13 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
14 #Output: Binary matrix in which each column represents one of the extracted terms. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
15 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
16 # usage: $ pmids_to_pubtator_matrix.R [-h] [-i INPUT] [-o OUTPUT] [-n NUMBER] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
17 # [-c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...]] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
18 # |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
19 # optional arguments: |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
20 # -h, --help show help message |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
21 # -i INPUT, --input INPUT input file name. add path if file is not in working directory |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
22 # -n NUMBER, --number NUMBER Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted. |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
23 # -o OUTPUT, --output OUTPUT output file name. [default "pmids_to_pubtator_matrix_output"] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
24 # -c {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...], --categories {Gene,Disease,Mutation,Chemical,Species} [{Gene,Disease,Mutation,Chemical,Species} ...] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
25 # Pubtator categories that should be considered. [default "('Gene', 'Disease', 'Mutation','Chemical')"] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
26 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
27 if ( '--install_packages' %in% commandArgs()) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
28 print('Installing packages') |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
29 if (!require('argparse')) install.packages('argparse',repo="http://cran.rstudio.com/"); |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
30 if (!require('stringr')) install.packages('stringr',repo="http://cran.rstudio.com/"); |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
31 if (!require('RCurl')) install.packages('RCurl',repo="http://cran.rstudio.com/"); |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
32 if (!require('stringi')) install.packages('stringi',repo="http://cran.rstudio.com/"); |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
33 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
34 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
35 suppressPackageStartupMessages(library("argparse")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
36 library('stringr') |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
37 library('stringi') |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
38 library('RCurl') |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
39 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
40 parser <- ArgumentParser() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
41 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
42 parser$add_argument("-i", "--input", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
43 help = "input fie name. add path if file is not in workind directory") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
44 parser$add_argument("-o", "--output", default="pmids_to_pubtator_matrix_output", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
45 help = "output file name. [default \"%(default)s\"]") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
46 parser$add_argument("-c", "--categories", choices=c("Gene", "Disease", "Mutation", "Chemical", "Species"), nargs="+", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
47 default= c("Gene", "Disease", "Mutation", "Chemical"), |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
48 help = "Pubtator categories that should be considered. [default \"%(default)s\"]") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
49 parser$add_argument("-b", "--byid", action="store_true", default=FALSE, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
50 help="If you want to find common gene IDs / mesh IDs instead of scientific terms.") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
51 parser$add_argument("-n", "--number", default=NULL, type="integer", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
52 help="Number of most frequent terms/IDs to extract. By default all terms/IDs are extracted.") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
53 parser$add_argument("--install_packages", action="store_true", default=FALSE, |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
54 help="If you want to auto install missing required packages.") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
55 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
56 args <- parser$parse_args() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
57 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
58 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
59 data = read.delim(args$input, stringsAsFactors=FALSE, header = TRUE, sep='\t') |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
60 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
61 pmid_cols_index <- grep(c("PMID"), names(data)) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
62 word_matrix = data.frame() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
63 dict.table = data.frame() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
64 pmids_count <- 0 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
65 pubtator_max_ids = 100 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
66 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
67 get_pubtator_terms = function(pmids, categories){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
68 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
69 table = NULL |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
70 for (pmid_split in split(pmids, ceiling(seq_along(pmids)/pubtator_max_ids))){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
71 out.data = NULL |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
72 try_num <- 1 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
73 t_0 <- Sys.time() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
74 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
75 while(TRUE) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
76 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
77 # Timing check: kill at 3 min |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
78 if (try_num > 1){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
79 cat("Connection problem. Please wait. Try number:",try_num,"\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
80 Sys.sleep(time = 2*try_num) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
81 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
82 try_num <- try_num + 1 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
83 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
84 t_1 <- Sys.time() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
85 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
86 if(as.numeric(difftime(t_1, t_0, units = "mins")) > 3){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
87 message("Killing the request! Something is not working. Please, try again later","\n") |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
88 return(table) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
89 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
90 out.data <- tryCatch({ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
91 getURL(paste("https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/pubtator?pmids=", |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
92 paste(pmid_split, collapse=","), sep = "")) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
93 }, error = function(e) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
94 print(e) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
95 next |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
96 }, finally = { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
97 Sys.sleep(0) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
98 }) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
99 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
100 if(!is.null(out.data)){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
101 out.data = unlist(strsplit(out.data, "\n", fixed = T)) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
102 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
103 # skip first few lines, is this needed? |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
104 for (i in 3:length(out.data)) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
105 temps = unlist(strsplit(out.data[i], "\t", fixed = T)) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
106 if (length(temps) == 5) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
107 # make 5 be 6 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
108 temps = c(temps, NA) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
109 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
110 if (length(temps) == 6) { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
111 # keep only 6 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
112 table = rbind(table, temps) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
113 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
114 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
115 break |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
116 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
117 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
118 } #end while loop |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
119 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
120 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
121 index.categories = c() |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
122 categories = as.character(unlist(categories)) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
123 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
124 if(ncol(table) == 6){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
125 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
126 for(i in categories){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
127 tmp.index = grep(TRUE, i == as.character(table[,5])) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
128 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
129 if(length(tmp.index) > 0){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
130 index.categories = c(index.categories,tmp.index) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
131 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
132 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
133 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
134 table = as.data.frame(table, stringsAsFactors=FALSE) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
135 table = table[index.categories,c(4,6)] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
136 table = table[!is.na(table[,2]),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
137 table = table[!(table[,2] == "NA"),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
138 table = table[!(table[,1] == "NA"),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
139 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
140 if(args$byid){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
141 if(!is.null(args$number)){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
142 #retrieve top X mesh.ids |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
143 table.mesh = as.data.frame(table(table[,2])) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
144 colnames(table.mesh)[1] = "mesh.id" |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
145 table = table[order(table.mesh$Freq, decreasing = TRUE),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
146 table = table[1:min(args$number, nrow(table.mesh)),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
147 table.mesh$mesh.id = as.character(table.mesh$mesh.id) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
148 #subset table for top X mesh.ids |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
149 table = table[which(as.character(table$V6) %in% as.character(table.mesh$mesh.id)),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
150 table = table[!duplicated(table[,2]),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
151 }else{ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
152 table = table[!duplicated(table[,2]),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
153 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
154 } else { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
155 if(!is.null(args$number)){ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
156 table[,1] = tolower(as.character(table[,1])) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
157 table = as.data.frame(table(table[,1])) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
158 colnames(table)[1] = "term" |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
159 table = table[order(table$Freq, decreasing = TRUE),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
160 table = table[1:min(args$number, nrow(table)),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
161 table$term = as.character(table$term) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
162 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
163 }else{ |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
164 table[,1] = tolower(as.character(table[,1])) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
165 table = table[!duplicated(table[,1]),] |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
166 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
167 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
168 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
169 return(table) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
170 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
171 } else { |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
172 return(NULL) |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
173 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
174 } |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
175 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
176 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
# For every input row: collect its PMIDs, fetch the PubTator annotations for
# them and flag the returned terms (or IDs) in the matrix.
# Reads `data`, `pmid_cols_index`, `args` and get_pubtator_terms(); updates
# `word_matrix`, `dict.table` and the request counter `pmids_count`.
# seq_len() keeps the loop from running on the indices 1 and 0 when `data`
# has no rows.
for (i in seq_len(nrow(data))) {

  pmids <- as.character(data[i, pmid_cols_index])
  # Drop missing PMIDs, whether they show up as a real NA or as the literal
  # string "NA"; comparing against "NA" alone lets real NA values through.
  pmids <- pmids[!is.na(pmids) & pmids != "NA"]

  # Pause once more than 10000 PMIDs were requested since the last break.
  if (pmids_count > 10000) {
    cat("Break (10s) to avoid killing of requests. Please wait.",'\n')
    Sys.sleep(10)
    pmids_count <- 0
  }

  pmids_count <- pmids_count + length(pmids)

  # get_pubtator_terms() returns NULL when it has nothing to return; a row
  # without any PMID is treated the same way.
  table <- NULL
  if (length(pmids) > 0) {
    table <- get_pubtator_terms(pmids, args$categories)
  }

  if (is.null(table)) {
    cat("No terms for PMIDs of row", i," were found.",'\n')
    next
  }

  colnames(table) <- c("term", "mesh.id")

  # Flag the retrieved entries in row i of the matrix.
  if (args$byid) {
    mesh.ids <- as.character(table$mesh.id)
    if (length(mesh.ids) > 0) {
      word_matrix[i, mesh.ids] <- 1
      cat(length(mesh.ids), " IDs for PMIDs of row", i," were added",'\n')
      # Extend the ID dictionary; only the first row seen per ID is kept.
      dict.table <- rbind(dict.table, table)
      dict.table <- dict.table[!duplicated(as.character(dict.table[, 2])), ]
    }
  } else {
    terms <- as.character(table[, 1])
    if (length(terms) > 0) {
      word_matrix[i, terms] <- 1
      cat(length(terms), " terms for PMIDs of row", i," were added.",'\n')
    }
  }
}
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
223 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
# In ID mode the matrix columns carry IDs: look every column name up in the
# second dictionary column and replace it with the entry of the first column.
if (args$byid) {
  id_keys <- as.character(dict.table[[2]])
  index_names <- match(names(word_matrix), id_keys)
  names(word_matrix) <- dict.table[index_names, 1]
}

# Clean the column names: strip non-printable characters first, then remove
# double quotes.
cleaned <- gsub("[^[:print:]]", "", colnames(word_matrix))
colnames(word_matrix) <- gsub('\"', "", cleaned, fixed = TRUE)
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
232 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
# Merge columns that share the same name into a single column.
# NA cells are counted as 0 while summing (they are set to 0 below anyway),
# so a 1 in one duplicate is not turned into NA by an NA in another.
word_matrix <- as.data.frame(do.call(cbind, by(
  t(word_matrix),
  INDICES = names(word_matrix),
  FUN = function(cols) colSums(cols, na.rm = TRUE)
)))

# Save the binary matrix: remaining NA cells become 0, and sums above 1 that
# result from merging duplicates are capped at 1 so the output stays 0/1.
word_matrix <- as.matrix(word_matrix)
word_matrix[is.na(word_matrix)] <- 0
word_matrix[word_matrix > 1] <- 1
cat("Matrix with ",nrow(word_matrix)," rows and ",ncol(word_matrix)," columns generated.","\n")
write.table(word_matrix, args$output, row.names = FALSE, sep = '\t')
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
242 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
243 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
244 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
245 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
246 |
3f4adc85ba5d
"planemo upload for repository https://github.com/dlal-group/simtext commit fd3f5b7b0506fbc460f2a281f694cb57f1c90a3c-dirty"
dlalgroup
parents:
diff
changeset
|
247 |