Mercurial > repos > ecology > wormsmeasurements
comparison wormsmeasurements.R @ 1:6f75ab89587a draft default tip
planemo upload for repository https://github.com/jeanlecras/tools-ecology/tree/master/tools/WormsMeasurements commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
| author | ecology |
|---|---|
| date | Wed, 28 May 2025 10:13:42 +0000 |
| parents | 23b963a1284e |
| children |
comparison
equal
deleted
inserted
replaced
| 0:23b963a1284e | 1:6f75ab89587a |
|---|---|
| 11 args <- commandArgs(trailingOnly = TRUE) | 11 args <- commandArgs(trailingOnly = TRUE) |
| 12 if (length(args) == 0) { | 12 if (length(args) == 0) { |
| 13 stop("This tool needs at least one argument") | 13 stop("This tool needs at least one argument") |
| 14 } | 14 } |
| 15 | 15 |
| 16 occurrence <- read.csv(args[1], header=T, sep="\t") %>% arrange(scientificName) | 16 scientificName_name <- args[3] |
| 17 occurrence <- read.csv(args[1], header=T, sep="\t") %>% | |
| 18 arrange(.[[scientificName_name]]) | |
| 17 measurement_types <- unlist(str_split(args[2], ",")) | 19 measurement_types <- unlist(str_split(args[2], ",")) |
| 18 include_inherited <- ifelse(args[4]=="true", T, F) | 20 include_inherited <- ifelse(args[4]=="true", T, F) |
| 19 pivot_wider <- ifelse(args[5]=="true", T, F) | 21 pivot_wider <- ifelse(args[5]=="true", T, F) |
| 20 scientificName_name <- args[3] | 22 exclude_NA <- ifelse(args[6]=="true", T, F) |
| 23 | |
| 24 # regex to only keep genus and specific epithet from scientific names | |
| 25 regex_find <- "^([A-Z][^A-Z(]+)(.*)$" | |
| 26 regex_replace <- "\\1" | |
| 21 | 27 |
| 22 | 28 |
| 23 ### | 29 # function to extract the measurement values from the attributes data tibble |
| 24 extract_traits_values <- function(traits_data) { | 30 extract_traits_values <- function(traits_data) { |
| 25 result <- setNames(rep(NA, length(measurement_types)), measurement_types) | 31 result <- setNames(rep(NA, length(measurement_types)), measurement_types) |
| 26 | 32 |
| 27 if (is.null(traits_data) || nrow(traits_data) == 0) { | 33 if (is.null(traits_data) || nrow(traits_data) == 0) { |
| 28 return(result) | 34 return(result) |
| 40 result[traits_filtered$measurementType[i]] <- traits_filtered$measurementValue[i] | 46 result[traits_filtered$measurementType[i]] <- traits_filtered$measurementValue[i] |
| 41 } | 47 } |
| 42 return(result) | 48 return(result) |
| 43 } | 49 } |
| 44 | 50 |
| 51 # function to call the call the WoRMS API and get the measurement values | |
| 45 get_life_history_traits <- function(scientific_name) { | 52 get_life_history_traits <- function(scientific_name) { |
| 46 if (scientific_name %in% names(cache)) { | 53 clean_scientific_name <- trimws(gsub(regex_find, regex_replace, scientific_name)) |
| 47 return(cache[[scientific_name]]) | 54 |
| 55 if (clean_scientific_name %in% names(cache)) { | |
| 56 return(cache[[clean_scientific_name]]) | |
| 48 } | 57 } |
| 49 | 58 |
| 50 worms_id <- tryCatch( | 59 worms_id <- tryCatch( |
| 51 wm_name2id(name = scientific_name), | 60 wm_name2id(name = clean_scientific_name), |
| 52 error = function(e) NA | 61 error = function(e) NA |
| 53 ) | 62 ) |
| 54 | 63 |
| 55 if (is.na(worms_id) || length(worms_id) == 0) { | 64 if (is.na(worms_id) || length(worms_id) == 0) { |
| 56 cache[[scientific_name]] <<- NULL | 65 cache[[clean_scientific_name]] <<- NULL |
| 57 return(NULL) | 66 return(NULL) |
| 58 } | 67 } |
| 59 | 68 |
| 60 data_attr <- tryCatch( | 69 data_attr <- tryCatch( |
| 61 wm_attr_data(worms_id, include_inherited=include_inherited), | 70 wm_attr_data(worms_id, include_inherited=include_inherited), |
| 62 error = function(e) NULL | 71 error = function(e) NULL |
| 63 ) | 72 ) |
| 64 | 73 |
| 65 if (is.null(data_attr)) { | 74 if (is.null(data_attr)) { |
| 66 cache[[scientific_name]] <<- NULL | 75 cache[[clean_scientific_name]] <<- NULL |
| 67 return(NULL) | 76 return(NULL) |
| 68 } | 77 } |
| 69 | 78 |
| 70 traits <- extract_traits_values(data_attr) | 79 traits <- extract_traits_values(data_attr) |
| 71 cache[[scientific_name]] <<- traits | 80 cache[[clean_scientific_name]] <<- traits |
| 72 return(traits) | 81 return(traits) |
| 73 } | 82 } |
| 74 | 83 |
| 84 # a cache to limit API calls | |
| 75 cache <- list() | 85 cache <- list() |
| 76 | 86 |
| 87 # add a columns conataining the lists of values of the measurments requested | |
| 77 trait_data <- occurrence %>% | 88 trait_data <- occurrence %>% |
| 78 mutate(life_history_traits = map(.data[[scientificName_name]], ~ get_life_history_traits(.x))) | 89 mutate(life_history_traits = map(.data[[scientificName_name]], ~ get_life_history_traits(.x))) |
| 79 | 90 |
| 80 view(trait_data) | 91 # convert the column of lists to multiple columns of unique values |
| 81 trait_data <- trait_data %>% | 92 trait_data <- trait_data %>% |
| 82 unnest_wider(life_history_traits) | 93 unnest_wider(life_history_traits) |
| 83 | 94 |
| 84 if (pivot_wider) { | 95 # make sur each measurement type has a column |
| 85 trait_data <- dummy_cols(trait_data, select_columns = measurement_types, remove_selected_columns=T, ignore_na=T) | 96 for (col in measurement_types) { |
| 86 | 97 if (!(col %in% names(trait_data))) { |
| 98 trait_data[[col]] <- NA | |
| 99 } | |
| 87 } | 100 } |
| 88 | 101 |
| 102 # list of quantitativ measurements | |
| 103 numeric_cols <- c() | |
| 104 | |
| 105 # try to convert columns to numeric and remember them | |
| 106 trait_data <- trait_data %>% | |
| 107 mutate(across(all_of(measurement_types), ~ { | |
| 108 numeric_col <- suppressWarnings(as.numeric(.)) | |
| 109 if (all(is.na(.) == is.na(numeric_col))) { | |
| 110 numeric_cols <<- c(numeric_cols, cur_column()) | |
| 111 numeric_col | |
| 112 } else { | |
| 113 . | |
| 114 } | |
| 115 })) | |
| 116 | |
| 117 # filter NA but only in the added columns | |
| 118 if (exclude_NA) { | |
| 119 trait_data <- trait_data[complete.cases(trait_data[, measurement_types]),] | |
| 120 } | |
| 121 | |
| 122 # determine what are the qualitativ columns to be one hot encoded | |
| 123 factor_cols <- setdiff(measurement_types, numeric_cols) | |
| 124 | |
| 125 # one hot encode quantitativ columns | |
| 126 if (pivot_wider & length(factor_cols) > 0) { | |
| 127 trait_data <- dummy_cols(trait_data, select_columns = factor_cols, remove_selected_columns=T, ignore_na=T) | |
| 128 } | |
| 129 | |
| 130 # write the enriched dataset as tabular | |
| 89 write.table(trait_data, "enriched_data.tabular", sep="\t", row.names = FALSE) | 131 write.table(trait_data, "enriched_data.tabular", sep="\t", row.names = FALSE) |
