Mercurial > repos > mnhn65mo > regionalgam
comparison dennis_gam_initial_functions.R @ 0:5b126f770671 draft
Uploaded
author | mnhn65mo |
---|---|
date | Fri, 03 Aug 2018 08:28:22 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b126f770671 |
---|---|
1 ### R-Script Adapted from script provided by the CEH, UK BY: Reto Schmucki [ reto.schmucki@mail.mcgill.ca] | |
2 ### DATE: 14 July 2014 function to run two stage model in DENNIS et al. 2013 | |
3 | |
4 | |
5 .onAttach <- function(libname, pkgname) { | |
6 packageStartupMessage(" The regionalGAM package that is no longer maintained, \n use the new rbms package instead. \n | |
7 devtools::install_github(\"RetoSchmucki/rbms\", force=TRUE)") | |
8 } | |
9 | |
10 | |
11 #' year_day_func Function | |
12 #' This function generate the full sequence of days, months and include the observation to that file. | |
13 #' @param sp_data A data.frame with your observation. | |
14 #' @keywords year days | |
15 #' @export | |
16 #' @author Reto Schmucki | |
17 #' @examples | |
18 #' year_day_func() | |
19 | |
20 | |
21 # FUNCTIONS | |
22 | |
23 year_day_func = function(sp_data) { | |
24 | |
25 year <- unique(sp_data$YEAR) | |
26 | |
27 origin.d <- paste(year, "01-01", sep = "-") | |
28 if ((year%%4 == 0) & ((year%%100 != 0) | (year%%400 == 0))) { | |
29 nday <- 366 | |
30 } else { | |
31 nday <- 365 | |
32 } | |
33 | |
34 date.serie <- as.POSIXlt(seq(as.Date(origin.d), length = nday, by = "day"), format = "%Y-%m-%d") | |
35 | |
36 dayno <- as.numeric(julian(date.serie, origin = as.Date(origin.d)) + 1) | |
37 month <- as.numeric(strftime(date.serie, format = "%m")) | |
38 week <- as.numeric(strftime(date.serie, format = "%W")) | |
39 week_day <- as.numeric(strftime(date.serie, format = "%u")) | |
40 day <- as.numeric(strftime(date.serie, format = "%d")) | |
41 | |
42 site_list <- sp_data[!duplicated(sp_data$SITE), c("SITE")] | |
43 | |
44 all_day_site <- data.frame(SPECIES = sp_data$SPECIES[1], SITE = rep(site_list, rep(nday, length(site_list))), | |
45 YEAR = sp_data$YEAR[1], MONTH = month, WEEK = week, DAY = day, DAY_WEEK = week_day, DAYNO = dayno, | |
46 COUNT = NA) | |
47 | |
48 count_index <- match(paste(sp_data$SITE, sp_data$DAYNO, sep = "_"), paste(all_day_site$SITE, all_day_site$DAYNO, | |
49 sep = "_")) | |
50 all_day_site$COUNT[count_index] <- sp_data$COUNT | |
51 site_count_length <- aggregate(sp_data$COUNT, by = list(sp_data$SITE), function(x) list(1:length(x))) | |
52 names(site_count_length$x) <- as.character(site_count_length$Group.1) | |
53 site_countno <- utils::stack(site_count_length$x) | |
54 all_day_site$COUNTNO <- NA | |
55 all_day_site$COUNTNO[count_index] <- site_countno$values # add count number to ease extraction of single count | |
56 | |
57 # Add zero to close observation season two weeks before and after the first and last | |
58 first_obs <- min(all_day_site$DAYNO[!is.na(all_day_site$COUNT)]) | |
59 last_obs <- max(all_day_site$DAYNO[!is.na(all_day_site$COUNT)]) | |
60 | |
61 closing_season <- c((first_obs - 11):(first_obs - 7), (last_obs + 7):(last_obs + 11)) | |
62 | |
63 # If closing season is before day 1 or day 365, simply set the first and last 5 days to 0 | |
64 if (min(closing_season) < 1) | |
65 closing_season[1:5] <- c(1:5) | |
66 if (max(closing_season) > nday) | |
67 closing_season[6:10] <- c((nday - 4):nday) | |
68 | |
69 all_day_site$COUNT[all_day_site$DAYNO %in% closing_season] <- 0 | |
70 all_day_site$ANCHOR <- 0 | |
71 all_day_site$ANCHOR[all_day_site$DAYNO %in% closing_season] <- 1 | |
72 | |
73 all_day_site <- all_day_site[order(all_day_site$SITE, all_day_site$DAYNO), ] | |
74 | |
75 return(all_day_site) | |
76 } | |
77 | |
78 | |
79 #' trap_area Function | |
80 #' | |
81 #' This function compute the area under the curve using the trapezoid method. | |
82 #' @param x A vector or a two columns matrix. | |
83 #' @param y A vector, Default is NULL | |
84 #' @keywords trapezoid | |
85 #' @export | |
86 #' @examples | |
87 #' trap_area() | |
88 | |
89 | |
90 trap_area = function(x, y = NULL) { | |
91 # If y is null and x has multiple columns then set y to x[,2] and x to x[,1] | |
92 if (is.null(y)) { | |
93 if (length(dim(x)) == 2) { | |
94 y = x[, 2] | |
95 x = x[, 1] | |
96 } else { | |
97 stop("ERROR: need to either specifiy both x and y or supply a two column data.frame/matrix to x") | |
98 } | |
99 } | |
100 | |
101 # Check x and y are same length | |
102 if (length(x) != length(y)) { | |
103 stop("ERROR: x and y need to be the same length") | |
104 } | |
105 | |
106 # Need to exclude any pairs that are NA for either x or y | |
107 rm_inds = which(is.na(x) | is.na(y)) | |
108 if (length(rm_inds) > 0) { | |
109 x = x[-rm_inds] | |
110 y = y[-rm_inds] | |
111 } | |
112 | |
113 # Determine values of trapezoids under curve Get inds | |
114 inds = 1:(length(x) - 1) | |
115 # Determine area using trapezoidal rule Area = ( (b1 + b2)/2 ) * h where b1 and b2 are lengths of bases | |
116 # (the parallel sides) and h is the height (the perpendicular distance between two bases) | |
117 areas = ((y[inds] + y[inds + 1])/2) * diff(x) | |
118 | |
119 # total area is sum of all trapezoid areas | |
120 tot_area = sum(areas) | |
121 | |
122 # Return total area | |
123 return(tot_area) | |
124 } | |
125 | |
126 | |
127 #' trap_index Function | |
128 #' | |
129 #' This function compute the area under the curve (Abundance Index) across species, sites and years | |
130 #' @param sp_data A data.frame containing species count data generated from the year_day_func() | |
131 #' @param y A vector, Default is NULL | |
132 #' @keywords Abundance index | |
133 #' @export | |
134 #' @examples | |
135 #' trap_index() | |
136 | |
137 | |
138 | |
139 trap_index = function(sp_data, data_col = "IMP", time_col = "DAYNO", by_col = c("SPECIES", "SITE", "YEAR")) { | |
140 | |
141 # Build output data.frame | |
142 out_obj = unique(sp_data[, by_col]) | |
143 # Set row.names to be equal to collapsing of output rows (will be unique, you need them to make uploading | |
144 # values back to data.frame will be easier) | |
145 row.names(out_obj) = apply(out_obj, 1, paste, collapse = "_") | |
146 | |
147 # Using this row.names from out_obj above as index in by function to loop through values all unique combs | |
148 # of by_cols and fit trap_area to data | |
149 ind_dat = by(sp_data[, c(time_col, data_col)], apply(sp_data[, by_col], 1, paste, collapse = "_"), trap_area) | |
150 | |
151 # Add this data to output object | |
152 out_obj[names(ind_dat), "SINDEX"] = round(ind_dat/7, 1) | |
153 | |
154 # Set row.names to defaults | |
155 row.names(out_obj) = NULL | |
156 | |
157 # Return output object | |
158 return(out_obj) | |
159 } | |
160 | |
161 | |
162 #' flight_curve Function | |
163 #' This function compute the flight curve across and years | |
164 #' @param your_dataset A data.frame containing original species count data. The data format is a csv (comma "," separated) with 6 columns with headers, namely "species","transect_id","visit_year","visit_month","visit_day","sp_count" | |
165 #' @keywords standardize flight curve | |
166 #' @export | |
167 #' @examples | |
168 #' flight_curve() | |
169 | |
170 | |
171 flight_curve <- function(your_dataset) { | |
172 | |
173 if("mgcv" %in% installed.packages() == "FALSE") { | |
174 print("mgcv package is not installed.") | |
175 x <- readline("Do you want to install it? Y/N") | |
176 if (x == 'Y') { | |
177 install.packages("mgcv") | |
178 } | |
179 if (x == 'N') { | |
180 stop("flight curve can not be computed without the mgcv package, sorry") | |
181 } | |
182 } | |
183 your_dataset$DAYNO <- strptime(paste(your_dataset$DAY, your_dataset$MONTH, | |
184 your_dataset$YEAR, sep = "/"), "%d/%m/%Y")$yday + 1 | |
185 dataset <- your_dataset[, c("SPECIES", "SITE", "YEAR", "MONTH", | |
186 "DAY", "DAYNO", "COUNT")] | |
187 sample_year <- unique(dataset$YEAR) | |
188 sample_year <- sample_year[order(sample_year)] | |
189 if (length(sample_year) >1 ) { | |
190 for (y in sample_year) { | |
191 dataset_y <- dataset[dataset$YEAR == y, ] | |
192 nsite <- length(unique(dataset_y$SITE)) | |
193 # Determine missing days and add to dataset | |
194 sp_data_all <- year_day_func(dataset_y) | |
195 if (nsite > 200) { | |
196 sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, | |
197 200, replace = F)]), ] | |
198 sp_data_all <- sp_data_all | |
199 } | |
200 sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 | |
201 print(paste("Fitting the GAM for",as.character(sp_data_all$SPECIES[1]),"and year",y,"with",length(unique(sp_data_all$SITE)),"sites :",Sys.time())) | |
202 if(length(unique(sp_data_all$SITE))>1){ | |
203 gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) -1, | |
204 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
205 } | |
206 else { | |
207 gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, | |
208 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
209 } | |
210 # Give a second try if the GAM does not converge the first time | |
211 if (class(gam_obj_site)[1] == "try-error") { | |
212 # Determine missing days and add to dataset | |
213 sp_data_all <- year_day_func(dataset_y) | |
214 if (nsite > 200) { | |
215 sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, | |
216 200, replace = F)]), ] | |
217 } | |
218 else { | |
219 sp_data_all <- sp_data_all | |
220 } | |
221 sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 | |
222 print(paste("Fitting the GAM for",sp_data_all$SPECIES[1],"at year", y,"with",length(unique(sp_data_all$SITE)),"sites :",Sys.time(),"second try")) | |
223 if(length(unique(sp_data_all$SITE))>1){ | |
224 gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) -1, | |
225 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
226 } | |
227 else { | |
228 gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, | |
229 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
230 } | |
231 if (class(gam_obj_site)[1] == "try-error") { | |
232 print(paste("Error in fitting the flight period for",sp_data_all$SPECIES[1],"at year", y,"no convergence after two trial")) | |
233 sp_data_all[, "FITTED"] <- NA | |
234 sp_data_all[, "COUNT_IMPUTED"] <- NA | |
235 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA | |
236 sp_data_all[, "NM"] <- NA | |
237 } | |
238 else { | |
239 # Generate a list of values for all days from the additive model and use | |
240 # these value to fill the missing observations | |
241 sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, | |
242 c("trimDAYNO", "SITE")], type = "response") | |
243 # force zeros at the beginning end end of the year | |
244 sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 | |
245 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 | |
246 # if infinite number in predict replace with NA. | |
247 if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ | |
248 print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) | |
249 sp_data_all[, "FITTED"] <- NA | |
250 sp_data_all[, "COUNT_IMPUTED"] <- NA | |
251 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA | |
252 sp_data_all[, "NM"] <- NA | |
253 } | |
254 else { | |
255 sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT | |
256 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] | |
257 # Define the flight curve from the fitted values and append them over | |
258 # years (this is one flight curve per year for all site) | |
259 site_sums <- aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), | |
260 FUN = sum) | |
261 # Rename sum column | |
262 names(site_sums)[names(site_sums) == "x"] <- "SITE_YR_FSUM" | |
263 # Add data to sp_data data.frame (ensure merge does not sort the data!) | |
264 sp_data_all = merge(sp_data_all, site_sums, by <- c("SITE"), | |
265 all = TRUE, sort = FALSE) | |
266 # Calculate normalized values | |
267 sp_data_all[, "NM"] <- sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM | |
268 } | |
269 } | |
270 } | |
271 else { | |
272 # Generate a list of values for all days from the additive model and use | |
273 # these value to fill the missing observations | |
274 sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, | |
275 c("trimDAYNO", "SITE")], type = "response") | |
276 # force zeros at the beginning end end of the year | |
277 sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 | |
278 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 | |
279 # if infinite number in predict replace with NA. | |
280 if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ | |
281 print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) | |
282 sp_data_all[, "FITTED"] <- NA | |
283 sp_data_all[, "COUNT_IMPUTED"] <- NA | |
284 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA | |
285 sp_data_all[, "NM"] <- NA | |
286 } | |
287 else { | |
288 sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT | |
289 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] | |
290 # Define the flight curve from the fitted values and append them over | |
291 # years (this is one flight curve per year for all site) | |
292 site_sums = aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), | |
293 FUN = sum) | |
294 # Rename sum column | |
295 names(site_sums)[names(site_sums) == "x"] = "SITE_YR_FSUM" | |
296 # Add data to sp_data data.frame (ensure merge does not sort the data!) | |
297 sp_data_all = merge(sp_data_all, site_sums, by = c("SITE"), all = TRUE, | |
298 sort = FALSE) | |
299 # Calculate normalized values | |
300 sp_data_all[, "NM"] = sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM | |
301 } | |
302 } | |
303 sp_data_filled <- sp_data_all | |
304 flight_curve <- data.frame(species = sp_data_filled$SPECIES, year = sp_data_filled$YEAR, | |
305 week = sp_data_filled$WEEK, DAYNO = sp_data_filled$DAYNO, DAYNO_adj = sp_data_filled$trimDAYNO, | |
306 nm = sp_data_filled$NM)[!duplicated(paste(sp_data_filled$YEAR, | |
307 sp_data_filled$DAYNO, sep = "_")), ] | |
308 flight_curve <- flight_curve[order(flight_curve$DAYNO), ] | |
309 # bind if exist else create | |
310 if (is.na(flight_curve$nm[1])) next() | |
311 if ("flight_pheno" %in% ls()) { | |
312 flight_pheno <- rbind(flight_pheno, flight_curve) | |
313 } | |
314 else { | |
315 flight_pheno <- flight_curve | |
316 } | |
317 } # end of year loop | |
318 } | |
319 else { | |
320 y <- unique(dataset$YEAR) | |
321 dataset_y <- dataset[dataset$YEAR == y, ] | |
322 nsite <- length(unique(dataset_y$SITE)) | |
323 # Determine missing days and add to dataset | |
324 sp_data_all <- year_day_func(dataset_y) | |
325 if (nsite > 200) { | |
326 sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, | |
327 200, replace = F)]), ] | |
328 } | |
329 else { | |
330 sp_data_all <- sp_data_all | |
331 } | |
332 sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 | |
333 print(paste("Fitting the GAM for",sp_data_all$SPECIES[1],"at year", y,":",Sys.time())) | |
334 if(length(unique(sp_data_all$SITE))>1){ | |
335 gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) -1, | |
336 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
337 } | |
338 else { | |
339 gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, | |
340 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
341 } | |
342 # Give a second try if the GAM does not converge the first time | |
343 if (class(gam_obj_site)[1] == "try-error") { | |
344 # Determine missing days and add to dataset | |
345 sp_data_all <- year_day_func(dataset_y) | |
346 if (nsite > 200) { | |
347 sp_data_all <- sp_data_all[as.character(sp_data_all$SITE) %in% as.character(unique(dataset_y$SITE)[sample(1:nsite, | |
348 200, replace = F)]), ] | |
349 } | |
350 else { | |
351 sp_data_all <- sp_data_all | |
352 } | |
353 sp_data_all$trimDAYNO <- sp_data_all$DAYNO - min(sp_data_all$DAYNO) + 1 | |
354 print(paste("Fitting the GAM for",sp_data_all$SPECIES[1],"at year", y,"with",length(unique(sp_data_all$SITE)),"sites :",Sys.time(),"second try")) | |
355 if(length(unique(sp_data_all$SITE))>1){ | |
356 gam_obj_site <- try(mgcv::bam(COUNT ~ s(trimDAYNO, bs = "cr") + as.factor(SITE) - 1, | |
357 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
358 } | |
359 else { | |
360 gam_obj_site <- try(mgcv::gam(COUNT ~ s(trimDAYNO, bs = "cr") -1, | |
361 data = sp_data_all, family = poisson(link = "log")), silent = TRUE) | |
362 } | |
363 if (class(gam_obj_site)[1] == "try-error") { | |
364 print(paste("Error in fitting the flight period for",sp_data_all$SPECIES[1],"at year", y,"no convergence after two trial")) | |
365 sp_data_all[, "FITTED"] <- NA | |
366 sp_data_all[, "COUNT_IMPUTED"] <- NA | |
367 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA | |
368 sp_data_all[, "NM"] <- NA | |
369 } | |
370 else { | |
371 # Generate a list of values for all days from the additive model and use | |
372 # these value to fill the missing observations | |
373 sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, | |
374 c("trimDAYNO", "SITE")], type = "response") | |
375 # force zeros at the beginning end end of the year | |
376 sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 | |
377 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 | |
378 # if infinite number in predict replace with NA. | |
379 if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ | |
380 print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) | |
381 sp_data_all[, "FITTED"] <- NA | |
382 sp_data_all[, "COUNT_IMPUTED"] <- NA | |
383 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA | |
384 sp_data_all[, "NM"] <- NA | |
385 } | |
386 else { | |
387 sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT | |
388 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] | |
389 # Define the flight curve from the fitted values and append them over | |
390 # years (this is one flight curve per year for all site) | |
391 site_sums <- aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), | |
392 FUN = sum) | |
393 # Rename sum column | |
394 names(site_sums)[names(site_sums) == "x"] <- "SITE_YR_FSUM" | |
395 # Add data to sp_data data.frame (ensure merge does not sort the data!) | |
396 sp_data_all = merge(sp_data_all, site_sums, by <- c("SITE"), | |
397 all = TRUE, sort = FALSE) | |
398 # Calculate normalized values | |
399 sp_data_all[, "NM"] <- sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM | |
400 } | |
401 } | |
402 } | |
403 else { | |
404 # Generate a list of values for all days from the additive model and use | |
405 # these value to fill the missing observations | |
406 sp_data_all[, "FITTED"] <- mgcv::predict.gam(gam_obj_site, newdata = sp_data_all[, | |
407 c("trimDAYNO", "SITE")], type = "response") | |
408 # force zeros at the beginning end end of the year | |
409 sp_data_all[sp_data_all$trimDAYNO < 60, "FITTED"] <- 0 | |
410 sp_data_all[sp_data_all$trimDAYNO > 305, "FITTED"] <- 0 | |
411 # if infinite number in predict replace with NA. | |
412 if(sum(is.infinite(sp_data_all[, "FITTED"]))>0){ | |
413 print(paste("Error in the flight period for",sp_data_all$SPECIES[1],"at year", y,"weird predicted values")) | |
414 sp_data_all[, "FITTED"] <- NA | |
415 sp_data_all[, "COUNT_IMPUTED"] <- NA | |
416 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- NA | |
417 sp_data_all[, "NM"] <- NA | |
418 } | |
419 else { | |
420 sp_data_all[, "COUNT_IMPUTED"] <- sp_data_all$COUNT | |
421 sp_data_all[is.na(sp_data_all$COUNT), "COUNT_IMPUTED"] <- sp_data_all$FITTED[is.na(sp_data_all$COUNT)] | |
422 # Define the flight curve from the fitted values and append them over | |
423 # years (this is one flight curve per year for all site) | |
424 site_sums = aggregate(sp_data_all$FITTED, by = list(SITE = sp_data_all$SITE), | |
425 FUN = sum) | |
426 # Rename sum column | |
427 names(site_sums)[names(site_sums) == "x"] = "SITE_YR_FSUM" | |
428 # Add data to sp_data data.frame (ensure merge does not sort the data!) | |
429 sp_data_all = merge(sp_data_all, site_sums, by = c("SITE"), all = TRUE, | |
430 sort = FALSE) | |
431 # Calculate normalized values | |
432 sp_data_all[, "NM"] = sp_data_all$FITTED/sp_data_all$SITE_YR_FSUM | |
433 } | |
434 } | |
435 sp_data_filled <- sp_data_all | |
436 flight_curve <- data.frame(species = sp_data_filled$SPECIES, year = sp_data_filled$YEAR, | |
437 week = sp_data_filled$WEEK, DAYNO = sp_data_filled$DAYNO, DAYNO_adj = sp_data_filled$trimDAYNO, | |
438 nm = sp_data_filled$NM)[!duplicated(paste(sp_data_filled$YEAR, | |
439 sp_data_filled$DAYNO, sep = "_")), ] | |
440 flight_curve <- flight_curve[order(flight_curve$DAYNO), ] | |
441 if (is.na(flight_curve$nm[1])){ | |
442 flight_pheno <- data.frame() | |
443 } | |
444 else { | |
445 # bind if exist else create | |
446 if ("flight_pheno" %in% ls()) { | |
447 flight_pheno <- rbind(flight_pheno, flight_curve) | |
448 } | |
449 else { | |
450 flight_pheno <- flight_curve | |
451 } | |
452 } | |
453 } | |
454 return(flight_pheno) | |
455 } | |
456 | |
457 | |
458 #' abundance_index Function | |
459 #' | |
460 #' This function compute the Abundance Index across sites and years from your dataset and the regional flight curve | |
461 #' @param your_dataset A data.frame containing original species count data. The data format is a csv (comma "," separated) with 6 columns with headers, namely "species","transect_id","visit_year","visit_month","visit_day","sp_count" | |
462 #' @param flight_pheno A data.frame for the regional flight curve computed with the function flight_curve | |
463 #' @keywords standardize flight curve | |
464 #' @export | |
465 #' @examples | |
466 #' abundance_index() | |
467 | |
468 abundance_index <- function(your_dataset,flight_pheno) { | |
469 | |
470 your_dataset$DAYNO <- strptime(paste(your_dataset$DAY, your_dataset$MONTH, | |
471 your_dataset$YEAR, sep = "/"), "%d/%m/%Y")$yday + 1 | |
472 | |
473 dataset <- your_dataset[, c("SPECIES", "SITE", "YEAR", "MONTH", | |
474 "DAY", "DAYNO", "COUNT")] | |
475 | |
476 sample_year <- unique(dataset$YEAR) | |
477 sample_year <- sample_year[order(sample_year)] | |
478 | |
479 | |
480 if (length(sample_year)>1){ | |
481 | |
482 for (y in sample_year) { | |
483 | |
484 year_pheno <- flight_pheno[flight_pheno$year == y, ] | |
485 | |
486 dataset_y <- dataset[dataset$YEAR == y, ] | |
487 | |
488 sp_data_site <- year_day_func(dataset_y) | |
489 sp_data_site$trimDAYNO <- sp_data_site$DAYNO - min(sp_data_site$DAYNO) + 1 | |
490 | |
491 sp_data_site <- merge(sp_data_site, year_pheno[, c("DAYNO", "nm")], | |
492 by = c("DAYNO"), all.x = TRUE, sort = FALSE) | |
493 | |
494 # compute proportion of the flight curve sampled due to missing visits | |
495 pro_missing_count <- data.frame(SITE = sp_data_site$SITE, WEEK = sp_data_site$WEEK, | |
496 NM = sp_data_site$nm, COUNT = sp_data_site$COUNT, ANCHOR = sp_data_site$ANCHOR) | |
497 pro_missing_count$site_week <- paste(pro_missing_count$SITE, pro_missing_count$WEEK, | |
498 sep = "_") | |
499 siteweeknocount <- aggregate(pro_missing_count$COUNT, by = list(pro_missing_count$site_week), | |
500 function(x) sum(!is.na(x)) == 0) | |
501 pro_missing_count <- pro_missing_count[pro_missing_count$site_week %in% | |
502 siteweeknocount$Group.1[siteweeknocount$x == TRUE], ] | |
503 pro_count_agg <- aggregate(pro_missing_count$NM, by = list(pro_missing_count$SITE), | |
504 function(x) 1 - sum(x, na.rm = T)) | |
505 names(pro_count_agg) <- c("SITE", "PROP_PHENO_SAMPLED") | |
506 | |
507 # remove samples outside the monitoring window | |
508 sp_data_site$COUNT[sp_data_site$nm==0] <- NA | |
509 | |
510 # Compute the regional GAM index | |
511 | |
512 if(length(unique(sp_data_site$SITE))>1){ | |
513 glm_obj_site <- glm(COUNT ~ factor(SITE) + offset(log(nm)) - 1, data = sp_data_site, | |
514 family = quasipoisson(link = "log"), control = list(maxit = 100)) | |
515 } else { | |
516 glm_obj_site <- glm(COUNT ~ offset(log(nm)) - 1, data = sp_data_site, | |
517 family = quasipoisson(link = "log"), control = list(maxit = 100)) | |
518 } | |
519 | |
520 sp_data_site[, "FITTED"] <- predict.glm(glm_obj_site, newdata = sp_data_site, | |
521 type = "response") | |
522 sp_data_site[, "COUNT_IMPUTED"] <- sp_data_site$COUNT | |
523 sp_data_site[is.na(sp_data_site$COUNT), "COUNT_IMPUTED"] <- sp_data_site$FITTED[is.na(sp_data_site$COUNT)] | |
524 | |
525 ## add fitted value for missing mid-week data | |
526 sp_data_site <- sp_data_site[!paste(sp_data_site$DAY_WEEK, sp_data_site$COUNT) %in% | |
527 c("1 NA", "2 NA", "3 NA", "5 NA", "6 NA", "7 NA"), ] | |
528 | |
529 ## remove all added mid-week values for weeks with real count | |
530 ## (observation) | |
531 sp_data_site$site_week <- paste(sp_data_site$SITE, sp_data_site$WEEK, | |
532 sep = "_") | |
533 siteweekcount <- aggregate(sp_data_site$COUNT, by = list(sp_data_site$site_week), | |
534 function(x) sum(!is.na(x)) > 0) | |
535 sp_data_site <- sp_data_site[!(is.na(sp_data_site$COUNT) & (sp_data_site$site_week %in% | |
536 siteweekcount$Group.1[siteweekcount$x == TRUE])), names(sp_data_site) != | |
537 "site_week"] | |
538 | |
539 ## Compute the regional GAM index | |
540 print(paste("Compute index for",sp_data_site$SPECIES[1],"at year", y,"for",length(unique(sp_data_site$SITE)),"sites:",Sys.time())) | |
541 regional_gam_index <- trap_index(sp_data_site, data_col = "COUNT_IMPUTED", | |
542 time_col = "DAYNO", by_col = c("SPECIES", "SITE", "YEAR")) | |
543 | |
544 cumu_index <- merge(regional_gam_index, pro_count_agg, by = c("SITE"), | |
545 all.x = TRUE, sort = FALSE) | |
546 names(cumu_index) <- c("SITE", "SPECIES", "YEAR", "regional_gam", "prop_pheno_sampled") | |
547 | |
548 cumu_index <- cumu_index[order(cumu_index$SITE), ] | |
549 | |
550 # bind if exist else create | |
551 if ("cumullated_indices" %in% ls()) { | |
552 cumullated_indices <- rbind(cumullated_indices, cumu_index) | |
553 } else { | |
554 cumullated_indices <- cumu_index | |
555 } | |
556 | |
557 } # end of year loop | |
558 | |
559 } else { | |
560 | |
561 y <- unique(dataset$YEAR) | |
562 year_pheno <- flight_pheno[flight_pheno$year == y, ] | |
563 | |
564 dataset_y <- dataset[dataset$YEAR == y, ] | |
565 | |
566 sp_data_site <- year_day_func(dataset_y) | |
567 sp_data_site$trimDAYNO <- sp_data_site$DAYNO - min(sp_data_site$DAYNO) + 1 | |
568 | |
569 sp_data_site <- merge(sp_data_site, year_pheno[, c("DAYNO", "nm")], | |
570 by = c("DAYNO"), all.x = TRUE, sort = FALSE) | |
571 | |
572 # compute proportion of the flight curve sampled due to missing visits | |
573 pro_missing_count <- data.frame(SITE = sp_data_site$SITE, WEEK = sp_data_site$WEEK, | |
574 NM = sp_data_site$nm, COUNT = sp_data_site$COUNT, ANCHOR = sp_data_site$ANCHOR) | |
575 pro_missing_count$site_week <- paste(pro_missing_count$SITE, pro_missing_count$WEEK, | |
576 sep = "_") | |
577 siteweeknocount <- aggregate(pro_missing_count$COUNT, by = list(pro_missing_count$site_week), | |
578 function(x) sum(!is.na(x)) == 0) | |
579 pro_missing_count <- pro_missing_count[pro_missing_count$site_week %in% | |
580 siteweeknocount$Group.1[siteweeknocount$x == TRUE], ] | |
581 pro_count_agg <- aggregate(pro_missing_count$NM, by = list(pro_missing_count$SITE), | |
582 function(x) 1 - sum(x, na.rm = T)) | |
583 names(pro_count_agg) <- c("SITE", "PROP_PHENO_SAMPLED") | |
584 | |
585 # remove samples outside the monitoring window | |
586 sp_data_site$COUNT[sp_data_site$nm==0] <- NA | |
587 | |
588 # Compute the regional GAM index | |
589 if(length(unique(sp_data_site$SITE))>1){ | |
590 glm_obj_site <- glm(COUNT ~ factor(SITE) + offset(log(nm)) - 1, data = sp_data_site, | |
591 family = quasipoisson(link = "log"), control = list(maxit = 100)) | |
592 } else { | |
593 glm_obj_site <- glm(COUNT ~ offset(log(nm)) - 1, data = sp_data_site, | |
594 family = quasipoisson(link = "log"), control = list(maxit = 100)) | |
595 } | |
596 | |
597 sp_data_site[, "FITTED"] <- predict.glm(glm_obj_site, newdata = sp_data_site, | |
598 type = "response") | |
599 sp_data_site[, "COUNT_IMPUTED"] <- sp_data_site$COUNT | |
600 sp_data_site[is.na(sp_data_site$COUNT), "COUNT_IMPUTED"] <- sp_data_site$FITTED[is.na(sp_data_site$COUNT)] | |
601 | |
602 # add fitted value for missing mid-week data | |
603 sp_data_site <- sp_data_site[!paste(sp_data_site$DAY_WEEK, sp_data_site$COUNT) %in% | |
604 c("1 NA", "2 NA", "3 NA", "5 NA", "6 NA", "7 NA"), ] | |
605 | |
606 # remove all added mid-week values for weeks with real count | |
607 # (observation) | |
608 sp_data_site$site_week <- paste(sp_data_site$SITE, sp_data_site$WEEK, | |
609 sep = "_") | |
610 siteweekcount <- aggregate(sp_data_site$COUNT, by = list(sp_data_site$site_week), | |
611 function(x) sum(!is.na(x)) > 0) | |
612 sp_data_site <- sp_data_site[!(is.na(sp_data_site$COUNT) & (sp_data_site$site_week %in% | |
613 siteweekcount$Group.1[siteweekcount$x == TRUE])), names(sp_data_site) != | |
614 "site_week"] | |
615 | |
616 # Compute the regional gam index | |
617 print(paste("Compute index for",sp_data_site$SPECIES[1],"at year", y,"for",length(unique(sp_data_site$SITE)),"sites:",Sys.time())) | |
618 regional_gam_index <- trap_index(sp_data_site, data_col = "COUNT_IMPUTED", | |
619 time_col = "DAYNO", by_col = c("SPECIES", "SITE", "YEAR")) | |
620 | |
621 cumu_index <- merge(regional_gam_index, pro_count_agg, by = c("SITE"), | |
622 all.x = TRUE, sort = FALSE) | |
623 names(cumu_index) <- c("SITE", "SPECIES", "YEAR", "regional_gam", "prop_pheno_sampled") | |
624 | |
625 cumu_index <- cumu_index[order(cumu_index$SITE), ] | |
626 | |
627 # bind if exist else create | |
628 if ("cumullated_indices" %in% ls()) { | |
629 cumullated_indices <- rbind(cumullated_indices, cumu_index) | |
630 } else { | |
631 cumullated_indices <- cumu_index | |
632 } | |
633 | |
634 } | |
635 | |
636 return(cumullated_indices) | |
637 | |
638 } |