Mercurial > repos > mnhn65mo > vigiechiro
comparison IdValid.R @ 0:0e3db3a308c0 draft default tip
Uploaded
author | mnhn65mo |
---|---|
date | Mon, 06 Aug 2018 09:13:29 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0e3db3a308c0 |
---|---|
1 library(data.table) | |
2 | |
3 ValidHier=function(x,y) #used to write validator id over observer id | |
4 { | |
5 if(y==""){x}else{y} | |
6 } | |
7 | |
8 f2p <- function(x) #get date-time data from recording file names | |
9 { | |
10 if (is.data.frame((x)[1])) {pretemps <- vector(length = nrow(x))} | |
11 op <- options(digits.secs = 3) | |
12 pretemps <- paste(substr(x, nchar(x) - 18, nchar(x)-4), ".", substr(x, nchar(x) - 2, nchar(x)), sep = "") | |
13 strptime(pretemps, "%Y%m%d_%H%M%OS",tz="UTC")-7200 | |
14 } | |
15 | |
16 args <- commandArgs(trailingOnly = TRUE) | |
17 | |
18 | |
19 #print(args) | |
20 | |
21 | |
22 #for test | |
23 #inputest=list.files("C:/Users/Yves Bas/Documents/GitHub/65MO_Galaxy-E/raw_scripts/Vigie-Chiro/output_IdCorrect_2ndLayer_input_IdValid/",full.names=T) | |
24 #for (i in 1:length(inputest)) | |
25 #{ | |
26 #args=c(inputest[i],"Referentiel_seuils_C2.csv") | |
27 #args=c("5857d56d9ebce1000ed89ea7-DataCorrC2.csv","Referentiel_seuils_C2.csv") | |
28 | |
29 | |
30 | |
31 IdCorrect=fread(args[1]) | |
32 RefSeuil=fread(args[2]) | |
33 #IdV=as.data.frame(subset(IdCorrect,select=observateur_taxon:validateur_probabilite)) | |
34 | |
35 #Step 0 :compute id score from 2nd Layer | |
36 test=match("participation",names(IdCorrect)) | |
37 IdCorrect$IdScore=apply(as.data.frame(IdCorrect)[,(test+1):(ncol(IdCorrect)-1)],MARGIN=1,FUN=max) | |
38 #compute true success probabilities according to logistic regression issued from "Referentiel_seuils" | |
39 CorrSp=match(IdCorrect$ProbEsp_C2bs,RefSeuil$Espece) | |
40 PSp=RefSeuil$Pente[CorrSp] | |
41 ISp=RefSeuil$Int[CorrSp] | |
42 | |
43 suppressWarnings(IdCorrect$IdProb<-mapply(FUN=function(w,x,y) if((!is.na(y))&(y>0)&(y<1000)) {(exp(y*w+x)/(1+exp(y*w+x)))}else{w} ,IdCorrect$IdScore,ISp,PSp)) | |
44 | |
45 | |
46 | |
47 | |
48 #Step 1 :compute id with confidence regarding a hierarchy (validator > observer) | |
49 IdCorrect$IdV=mapply(ValidHier,IdCorrect$observateur_taxon,IdCorrect$validateur_taxon) | |
50 IdCorrect$ConfV=mapply(ValidHier,IdCorrect$observateur_probabilite | |
51 ,IdCorrect$validateur_probabilite) | |
52 | |
53 | |
54 #print(paste(args[1],length(subset(IdCorrect$ConfV,IdCorrect$ConfV!="")))) | |
55 | |
56 #Step 2: Get numerictime data | |
57 suppressWarnings(IdCorrect$Session<-NULL) | |
58 suppressWarnings(IdCorrect$TimeNum<-NULL) | |
59 | |
60 if (substr(IdCorrect$`nom du fichier`[1],2,2)=="i") #for car/walk transects | |
61 { | |
62 FileInfo=as.data.table(tstrsplit(IdCorrect$`nom du fichier`,"-")) | |
63 IdCorrect$Session=as.numeric(substr(FileInfo$V4,5,nchar(FileInfo$V4))) | |
64 TimeSec=as.data.table(tstrsplit(FileInfo$V5,"_")) | |
65 TimeSec=as.data.frame(TimeSec) | |
66 if(sum(TimeSec[,(ncol(TimeSec)-1)]!="00000")==0) #to deal with double Kaleidoscope treatments | |
67 { | |
68 print("NOMS DE FICHIERS NON CONFORMES") | |
69 print("Vous les avez probablement traiter 2 fois par Kaleidoscope") | |
70 stop("Merci de nous signaler cette erreur par mail pour correction") | |
71 }else{ | |
72 IdCorrect$TimeNum=(IdCorrect$Session*800 | |
73 +as.numeric(TimeSec[,(ncol(TimeSec)-1)]) | |
74 +as.numeric(TimeSec[,(ncol(TimeSec))])/1000) | |
75 } | |
76 | |
77 }else{ | |
78 if(substr(IdCorrect$`nom du fichier`[1],2,2)=="a") #for stationary recordings | |
79 { | |
80 DateRec=as.POSIXlt(f2p(IdCorrect$`nom du fichier`)) | |
81 Nuit=format(as.Date(DateRec-43200*(DateRec$hour<12)),format="%d/%m/%Y") | |
82 #Nuit[is.na(Nuit)]=0 | |
83 IdCorrect$Session=Nuit | |
84 IdCorrect$TimeNum=as.numeric(DateRec) | |
85 | |
86 }else{ | |
87 print("NOMS DE FICHIERS NON CONFORMES") | |
88 stop("Ils doivent commencer par Cir (routier/pedestre) ou par Car (points fixes") | |
89 } | |
90 } | |
91 | |
92 | |
93 | |
94 | |
95 #Step 3 :treat sequentially each species identified by Tadarida-C | |
96 IdExtrap=vector() #to store the id extrapolated from validations | |
97 IdC2=IdCorrect[0,] #to store data in the right order | |
98 TypeE=vector() #to store the type of extrapolation made | |
99 for (j in 1:nlevels(as.factor(IdCorrect$ProbEsp_C2bs))) | |
100 { | |
101 IdSp=subset(IdCorrect | |
102 ,IdCorrect$ProbEsp_C2bs==levels(as.factor(IdCorrect$ProbEsp_C2bs))[j]) | |
103 if(sum(IdSp$IdV=="")==(nrow(IdSp))) #case 1 : no validation no change | |
104 { | |
105 IdC2=rbind(IdC2,IdSp) | |
106 IdExtrap=c(IdExtrap,rep(IdSp$ProbEsp_C2bs[1],nrow(IdSp))) | |
107 TypeE=c(TypeE,rep(0,nrow(IdSp))) | |
108 }else{ #case 2: some validation | |
109 Vtemp=subset(IdSp,IdSp$IdV!="") | |
110 #case2A: validations are homogeneous | |
111 if(nlevels(as.factor(Vtemp$IdV))==1) | |
112 { | |
113 IdC2=rbind(IdC2,IdSp) | |
114 IdExtrap=c(IdExtrap,rep(Vtemp$IdV[1],nrow(IdSp))) | |
115 TypeE=c(TypeE,rep(2,nrow(IdSp))) | |
116 }else{ | |
117 #case 2B: validations are heterogeneous | |
118 #case 2B1: some validations confirms the species identified by Tadarida and highest confidence are confirmed | |
119 subVT=subset(Vtemp,Vtemp$IdV==levels(as.factor(IdCorrect$ProbEsp_C2bs))[j]) | |
120 subVF=subset(Vtemp,Vtemp$IdV!=levels(as.factor(IdCorrect$ProbEsp_C2bs))[j]) | |
121 if((nrow(subVT)>0)&(max(subVT$IdProb)>max(subVF$IdProb))) | |
122 { | |
123 Vtemp=Vtemp[order(Vtemp$IdProb),] | |
124 test=(Vtemp$IdV!=Vtemp$ProbEsp_C2bs) | |
125 Fr1=max(which(test == TRUE)) #find the error with highest indices | |
126 Thr1=mean(Vtemp$IdProb[(Fr1):(Fr1+1)]) #define first threshold as the median confidence between the first error and the confirmed ID right over it | |
127 #id over this threshold are considered right | |
128 IdHC=subset(IdSp,IdSp$IdProb>Thr1) | |
129 IdC2=rbind(IdC2,IdHC) | |
130 IdExtrap=c(IdExtrap,rep(Vtemp$IdV[nrow(Vtemp)],nrow(IdHC))) | |
131 TypeE=c(TypeE,rep(2,nrow(IdHC))) | |
132 #id under this threshold are attributed to validated id closest in time | |
133 Vtemp=Vtemp[order(Vtemp$TimeNum),] | |
134 cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf) | |
135 CorrV=findInterval(IdSp$TimeNum, cuts) | |
136 IdE=Vtemp$IdV[CorrV] | |
137 IdEL=subset(IdE,IdSp$IdProb<=Thr1) | |
138 IdLC=subset(IdSp,IdSp$IdProb<=Thr1) | |
139 IdExtrap=c(IdExtrap,IdEL) | |
140 TypeE=c(TypeE,rep(1,length(IdEL))) | |
141 IdC2=rbind(IdC2,IdLC) | |
142 | |
143 | |
144 }else{ | |
145 #case 2B2: all validations concerns errors | |
146 #id are extrapolated on time only | |
147 Vtemp=Vtemp[order(Vtemp$TimeNum),] | |
148 cuts <- c(-Inf, Vtemp$TimeNum[-1]-diff(Vtemp$TimeNum)/2, Inf) | |
149 CorrV=findInterval(IdSp$TimeNum, cuts) | |
150 IdE=Vtemp$IdV[CorrV] | |
151 IdExtrap=c(IdExtrap,IdE) | |
152 TypeE=c(TypeE,rep(1,length(IdE))) | |
153 IdC2=rbind(IdC2,IdSp) | |
154 } | |
155 } | |
156 | |
157 | |
158 } | |
159 | |
160 | |
161 } | |
162 test1=(nrow(IdC2)==length(IdExtrap)) | |
163 test2=(nrow(IdC2)==nrow(IdCorrect)) | |
164 if((test1==F)|(test2==F)) | |
165 { | |
166 (stop("Erreur de traitement !!!")) | |
167 } | |
168 | |
169 IdC2$IdExtrap=IdExtrap | |
170 IdC2$TypeE=TypeE | |
171 | |
172 | |
173 IdC2=IdC2[order(IdC2$IdProb,decreasing=T),] | |
174 IdC2=IdC2[order(IdC2$ConfV,decreasing=T),] | |
175 IdC2=IdC2[order(IdC2$`nom du fichier`),] | |
176 #discard duplicated species within the same files (= false positives corrected by 2nd layer) | |
177 IdC2=unique(IdC2,by=c("nom du fichier","IdExtrap")) | |
178 | |
179 write.table(IdC2,"output.tabular",row.names=F,sep="\t") | |
180 #write.table(IdC2,paste0(substr(args[1],1,nchar(args[1])-15),"-IdC2.csv"),row.names=F,sep="\t") |