Mercurial > repos > ecology > vigiechiro_idcorrect_2ndlayer
comparison IdCorrect_2ndLayer.R @ 0:6681b6ba1d7e draft
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tools/vigiechiro commit d2de8e10c11bfa3b04729e59bba58e08d23b56aa
author | ecology |
---|---|
date | Wed, 13 Mar 2019 11:18:36 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6681b6ba1d7e |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 suppressMessages(library(data.table)) | |
4 suppressMessages(library(randomForest)) | |
# Command-line arguments, as used below:
#   args[1] : table of identifications to be corrected
#   args[2] : .RData file expected to provide the ClassifEspC2b classifier
#   args[3] : file name from which the participation label is extracted
args <- commandArgs(trailingOnly = TRUE)

set.seed(1) # fixed seed, to test reproducibility

filename <- args[3]
# Only load the classifier when it is not already in the workspace
if (!exists("ClassifEspC2b")) {
  load(args[2])
}

DataPar <- fread(args[1], na.strings = "") # ids to be corrected
# The participation label is the 24-character slice that ends 17 characters
# before the end of the file name
# NOTE(review): presumably the participation identifier - confirm
DataPar$participation <- substr(
  filename, nchar(filename) - 40, nchar(filename) - 17
)

# Keep one row per (file, taxon) pair, but never drop "empty" rows
is_repeated <- duplicated(
  cbind(DataPar$`nom du fichier`, DataPar$tadarida_taxon)
)
is_empty_taxon <- (DataPar$tadarida_taxon == "empty")
DataPar <- subset(DataPar, (!is_repeated) | (is_empty_taxon))

# Missing probabilities count as 0. Because fread() is called with
# na.strings = "", blank fields arrive as NA rather than "", so both forms
# have to be replaced; otherwise quantile() fails later on the NA values.
probabilities <- DataPar$tadarida_probabilite
probabilities[is.na(probabilities) | probabilities == ""] <- 0
DataPar$tadarida_probabilite <- as.numeric(probabilities)

# table counting number of contacts per species
nbcT <- as.matrix(table(DataPar$participation, DataPar$tadarida_taxon))
25 | |
# Generating input variables for the second-layer classification.
# For every taxon found by the first layer, each of its rows receives:
#   - the participation contact counts of all taxa divided by the number of
#     contacts of that taxon (columns suffixed "_prop");
#   - the 25/50/75/90/95/98 % quantiles and the maximum of the taxon's
#     probabilities (columns Q25 ... Q100).
# Rows are emitted taxon by taxon, in the order of the factor levels.

taxa <- levels(as.factor(DataPar$tadarida_taxon))
if (length(taxa) == 0) {
  stop("no tadarida_taxon value found in the input table", call. = FALSE)
}

# Plain matrix of contact counts (drops the "table" class, keeps dimnames)
contact_counts <- unclass(nbcT)
quantile_probs <- c(0.25, 0.50, 0.75, 0.90, 0.95, 0.98)

# Per-taxon pieces are collected in lists and bound once after the loop
taxon_rows <- vector("list", length(taxa))
taxon_props <- vector("list", length(taxa))
taxon_stats <- vector("list", length(taxa))
for (j in seq_along(taxa)) {
  Datasub2 <- subset(DataPar, DataPar$tadarida_taxon == taxa[j])
  n_contacts <- nrow(Datasub2)
  probs <- Datasub2$tadarida_probabilite

  # One identical row of summary statistics per contact of this taxon
  summary_row <- c(quantile(probs, quantile_probs, names = FALSE), max(probs))
  taxon_stats[[j]] <- matrix(
    summary_row,
    nrow = n_contacts, ncol = length(summary_row), byrow = TRUE
  )

  # drop = FALSE keeps a matrix even with a single taxon or a single contact
  taxon_props[[j]] <- (contact_counts / n_contacts)[
    rep(seq_len(nrow(contact_counts)), n_contacts), ,
    drop = FALSE
  ]
  taxon_rows[[j]] <- Datasub2
}

VoteO <- rbindlist(taxon_rows)
PropSp <- do.call(rbind, taxon_props)
stats_all <- do.call(rbind, taxon_stats)
Q25 <- stats_all[, 1]
Q50 <- stats_all[, 2]
Q75 <- stats_all[, 3]
Q90 <- stats_all[, 4]
Q95 <- stats_all[, 5]
Q98 <- stats_all[, 6]
Q100 <- stats_all[, 7]

# Tag the abundance-proportion variables with "_prop" before binding, so the
# renaming does not depend on how many columns the input table has
colnames(PropSp) <- paste0(colnames(PropSp), "_prop")

VoteC2 <- cbind(VoteO, PropSp, Q25, Q50, Q75, Q90, Q95, Q98, Q100)
65 | |
# Add missing species: every "_prop" variable listed in the classifier's
# importance table but absent from VoteC2 is appended as a column of zeros
model_vars <- row.names(ClassifEspC2b$importance)
EspForm <- model_vars[endsWith(model_vars, "_prop")]
EspM <- EspForm[!(EspForm %in% colnames(VoteC2))]
Zeros <- matrix(0, nrow = nrow(VoteC2), ncol = length(EspM))
colnames(Zeros) <- EspM
VoteC2 <- cbind(VoteC2, Zeros)
78 | |
ListDV <- levels(as.factor(DataPar$`nom du fichier`))

# Compute the maximum probability per species and per file
# (useful to correct errors caused by several taxa coexisting in the same
# file, e.g. Pipistrelle social calls identified as something else such as
# Noctule, Oreillard...)
MaxI <- tapply(
  DataPar$tadarida_probabilite,
  INDEX = list(c(DataPar$`nom du fichier`), c(DataPar$tadarida_taxon)),
  FUN = max
)

# The file names become the first column; it has no name in cbind(), so
# as.data.frame() labels it "V1"
MaxI2 <- as.data.frame(cbind(row.names(MaxI), MaxI))

# cbind() with the file names produced text: turn the taxon columns back
# into numbers
taxon_cols <- 2:ncol(MaxI2)
MaxI2[taxon_cols] <- lapply(
  MaxI2[taxon_cols],
  function(values) as.numeric(as.character(values))
)
# A taxon never detected in a file gets a maximum of 0
MaxI2[is.na(MaxI2)] <- 0

# Edit the column titles to identify the "max index" variables
names(MaxI2)[taxon_cols] <- paste0(names(MaxI2)[taxon_cols], "_maxI")

# Add missing species: "_maxI" variables listed in the classifier's
# importance table but absent from MaxI2 are appended as columns of zeros
model_vars <- row.names(ClassifEspC2b$importance)
EspForm <- model_vars[endsWith(model_vars, "_maxI")]
EspM <- EspForm[!(EspForm %in% colnames(MaxI2))]
Zeros <- matrix(0, nrow = nrow(MaxI2), ncol = length(EspM))
colnames(Zeros) <- EspM
MaxI2 <- cbind(MaxI2, Zeros)
117 | |
118 | |
119 | |
120 | |
# Confidence index at the observation scale (an observation is a group of
# calls identified by the first layer as coming from a single species).
# For each taxon, a three-column table (file, taxon, probability) is built
# with the probability column named "<taxon>_ValI"; the tables are then
# outer-merged so that every observation ends up with one "_ValI" column per
# taxon.
# Data frames are used from the start: with matrices, an input holding a
# single taxon was never converted by merge(), so the scores stayed text and
# the column ended up named "_ValI".
taxa <- levels(as.factor(DataPar$tadarida_taxon))
per_taxon <- lapply(taxa, function(taxon_name) {
  rows <- which(DataPar$tadarida_taxon == taxon_name)
  obs <- data.frame(
    DataPar[["nom du fichier"]][rows],
    DataPar$tadarida_taxon[rows],
    DataPar$tadarida_probabilite[rows],
    stringsAsFactors = FALSE
  )
  names(obs) <- c(
    "nom du fichier", "tadarida_taxon", paste0(taxon_name, "_ValI")
  )
  obs
})

# The tables only share the file and taxon columns, which merge() uses as keys
IdS3 <- Reduce(
  function(merged, nxt) merge(merged, nxt, all = TRUE),
  per_taxon
)

# An observation scores 0 for every taxon other than its own
IdS3[is.na(IdS3)] <- 0

# Add missing species: "_ValI" variables listed in the classifier's
# importance table but absent from IdS3 are appended as columns of zeros
model_vars <- row.names(ClassifEspC2b$importance)
EspForm <- model_vars[endsWith(model_vars, "_ValI")]
EspM <- EspForm[!(EspForm %in% colnames(IdS3))]
Zeros <- matrix(0, nrow = nrow(IdS3), ncol = length(EspM))
colnames(Zeros) <- EspM
IdS3 <- cbind(IdS3, Zeros)
158 | |
# Merge the species proportions and quantiles with the per-file maxima and
# the per-observation scores
VoteC3 <- merge(VoteC2, MaxI2, by.x = "nom du fichier", by.y = "V1")
VoteC4 <- merge(VoteC3, IdS3, by = c("nom du fichier", "tadarida_taxon"))

# Numeric versions of the timing and frequency columns, plus the sequence
# duration (end time minus start time)
VoteC4$temps_fin <- as.numeric(as.character(VoteC4$temps_fin))
VoteC4$temps_debut <- as.numeric(as.character(VoteC4$temps_debut))
VoteC4$frequence <- as.numeric(as.character(VoteC4$frequence_mediane))
VoteC4$durseq <- VoteC4$temps_fin - VoteC4$temps_debut

# Second-layer predictions: per-class probabilities and predicted class
ProbEsp_C2b <- predict(
  ClassifEspC2b, VoteC4,
  type = "prob", norm.votes = TRUE
)
ProbEsp_C2bs <- predict(
  ClassifEspC2b, VoteC4,
  type = "response", norm.votes = TRUE
)

# Keep the columns up to and including "participation", then append the
# predictions
colnum <- match("participation", colnames(VoteC4))
DataCorrC2 <- cbind(VoteC4[, 1:colnum], ProbEsp_C2b, ProbEsp_C2bs)

# Two successive sorts: decreasing first-layer probability, then file name
DataCorrC2 <- DataCorrC2[
  order(DataCorrC2$tadarida_probabilite, decreasing = TRUE),
]
DataCorrC2 <- DataCorrC2[order(DataCorrC2$`nom du fichier`), ]

# Observations without a predicted class are labelled "empty"
DataCorrC2$ProbEsp_C2bs <- as.character(DataCorrC2$ProbEsp_C2bs)
DataCorrC2$ProbEsp_C2bs[is.na(DataCorrC2$ProbEsp_C2bs)] <- "empty"

fout_name <- "output.tabular"

write.table(
  DataCorrC2,
  file = fout_name,
  row.names = FALSE, sep = "\t", quote = FALSE, na = "NA"
)