comparison xarray_tool.py @ 3:663268794710 draft

"planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit 57b6d23e3734d883e71081c78e77964d61be82ba"
author ecology
date Sun, 06 Jun 2021 08:49:43 +0000
parents e8650cdf092f
children 9bbaab36a5d4
comparison
equal deleted inserted replaced
2:e8650cdf092f 3:663268794710
2 # - getting metadata information 2 # - getting metadata information
3 # - select data and save results in csv file for further post-processing 3 # - select data and save results in csv file for further post-processing
4 4
5 import argparse 5 import argparse
6 import csv 6 import csv
7 import os
7 import warnings 8 import warnings
8 9
9 import geopandas as gdp 10 import geopandas as gdp
10 11
11 import pandas as pd 12 import pandas as pd
19 class XarrayTool (): 20 class XarrayTool ():
20 def __init__(self, infile, outfile_info="", outfile_summary="", 21 def __init__(self, infile, outfile_info="", outfile_summary="",
21 select="", outfile="", outputdir="", latname="", 22 select="", outfile="", outputdir="", latname="",
22 latvalN="", latvalS="", lonname="", lonvalE="", 23 latvalN="", latvalS="", lonname="", lonvalE="",
23 lonvalW="", filter_list="", coords="", time="", 24 lonvalW="", filter_list="", coords="", time="",
24 verbose=False 25 verbose=False, no_missing=False, coords_info=None,
25 ): 26 tolerance=None):
26 self.infile = infile 27 self.infile = infile
27 self.outfile_info = outfile_info 28 self.outfile_info = outfile_info
28 self.outfile_summary = outfile_summary 29 self.outfile_summary = outfile_summary
29 self.select = select 30 self.select = select
30 self.outfile = outfile 31 self.outfile = outfile
31 self.outputdir = outputdir 32 self.outputdir = outputdir
32 self.latname = latname 33 self.latname = latname
34 if tolerance != "" and tolerance is not None:
35 self.tolerance = float(tolerance)
36 else:
37 self.tolerance = -1
33 if latvalN != "" and latvalN is not None: 38 if latvalN != "" and latvalN is not None:
34 self.latvalN = float(latvalN) 39 self.latvalN = float(latvalN)
35 else: 40 else:
36 self.latvalN = "" 41 self.latvalN = ""
37 if latvalS != "" and latvalS is not None: 42 if latvalS != "" and latvalS is not None:
49 self.lonvalW = "" 54 self.lonvalW = ""
50 self.filter = filter_list 55 self.filter = filter_list
51 self.time = time 56 self.time = time
52 self.coords = coords 57 self.coords = coords
53 self.verbose = verbose 58 self.verbose = verbose
59 self.no_missing = no_missing
54 # initialization 60 # initialization
55 self.dset = None 61 self.dset = None
56 self.gset = None 62 self.gset = None
63 self.coords_info = coords_info
57 if self.verbose: 64 if self.verbose:
58 print("infile: ", self.infile) 65 print("infile: ", self.infile)
59 print("outfile_info: ", self.outfile_info) 66 print("outfile_info: ", self.outfile_info)
60 print("outfile_summary: ", self.outfile_summary) 67 print("outfile_summary: ", self.outfile_summary)
61 print("outfile: ", self.outfile) 68 print("outfile: ", self.outfile)
69 print("lonvalE: ", self.lonvalE) 76 print("lonvalE: ", self.lonvalE)
70 print("lonvalW: ", self.lonvalW) 77 print("lonvalW: ", self.lonvalW)
71 print("filter: ", self.filter) 78 print("filter: ", self.filter)
72 print("time: ", self.time) 79 print("time: ", self.time)
73 print("coords: ", self.coords) 80 print("coords: ", self.coords)
81 print("coords_info: ", self.coords_info)
74 82
75 def info(self): 83 def info(self):
76 f = open(self.outfile_info, 'w') 84 f = open(self.outfile_info, 'w')
77 ds = xr.open_dataset(self.infile) 85 ds = xr.open_dataset(self.infile)
78 ds.info(f) 86 ds.info(f)
111 if (op == 'bi'): 119 if (op == 'bi'):
112 rl = float(split_filter[3]) 120 rl = float(split_filter[3])
113 if filter_varname == self.select: 121 if filter_varname == self.select:
114 # filter on values of the selected variable 122 # filter on values of the selected variable
115 if op == 'bi': 123 if op == 'bi':
116 self.dset = self.dset.where((self.dset <= rl) & (self.dset >= ll)) 124 self.dset = self.dset.where(
125 (self.dset <= rl) & (self.dset >= ll)
126 )
117 elif op == 'le': 127 elif op == 'le':
118 self.dset = self.dset.where(self.dset <= ll) 128 self.dset = self.dset.where(self.dset <= ll)
119 elif op == 'ge': 129 elif op == 'ge':
120 self.dset = self.dset.where(self.dset >= ll) 130 self.dset = self.dset.where(self.dset >= ll)
121 elif op == 'e': 131 elif op == 'e':
139 self.datetime_selection() 149 self.datetime_selection()
140 if self.filter: 150 if self.filter:
141 self.filter_selection() 151 self.filter_selection()
142 152
143 self.area_selection() 153 self.area_selection()
144 # convert to dataframe 154 if self.gset.count() > 1:
145 self.gset = self.gset.to_dataframe().dropna(how='all').reset_index() 155 # convert to dataframe if several rows and cols
146 self.gset.to_csv(self.outfile, header=True, sep='\t') 156 self.gset = self.gset.to_dataframe().dropna(how='all'). \
157 reset_index()
158 self.gset.to_csv(self.outfile, header=True, sep='\t')
159 else:
160 data = {
161 self.latname: [self.gset[self.latname].values],
162 self.lonname: [self.gset[self.lonname].values],
163 self.select: [self.gset.values]
164 }
165
166 df = pd.DataFrame(data, columns=[self.latname, self.lonname,
167 self.select])
168 df.to_csv(self.outfile, header=True, sep='\t')
147 169
148 def datetime_selection(self): 170 def datetime_selection(self):
149 split_filter = self.time.split('#') 171 split_filter = self.time.split('#')
150 time_varname = split_filter[0] 172 time_varname = split_filter[0]
151 op = split_filter[1] 173 op = split_filter[1]
def filter_selection(self):
    """Apply every filter expression held in ``self.filter``.

    Each entry is handed, in order, to ``self.rowfilter``; nothing is
    returned.
    """
    for expression in self.filter:
        self.rowfilter(expression)
166 188
def area_selection(self):
    """Derive ``self.gset`` from ``self.dset`` according to the lat/lon inputs.

    Three cases, checked in this order:

    * ``latvalS`` and ``lonvalW`` are both set: select the box
      ``latvalS..latvalN`` x ``lonvalW..lonvalE`` with slices.
    * otherwise ``latvalN`` and ``lonvalE`` are both set: select the single
      nearest grid point.
    * otherwise: keep the dataset unchanged.
    """
    if self.latvalS != "" and self.lonvalW != "":
        # Select geographical area
        bounding_box = {
            self.latname: slice(self.latvalS, self.latvalN),
            self.lonname: slice(self.lonvalW, self.lonvalE),
        }
        self.gset = self.dset.sel(bounding_box)
        return

    if not (self.latvalN != "" and self.lonvalE != ""):
        # No usable geographical constraint: keep everything.
        self.gset = self.dset
        return

    # select nearest location
    if self.no_missing:
        # Use the requested point as-is.
        self.nearest_latvalN = self.latvalN
        self.nearest_lonvalE = self.lonvalE
    else:
        # find nearest location without NaN values
        self.nearest_location()

    target = {
        self.latname: self.nearest_latvalN,
        self.lonname: self.nearest_lonvalE,
    }
    lookup_options = {'method': 'nearest'}
    if self.tolerance > 0:
        # Only a strictly positive tolerance is forwarded to ``sel``.
        lookup_options['tolerance'] = self.tolerance
    self.gset = self.dset.sel(target, **lookup_options)
182 216
183 def nearest_location(self): 217 def nearest_location(self):
184 # Build a geopandas dataframe with all first elements in each dimension 218 # Build a geopandas dataframe with all first elements in each dimension
def selection_from_coords(self):
    """Run ``selection()`` once for every location listed in ``self.coords``.

    ``self.coords`` is read as a tab-separated file. For each row the
    first column is taken as the latitude and the second as the longitude,
    and the result is written to
    ``<outputdir>/<select>_<row number>.tabular``.

    Raises:
        ValueError: if the file has fewer than two columns.
    """
    # index_col=False keeps pandas from promoting the first file column to
    # the index, so column positions are the same for every input file.
    fcoords = pd.read_csv(self.coords, sep='\t', index_col=False)
    if fcoords.shape[1] < 2:
        raise ValueError(
            "coordinates file must contain at least two tab-separated "
            "columns (latitude, longitude): " + str(self.coords)
        )
    for row in fcoords.itertuples():
        # itertuples() puts the index at position 0; data columns start
        # at position 1. The previous code read positions 0 and 1, i.e.
        # the row number as latitude and the latitude as longitude.
        self.latvalN = row[1]
        self.lonvalE = row[2]
        self.outfile = os.path.join(
            self.outputdir,
            self.select + '_' + str(row.Index) + '.tabular'
        )
        self.selection()
247
def get_coords_info(self):
    """Dump the values of every coordinate of ``self.infile`` to disk.

    For each coordinate of the dataset a file ``<coordinate>.tabular`` is
    written into the ``self.coords_info`` directory. Each file is
    tab-separated, has no header, and pairs a 0-based position with the
    coordinate value found there.
    """
    # Create the target folder up front so to_csv does not fail on a
    # missing directory.
    os.makedirs(self.coords_info, exist_ok=True)
    # The context manager closes the dataset, which the previous code
    # left open.
    with xr.open_dataset(self.infile) as ds:
        for name in ds.coords:
            # str() guards against coordinate names that are not strings.
            filename = os.path.join(self.coords_info,
                                    str(name).strip() + '.tabular')
            # Not named ``pd``: that would shadow the pandas alias.
            values = ds.coords[name].to_pandas()
            # Replace the coordinate-valued index with plain positions.
            values.index = range(len(values))
            values.to_csv(filename, header=False, sep='\t')
211 257
212 258
213 if __name__ == '__main__': 259 if __name__ == '__main__':
214 warnings.filterwarnings("ignore") 260 warnings.filterwarnings("ignore")
215 parser = argparse.ArgumentParser() 261 parser = argparse.ArgumentParser()
253 parser.add_argument( 299 parser.add_argument(
254 '--lonvalW', 300 '--lonvalW',
255 help='West longitude value' 301 help='West longitude value'
256 ) 302 )
257 parser.add_argument( 303 parser.add_argument(
304 '--tolerance',
305 help='Maximum distance between original and selected value for '
306 ' inexact matches e.g. abs(index[indexer] - target) <= tolerance'
307 )
308 parser.add_argument(
258 '--coords', 309 '--coords',
259 help='Input file containing Latitude and Longitude' 310 help='Input file containing Latitude and Longitude'
260 'for geographical selection' 311 'for geographical selection'
261 ) 312 )
262 parser.add_argument( 313 parser.add_argument(
314 '--coords_info',
315 help='output-folder where for each coordinate, coordinate values '
316 ' are being printed in the corresponding outputfile'
317 )
318 parser.add_argument(
263 '--filter', 319 '--filter',
264 nargs="*", 320 nargs="*",
265 help='Filter list variable#operator#value_s#value_e' 321 help='Filter list variable#operator#value_s#value_e'
266 ) 322 )
267 parser.add_argument( 323 parser.add_argument(
279 '(valid only when --select)' 335 '(valid only when --select)'
280 ) 336 )
281 parser.add_argument( 337 parser.add_argument(
282 "-v", "--verbose", 338 "-v", "--verbose",
283 help="switch on verbose mode", 339 help="switch on verbose mode",
340 action="store_true"
341 )
342 parser.add_argument(
343 "--no_missing",
344 help="""Do not take into account possible null/missing values
345 (only valid for single location)""",
284 action="store_true" 346 action="store_true"
285 ) 347 )
286 args = parser.parse_args() 348 args = parser.parse_args()
287 349
288 p = XarrayTool(args.infile, args.info, args.summary, args.select, 350 p = XarrayTool(args.infile, args.info, args.summary, args.select,
289 args.outfile, args.outputdir, args.latname, 351 args.outfile, args.outputdir, args.latname,
290 args.latvalN, args.latvalS, args.lonname, 352 args.latvalN, args.latvalS, args.lonname,
291 args.lonvalE, args.lonvalW, args.filter, 353 args.lonvalE, args.lonvalW, args.filter,
292 args.coords, args.time, args.verbose) 354 args.coords, args.time, args.verbose,
355 args.no_missing, args.coords_info, args.tolerance)
293 if args.info: 356 if args.info:
294 p.info() 357 p.info()
295 if args.summary: 358 if args.summary:
296 p.summary() 359 p.summary()
297 if args.coords: 360 if args.coords:
298 p.selection_from_coords() 361 p.selection_from_coords()
299 elif args.select: 362 elif args.select:
300 p.selection() 363 p.selection()
364 elif args.coords_info:
365 p.get_coords_info()