comparison xarray_tool.py @ 0:b0780241f916 draft

"planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit 57b6d23e3734d883e71081c78e77964d61be82ba"
author ecology
date Sun, 06 Jun 2021 08:50:11 +0000
parents
children e87073edecd6
comparison
equal deleted inserted replaced
-1:000000000000 0:b0780241f916
1 # xarray tool for:
2 # - getting metadata information
3 # - select data and save results in csv file for further post-processing
4
5 import argparse
6 import csv
7 import os
8 import warnings
9
10 import geopandas as gdp
11
12 import pandas as pd
13
14 from shapely.geometry import Point
15 from shapely.ops import nearest_points
16
17 import xarray as xr
18
19
20 class XarrayTool ():
21 def __init__(self, infile, outfile_info="", outfile_summary="",
22 select="", outfile="", outputdir="", latname="",
23 latvalN="", latvalS="", lonname="", lonvalE="",
24 lonvalW="", filter_list="", coords="", time="",
25 verbose=False, no_missing=False, coords_info=None,
26 tolerance=None):
27 self.infile = infile
28 self.outfile_info = outfile_info
29 self.outfile_summary = outfile_summary
30 self.select = select
31 self.outfile = outfile
32 self.outputdir = outputdir
33 self.latname = latname
34 if tolerance != "" and tolerance is not None:
35 self.tolerance = float(tolerance)
36 else:
37 self.tolerance = -1
38 if latvalN != "" and latvalN is not None:
39 self.latvalN = float(latvalN)
40 else:
41 self.latvalN = ""
42 if latvalS != "" and latvalS is not None:
43 self.latvalS = float(latvalS)
44 else:
45 self.latvalS = ""
46 self.lonname = lonname
47 if lonvalE != "" and lonvalE is not None:
48 self.lonvalE = float(lonvalE)
49 else:
50 self.lonvalE = ""
51 if lonvalW != "" and lonvalW is not None:
52 self.lonvalW = float(lonvalW)
53 else:
54 self.lonvalW = ""
55 self.filter = filter_list
56 self.time = time
57 self.coords = coords
58 self.verbose = verbose
59 self.no_missing = no_missing
60 # initialization
61 self.dset = None
62 self.gset = None
63 self.coords_info = coords_info
64 if self.verbose:
65 print("infile: ", self.infile)
66 print("outfile_info: ", self.outfile_info)
67 print("outfile_summary: ", self.outfile_summary)
68 print("outfile: ", self.outfile)
69 print("select: ", self.select)
70 print("outfile: ", self.outfile)
71 print("outputdir: ", self.outputdir)
72 print("latname: ", self.latname)
73 print("latvalN: ", self.latvalN)
74 print("latvalS: ", self.latvalS)
75 print("lonname: ", self.lonname)
76 print("lonvalE: ", self.lonvalE)
77 print("lonvalW: ", self.lonvalW)
78 print("filter: ", self.filter)
79 print("time: ", self.time)
80 print("coords: ", self.coords)
81 print("coords_info: ", self.coords_info)
82
83 def info(self):
84 f = open(self.outfile_info, 'w')
85 ds = xr.open_dataset(self.infile)
86 ds.info(f)
87 f.close()
88
89 def summary(self):
90 f = open(self.outfile_summary, 'w')
91 ds = xr.open_dataset(self.infile)
92 writer = csv.writer(f, delimiter='\t')
93 header = ['VariableName', 'NumberOfDimensions']
94 for idx, val in enumerate(ds.dims.items()):
95 header.append('Dim' + str(idx) + 'Name')
96 header.append('Dim' + str(idx) + 'Size')
97 writer.writerow(header)
98 for name, da in ds.data_vars.items():
99 line = [name]
100 line.append(len(ds[name].shape))
101 for d, s in zip(da.shape, da.sizes):
102 line.append(s)
103 line.append(d)
104 writer.writerow(line)
105 for name, da in ds.coords.items():
106 line = [name]
107 line.append(len(ds[name].shape))
108 for d, s in zip(da.shape, da.sizes):
109 line.append(s)
110 line.append(d)
111 writer.writerow(line)
112 f.close()
113
114 def rowfilter(self, single_filter):
115 split_filter = single_filter.split('#')
116 filter_varname = split_filter[0]
117 op = split_filter[1]
118 ll = float(split_filter[2])
119 if (op == 'bi'):
120 rl = float(split_filter[3])
121 if filter_varname == self.select:
122 # filter on values of the selected variable
123 if op == 'bi':
124 self.dset = self.dset.where(
125 (self.dset <= rl) & (self.dset >= ll)
126 )
127 elif op == 'le':
128 self.dset = self.dset.where(self.dset <= ll)
129 elif op == 'ge':
130 self.dset = self.dset.where(self.dset >= ll)
131 elif op == 'e':
132 self.dset = self.dset.where(self.dset == ll)
133 else: # filter on other dimensions of the selected variable
134 if op == 'bi':
135 self.dset = self.dset.sel({filter_varname: slice(ll, rl)})
136 elif op == 'le':
137 self.dset = self.dset.sel({filter_varname: slice(None, ll)})
138 elif op == 'ge':
139 self.dset = self.dset.sel({filter_varname: slice(ll, None)})
140 elif op == 'e':
141 self.dset = self.dset.sel({filter_varname: ll},
142 method='nearest')
143
144 def selection(self):
145 if self.dset is None:
146 self.ds = xr.open_dataset(self.infile)
147 self.dset = self.ds[self.select] # select variable
148 if self.time:
149 self.datetime_selection()
150 if self.filter:
151 self.filter_selection()
152
153 self.area_selection()
154 if self.gset.count() > 1:
155 # convert to dataframe if several rows and cols
156 self.gset = self.gset.to_dataframe().dropna(how='all'). \
157 reset_index()
158 self.gset.to_csv(self.outfile, header=True, sep='\t')
159 else:
160 data = {
161 self.latname: [self.gset[self.latname].values],
162 self.lonname: [self.gset[self.lonname].values],
163 self.select: [self.gset.values]
164 }
165
166 df = pd.DataFrame(data, columns=[self.latname, self.lonname,
167 self.select])
168 df.to_csv(self.outfile, header=True, sep='\t')
169
170 def datetime_selection(self):
171 split_filter = self.time.split('#')
172 time_varname = split_filter[0]
173 op = split_filter[1]
174 ll = split_filter[2]
175 if (op == 'sl'):
176 rl = split_filter[3]
177 self.dset = self.dset.sel({time_varname: slice(ll, rl)})
178 elif (op == 'to'):
179 self.dset = self.dset.sel({time_varname: slice(None, ll)})
180 elif (op == 'from'):
181 self.dset = self.dset.sel({time_varname: slice(ll, None)})
182 elif (op == 'is'):
183 self.dset = self.dset.sel({time_varname: ll}, method='nearest')
184
185 def filter_selection(self):
186 for single_filter in self.filter:
187 self.rowfilter(single_filter)
188
189 def area_selection(self):
190
191 if self.latvalS != "" and self.lonvalW != "":
192 # Select geographical area
193 self.gset = self.dset.sel({self.latname:
194 slice(self.latvalS, self.latvalN),
195 self.lonname:
196 slice(self.lonvalW, self.lonvalE)})
197 elif self.latvalN != "" and self.lonvalE != "":
198 # select nearest location
199 if self.no_missing:
200 self.nearest_latvalN = self.latvalN
201 self.nearest_lonvalE = self.lonvalE
202 else:
203 # find nearest location without NaN values
204 self.nearest_location()
205 if self.tolerance > 0:
206 self.gset = self.dset.sel({self.latname: self.nearest_latvalN,
207 self.lonname: self.nearest_lonvalE},
208 method='nearest',
209 tolerance=self.tolerance)
210 else:
211 self.gset = self.dset.sel({self.latname: self.nearest_latvalN,
212 self.lonname: self.nearest_lonvalE},
213 method='nearest')
214 else:
215 self.gset = self.dset
216
217 def nearest_location(self):
218 # Build a geopandas dataframe with all first elements in each dimension
219 # so we assume null values correspond to a mask that is the same for
220 # all dimensions in the dataset.
221 dsel_frame = self.dset
222 for dim in self.dset.dims:
223 if dim != self.latname and dim != self.lonname:
224 dsel_frame = dsel_frame.isel({dim: 0})
225 # transform to pandas dataframe
226 dff = dsel_frame.to_dataframe().dropna().reset_index()
227 # transform to geopandas to collocate
228 gdf = gdp.GeoDataFrame(dff,
229 geometry=gdp.points_from_xy(dff[self.lonname],
230 dff[self.latname]))
231 # Find nearest location where values are not null
232 point = Point(self.lonvalE, self.latvalN)
233 multipoint = gdf.geometry.unary_union
234 queried_geom, nearest_geom = nearest_points(point, multipoint)
235 self.nearest_latvalN = nearest_geom.y
236 self.nearest_lonvalE = nearest_geom.x
237
238 def selection_from_coords(self):
239 fcoords = pd.read_csv(self.coords, sep='\t')
240 for row in fcoords.itertuples():
241 self.latvalN = row[0]
242 self.lonvalE = row[1]
243 self.outfile = (os.path.join(self.outputdir,
244 self.select + '_' +
245 str(row.Index) + '.tabular'))
246 self.selection()
247
248 def get_coords_info(self):
249 ds = xr.open_dataset(self.infile)
250 for c in ds.coords:
251 filename = os.path.join(self.coords_info,
252 c.strip() +
253 '.tabular')
254 pd = ds.coords[c].to_pandas()
255 pd.index = range(len(pd))
256 pd.to_csv(filename, header=False, sep='\t')
257
258
259 if __name__ == '__main__':
260 warnings.filterwarnings("ignore")
261 parser = argparse.ArgumentParser()
262
263 parser.add_argument(
264 'infile',
265 help='netCDF input filename'
266 )
267 parser.add_argument(
268 '--info',
269 help='Output filename where metadata information is stored'
270 )
271 parser.add_argument(
272 '--summary',
273 help='Output filename where data summary information is stored'
274 )
275 parser.add_argument(
276 '--select',
277 help='Variable name to select'
278 )
279 parser.add_argument(
280 '--latname',
281 help='Latitude name'
282 )
283 parser.add_argument(
284 '--latvalN',
285 help='North latitude value'
286 )
287 parser.add_argument(
288 '--latvalS',
289 help='South latitude value'
290 )
291 parser.add_argument(
292 '--lonname',
293 help='Longitude name'
294 )
295 parser.add_argument(
296 '--lonvalE',
297 help='East longitude value'
298 )
299 parser.add_argument(
300 '--lonvalW',
301 help='West longitude value'
302 )
303 parser.add_argument(
304 '--tolerance',
305 help='Maximum distance between original and selected value for '
306 ' inexact matches e.g. abs(index[indexer] - target) <= tolerance'
307 )
308 parser.add_argument(
309 '--coords',
310 help='Input file containing Latitude and Longitude'
311 'for geographical selection'
312 )
313 parser.add_argument(
314 '--coords_info',
315 help='output-folder where for each coordinate, coordinate values '
316 ' are being printed in the corresponding outputfile'
317 )
318 parser.add_argument(
319 '--filter',
320 nargs="*",
321 help='Filter list variable#operator#value_s#value_e'
322 )
323 parser.add_argument(
324 '--time',
325 help='select timeseries variable#operator#value_s[#value_e]'
326 )
327 parser.add_argument(
328 '--outfile',
329 help='csv outfile for storing results of the selection'
330 '(valid only when --select)'
331 )
332 parser.add_argument(
333 '--outputdir',
334 help='folder name for storing results with multiple selections'
335 '(valid only when --select)'
336 )
337 parser.add_argument(
338 "-v", "--verbose",
339 help="switch on verbose mode",
340 action="store_true"
341 )
342 parser.add_argument(
343 "--no_missing",
344 help="""Do not take into account possible null/missing values
345 (only valid for single location)""",
346 action="store_true"
347 )
348 args = parser.parse_args()
349
350 p = XarrayTool(args.infile, args.info, args.summary, args.select,
351 args.outfile, args.outputdir, args.latname,
352 args.latvalN, args.latvalS, args.lonname,
353 args.lonvalE, args.lonvalW, args.filter,
354 args.coords, args.time, args.verbose,
355 args.no_missing, args.coords_info, args.tolerance)
356 if args.info:
357 p.info()
358 if args.summary:
359 p.summary()
360 if args.coords:
361 p.selection_from_coords()
362 elif args.select:
363 p.selection()
364 elif args.coords_info:
365 p.get_coords_info()