Mercurial > repos > ecology > xarray_metadata_info
comparison xarray_tool.py @ 5:00de53d18b99 draft default tip
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit fd8ad4d97db7b1fd3876ff63e14280474e06fdf7
author | ecology |
---|---|
date | Sun, 31 Jul 2022 21:22:03 +0000 |
parents | 9bbaab36a5d4 |
children |
comparison
equal
deleted
inserted
replaced
4:9bbaab36a5d4 | 5:00de53d18b99 |
---|---|
1 # xarray tool for: | |
2 # - getting metadata information | |
3 # - select data and save results in csv file for further post-processing | |
4 | |
5 import argparse | |
6 import csv | |
7 import os | |
8 import warnings | |
9 | |
10 import geopandas as gdp | |
11 | |
12 import pandas as pd | |
13 | |
14 from shapely.geometry import Point | |
15 from shapely.ops import nearest_points | |
16 | |
17 import xarray as xr | |
18 | |
19 | |
20 class XarrayTool (): | |
21 def __init__(self, infile, outfile_info="", outfile_summary="", | |
22 select="", outfile="", outputdir="", latname="", | |
23 latvalN="", latvalS="", lonname="", lonvalE="", | |
24 lonvalW="", filter_list="", coords="", time="", | |
25 verbose=False, no_missing=False, coords_info=None, | |
26 tolerance=None): | |
27 self.infile = infile | |
28 self.outfile_info = outfile_info | |
29 self.outfile_summary = outfile_summary | |
30 self.select = select | |
31 self.outfile = outfile | |
32 self.outputdir = outputdir | |
33 self.latname = latname | |
34 if tolerance != "" and tolerance is not None: | |
35 self.tolerance = float(tolerance) | |
36 else: | |
37 self.tolerance = -1 | |
38 if latvalN != "" and latvalN is not None: | |
39 self.latvalN = float(latvalN) | |
40 else: | |
41 self.latvalN = "" | |
42 if latvalS != "" and latvalS is not None: | |
43 self.latvalS = float(latvalS) | |
44 else: | |
45 self.latvalS = "" | |
46 self.lonname = lonname | |
47 if lonvalE != "" and lonvalE is not None: | |
48 self.lonvalE = float(lonvalE) | |
49 else: | |
50 self.lonvalE = "" | |
51 if lonvalW != "" and lonvalW is not None: | |
52 self.lonvalW = float(lonvalW) | |
53 else: | |
54 self.lonvalW = "" | |
55 self.filter = filter_list | |
56 self.time = time | |
57 self.coords = coords | |
58 self.verbose = verbose | |
59 self.no_missing = no_missing | |
60 # initialization | |
61 self.dset = None | |
62 self.gset = None | |
63 self.coords_info = coords_info | |
64 if self.verbose: | |
65 print("infile: ", self.infile) | |
66 print("outfile_info: ", self.outfile_info) | |
67 print("outfile_summary: ", self.outfile_summary) | |
68 print("outfile: ", self.outfile) | |
69 print("select: ", self.select) | |
70 print("outfile: ", self.outfile) | |
71 print("outputdir: ", self.outputdir) | |
72 print("latname: ", self.latname) | |
73 print("latvalN: ", self.latvalN) | |
74 print("latvalS: ", self.latvalS) | |
75 print("lonname: ", self.lonname) | |
76 print("lonvalE: ", self.lonvalE) | |
77 print("lonvalW: ", self.lonvalW) | |
78 print("filter: ", self.filter) | |
79 print("time: ", self.time) | |
80 print("coords: ", self.coords) | |
81 print("coords_info: ", self.coords_info) | |
82 | |
83 def info(self): | |
84 f = open(self.outfile_info, 'w') | |
85 ds = xr.open_dataset(self.infile) | |
86 ds.info(f) | |
87 f.close() | |
88 | |
89 def summary(self): | |
90 f = open(self.outfile_summary, 'w') | |
91 ds = xr.open_dataset(self.infile) | |
92 writer = csv.writer(f, delimiter='\t') | |
93 header = ['VariableName', 'NumberOfDimensions'] | |
94 for idx, val in enumerate(ds.dims.items()): | |
95 header.append('Dim' + str(idx) + 'Name') | |
96 header.append('Dim' + str(idx) + 'Size') | |
97 writer.writerow(header) | |
98 for name, da in ds.data_vars.items(): | |
99 line = [name] | |
100 line.append(len(ds[name].shape)) | |
101 for d, s in zip(da.shape, da.sizes): | |
102 line.append(s) | |
103 line.append(d) | |
104 writer.writerow(line) | |
105 for name, da in ds.coords.items(): | |
106 line = [name] | |
107 line.append(len(ds[name].shape)) | |
108 for d, s in zip(da.shape, da.sizes): | |
109 line.append(s) | |
110 line.append(d) | |
111 writer.writerow(line) | |
112 f.close() | |
113 | |
114 def rowfilter(self, single_filter): | |
115 split_filter = single_filter.split('#') | |
116 filter_varname = split_filter[0] | |
117 op = split_filter[1] | |
118 ll = float(split_filter[2]) | |
119 if (op == 'bi'): | |
120 rl = float(split_filter[3]) | |
121 if filter_varname == self.select: | |
122 # filter on values of the selected variable | |
123 if op == 'bi': | |
124 self.dset = self.dset.where( | |
125 (self.dset <= rl) & (self.dset >= ll) | |
126 ) | |
127 elif op == 'le': | |
128 self.dset = self.dset.where(self.dset <= ll) | |
129 elif op == 'ge': | |
130 self.dset = self.dset.where(self.dset >= ll) | |
131 elif op == 'e': | |
132 self.dset = self.dset.where(self.dset == ll) | |
133 else: # filter on other dimensions of the selected variable | |
134 if op == 'bi': | |
135 self.dset = self.dset.sel({filter_varname: slice(ll, rl)}) | |
136 elif op == 'le': | |
137 self.dset = self.dset.sel({filter_varname: slice(None, ll)}) | |
138 elif op == 'ge': | |
139 self.dset = self.dset.sel({filter_varname: slice(ll, None)}) | |
140 elif op == 'e': | |
141 self.dset = self.dset.sel({filter_varname: ll}, | |
142 method='nearest') | |
143 | |
144 def selection(self): | |
145 if self.dset is None: | |
146 self.ds = xr.open_dataset(self.infile) | |
147 self.dset = self.ds[self.select] # select variable | |
148 if self.time: | |
149 self.datetime_selection() | |
150 if self.filter: | |
151 self.filter_selection() | |
152 | |
153 self.area_selection() | |
154 if self.gset.count() > 1: | |
155 # convert to dataframe if several rows and cols | |
156 self.gset = self.gset.to_dataframe().dropna(how='all'). \ | |
157 reset_index() | |
158 self.gset.to_csv(self.outfile, header=True, sep='\t') | |
159 else: | |
160 data = { | |
161 self.latname: [self.gset[self.latname].values], | |
162 self.lonname: [self.gset[self.lonname].values], | |
163 self.select: [self.gset.values] | |
164 } | |
165 | |
166 df = pd.DataFrame(data, columns=[self.latname, self.lonname, | |
167 self.select]) | |
168 df.to_csv(self.outfile, header=True, sep='\t') | |
169 | |
170 def datetime_selection(self): | |
171 split_filter = self.time.split('#') | |
172 time_varname = split_filter[0] | |
173 op = split_filter[1] | |
174 ll = split_filter[2] | |
175 if (op == 'sl'): | |
176 rl = split_filter[3] | |
177 self.dset = self.dset.sel({time_varname: slice(ll, rl)}) | |
178 elif (op == 'to'): | |
179 self.dset = self.dset.sel({time_varname: slice(None, ll)}) | |
180 elif (op == 'from'): | |
181 self.dset = self.dset.sel({time_varname: slice(ll, None)}) | |
182 elif (op == 'is'): | |
183 self.dset = self.dset.sel({time_varname: ll}, method='nearest') | |
184 | |
185 def filter_selection(self): | |
186 for single_filter in self.filter: | |
187 self.rowfilter(single_filter) | |
188 | |
189 def area_selection(self): | |
190 | |
191 if self.latvalS != "" and self.lonvalW != "": | |
192 # Select geographical area | |
193 self.gset = self.dset.sel({self.latname: | |
194 slice(self.latvalS, self.latvalN), | |
195 self.lonname: | |
196 slice(self.lonvalW, self.lonvalE)}) | |
197 elif self.latvalN != "" and self.lonvalE != "": | |
198 # select nearest location | |
199 if self.no_missing: | |
200 self.nearest_latvalN = self.latvalN | |
201 self.nearest_lonvalE = self.lonvalE | |
202 else: | |
203 # find nearest location without NaN values | |
204 self.nearest_location() | |
205 if self.tolerance > 0: | |
206 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | |
207 self.lonname: self.nearest_lonvalE}, | |
208 method='nearest', | |
209 tolerance=self.tolerance) | |
210 else: | |
211 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | |
212 self.lonname: self.nearest_lonvalE}, | |
213 method='nearest') | |
214 else: | |
215 self.gset = self.dset | |
216 | |
217 def nearest_location(self): | |
218 # Build a geopandas dataframe with all first elements in each dimension | |
219 # so we assume null values correspond to a mask that is the same for | |
220 # all dimensions in the dataset. | |
221 dsel_frame = self.dset | |
222 for dim in self.dset.dims: | |
223 if dim != self.latname and dim != self.lonname: | |
224 dsel_frame = dsel_frame.isel({dim: 0}) | |
225 # transform to pandas dataframe | |
226 dff = dsel_frame.to_dataframe().dropna().reset_index() | |
227 # transform to geopandas to collocate | |
228 gdf = gdp.GeoDataFrame(dff, | |
229 geometry=gdp.points_from_xy(dff[self.lonname], | |
230 dff[self.latname])) | |
231 # Find nearest location where values are not null | |
232 point = Point(self.lonvalE, self.latvalN) | |
233 multipoint = gdf.geometry.unary_union | |
234 queried_geom, nearest_geom = nearest_points(point, multipoint) | |
235 self.nearest_latvalN = nearest_geom.y | |
236 self.nearest_lonvalE = nearest_geom.x | |
237 | |
238 def selection_from_coords(self): | |
239 fcoords = pd.read_csv(self.coords, sep='\t') | |
240 for row in fcoords.itertuples(): | |
241 self.latvalN = row[0] | |
242 self.lonvalE = row[1] | |
243 self.outfile = (os.path.join(self.outputdir, | |
244 self.select + '_' + | |
245 str(row.Index) + '.tabular')) | |
246 self.selection() | |
247 | |
248 def get_coords_info(self): | |
249 ds = xr.open_dataset(self.infile) | |
250 for c in ds.coords: | |
251 filename = os.path.join(self.coords_info, | |
252 c.strip() + | |
253 '.tabular') | |
254 pd = ds.coords[c].to_pandas() | |
255 pd.index = range(len(pd)) | |
256 pd.to_csv(filename, header=False, sep='\t') | |
257 | |
258 | |
259 if __name__ == '__main__': | |
260 warnings.filterwarnings("ignore") | |
261 parser = argparse.ArgumentParser() | |
262 | |
263 parser.add_argument( | |
264 'infile', | |
265 help='netCDF input filename' | |
266 ) | |
267 parser.add_argument( | |
268 '--info', | |
269 help='Output filename where metadata information is stored' | |
270 ) | |
271 parser.add_argument( | |
272 '--summary', | |
273 help='Output filename where data summary information is stored' | |
274 ) | |
275 parser.add_argument( | |
276 '--select', | |
277 help='Variable name to select' | |
278 ) | |
279 parser.add_argument( | |
280 '--latname', | |
281 help='Latitude name' | |
282 ) | |
283 parser.add_argument( | |
284 '--latvalN', | |
285 help='North latitude value' | |
286 ) | |
287 parser.add_argument( | |
288 '--latvalS', | |
289 help='South latitude value' | |
290 ) | |
291 parser.add_argument( | |
292 '--lonname', | |
293 help='Longitude name' | |
294 ) | |
295 parser.add_argument( | |
296 '--lonvalE', | |
297 help='East longitude value' | |
298 ) | |
299 parser.add_argument( | |
300 '--lonvalW', | |
301 help='West longitude value' | |
302 ) | |
303 parser.add_argument( | |
304 '--tolerance', | |
305 help='Maximum distance between original and selected value for ' | |
306 ' inexact matches e.g. abs(index[indexer] - target) <= tolerance' | |
307 ) | |
308 parser.add_argument( | |
309 '--coords', | |
310 help='Input file containing Latitude and Longitude' | |
311 'for geographical selection' | |
312 ) | |
313 parser.add_argument( | |
314 '--coords_info', | |
315 help='output-folder where for each coordinate, coordinate values ' | |
316 ' are being printed in the corresponding outputfile' | |
317 ) | |
318 parser.add_argument( | |
319 '--filter', | |
320 nargs="*", | |
321 help='Filter list variable#operator#value_s#value_e' | |
322 ) | |
323 parser.add_argument( | |
324 '--time', | |
325 help='select timeseries variable#operator#value_s[#value_e]' | |
326 ) | |
327 parser.add_argument( | |
328 '--outfile', | |
329 help='csv outfile for storing results of the selection' | |
330 '(valid only when --select)' | |
331 ) | |
332 parser.add_argument( | |
333 '--outputdir', | |
334 help='folder name for storing results with multiple selections' | |
335 '(valid only when --select)' | |
336 ) | |
337 parser.add_argument( | |
338 "-v", "--verbose", | |
339 help="switch on verbose mode", | |
340 action="store_true" | |
341 ) | |
342 parser.add_argument( | |
343 "--no_missing", | |
344 help="""Do not take into account possible null/missing values | |
345 (only valid for single location)""", | |
346 action="store_true" | |
347 ) | |
348 args = parser.parse_args() | |
349 | |
350 p = XarrayTool(args.infile, args.info, args.summary, args.select, | |
351 args.outfile, args.outputdir, args.latname, | |
352 args.latvalN, args.latvalS, args.lonname, | |
353 args.lonvalE, args.lonvalW, args.filter, | |
354 args.coords, args.time, args.verbose, | |
355 args.no_missing, args.coords_info, args.tolerance) | |
356 if args.info: | |
357 p.info() | |
358 if args.summary: | |
359 p.summary() | |
360 if args.coords: | |
361 p.selection_from_coords() | |
362 elif args.select: | |
363 p.selection() | |
364 elif args.coords_info: | |
365 p.get_coords_info() |