Mercurial > repos > ecology > xarray_select
comparison xarray_tool.py @ 0:225d0d275a24 draft
"planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit f1455c158011dc4aab0fd469cf794be6f4142992"
author | ecology |
---|---|
date | Fri, 22 May 2020 05:19:15 -0400 |
parents | |
children | 6baac361495b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:225d0d275a24 |
---|---|
1 # xarray tool for: | |
2 # - getting metadata information | |
3 # - select data and save results in csv file for further post-processing | |
4 | |
5 import argparse | |
6 import csv | |
7 import warnings | |
8 | |
9 import geopandas as gdp | |
10 | |
11 import pandas as pd | |
12 | |
13 from shapely.geometry import Point | |
14 from shapely.ops import nearest_points | |
15 | |
16 import xarray as xr | |
17 | |
18 | |
19 class XarrayTool (): | |
20 def __init__(self, infile, outfile_info="", outfile_summary="", | |
21 select="", outfile="", outputdir="", latname="", | |
22 latvalN="", latvalS="", lonname="", lonvalE="", | |
23 lonvalW="", filter_list="", coords="", time="", | |
24 verbose=False | |
25 ): | |
26 self.infile = infile | |
27 self.outfile_info = outfile_info | |
28 self.outfile_summary = outfile_summary | |
29 self.select = select | |
30 self.outfile = outfile | |
31 self.outputdir = outputdir | |
32 self.latname = latname | |
33 if latvalN != "" and latvalN is not None: | |
34 self.latvalN = float(latvalN) | |
35 else: | |
36 self.latvalN = "" | |
37 if latvalS != "" and latvalS is not None: | |
38 self.latvalS = float(latvalS) | |
39 else: | |
40 self.latvalS = "" | |
41 self.lonname = lonname | |
42 if lonvalE != "" and lonvalE is not None: | |
43 self.lonvalE = float(lonvalE) | |
44 else: | |
45 self.lonvalE = "" | |
46 if lonvalW != "" and lonvalW is not None: | |
47 self.lonvalW = float(lonvalW) | |
48 else: | |
49 self.lonvalW = "" | |
50 self.filter = filter_list | |
51 self.time = time | |
52 self.coords = coords | |
53 self.verbose = verbose | |
54 # initialization | |
55 self.dset = None | |
56 self.gset = None | |
57 if self.verbose: | |
58 print("infile: ", self.infile) | |
59 print("outfile_info: ", self.outfile_info) | |
60 print("outfile_summary: ", self.outfile_summary) | |
61 print("outfile: ", self.outfile) | |
62 print("select: ", self.select) | |
63 print("outfile: ", self.outfile) | |
64 print("outputdir: ", self.outputdir) | |
65 print("latname: ", self.latname) | |
66 print("latvalN: ", self.latvalN) | |
67 print("latvalS: ", self.latvalS) | |
68 print("lonname: ", self.lonname) | |
69 print("lonvalE: ", self.lonvalE) | |
70 print("lonvalW: ", self.lonvalW) | |
71 print("filter: ", self.filter) | |
72 print("time: ", self.time) | |
73 print("coords: ", self.coords) | |
74 | |
75 def info(self): | |
76 f = open(self.outfile_info, 'w') | |
77 ds = xr.open_dataset(self.infile) | |
78 ds.info(f) | |
79 f.close() | |
80 | |
81 def summary(self): | |
82 f = open(self.outfile_summary, 'w') | |
83 ds = xr.open_dataset(self.infile) | |
84 writer = csv.writer(f, delimiter='\t') | |
85 header = ['VariableName', 'NumberOfDimensions'] | |
86 for idx, val in enumerate(ds.dims.items()): | |
87 header.append('Dim'+str(idx)+'Name') | |
88 header.append('Dim'+str(idx)+'Size') | |
89 writer.writerow(header) | |
90 for name, da in ds.data_vars.items(): | |
91 line = [name] | |
92 line.append(len(ds[name].shape)) | |
93 for d, s in zip(da.shape, da.sizes): | |
94 line.append(s) | |
95 line.append(d) | |
96 writer.writerow(line) | |
97 for name, da in ds.coords.items(): | |
98 line = [name] | |
99 line.append(len(ds[name].shape)) | |
100 for d, s in zip(da.shape, da.sizes): | |
101 line.append(s) | |
102 line.append(d) | |
103 writer.writerow(line) | |
104 f.close() | |
105 | |
106 def rowfilter(self, single_filter): | |
107 split_filter = single_filter.split('#') | |
108 filter_varname = split_filter[0] | |
109 op = split_filter[1] | |
110 ll = float(split_filter[2]) | |
111 if (op == 'bi'): | |
112 rl = float(split_filter[3]) | |
113 if filter_varname == self.select: | |
114 # filter on values of the selected variable | |
115 if op == 'bi': | |
116 self.dset = self.dset.where((self.dset <= rl) & | |
117 (self.dset >= ll)) | |
118 elif op == 'le': | |
119 self.dset = self.dset.where(self.dset <= ll) | |
120 elif op == 'ge': | |
121 self.dset = self.dset.where(self.dset >= ll) | |
122 elif op == 'e': | |
123 self.dset = self.dset.where(self.dset == ll) | |
124 else: # filter on other dimensions of the selected variable | |
125 if op == 'bi': | |
126 self.dset = self.dset.sel({filter_varname: slice(ll, rl)}) | |
127 elif op == 'le': | |
128 self.dset = self.dset.sel({filter_varname: slice(None, ll)}) | |
129 elif op == 'ge': | |
130 self.dset = self.dset.sel({filter_varname: slice(ll, None)}) | |
131 elif op == 'e': | |
132 self.dset = self.dset.sel({filter_varname: ll}, | |
133 method='nearest') | |
134 | |
135 def selection(self): | |
136 if self.dset is None: | |
137 self.ds = xr.open_dataset(self.infile) | |
138 self.dset = self.ds[self.select] # select variable | |
139 if self.time: | |
140 self.datetime_selection() | |
141 if self.filter: | |
142 self.filter_selection() | |
143 | |
144 self.area_selection() | |
145 # convert to dataframe | |
146 self.gset = self.gset.to_dataframe().dropna(how='all').reset_index() | |
147 self.gset.to_csv(self.outfile, header=True, sep='\t') | |
148 | |
149 def datetime_selection(self): | |
150 split_filter = self.time.split('#') | |
151 time_varname = split_filter[0] | |
152 op = split_filter[1] | |
153 ll = split_filter[2] | |
154 if (op == 'sl'): | |
155 rl = split_filter[3] | |
156 self.dset = self.dset.sel({time_varname: slice(ll, rl)}) | |
157 elif (op == 'to'): | |
158 self.dset = self.dset.sel({time_varname: slice(None, ll)}) | |
159 elif (op == 'from'): | |
160 self.dset = self.dset.sel({time_varname: slice(ll, None)}) | |
161 elif (op == 'is'): | |
162 self.dset = self.dset.sel({time_varname: ll}, method='nearest') | |
163 | |
164 def filter_selection(self): | |
165 for single_filter in self.filter: | |
166 self.rowfilter(single_filter) | |
167 | |
168 def area_selection(self): | |
169 if self.latvalS != "" and self.lonvalW != "": | |
170 # Select geographical area | |
171 self.gset = self.dset.sel({self.latname: | |
172 slice(self.latvalS, self.latvalN), | |
173 self.lonname: | |
174 slice(self.lonvalW, self.lonvalE)}) | |
175 elif self.latvalN != "" and self.lonvalE != "": | |
176 # select nearest location | |
177 self.nearest_location() # find nearest location without NaN values | |
178 self.gset = self.dset.sel({self.latname: self.nearest_latvalN, | |
179 self.lonname: self.nearest_lonvalE}, | |
180 method='nearest') | |
181 else: | |
182 self.gset = self.dset | |
183 | |
184 def nearest_location(self): | |
185 # Build a geopandas dataframe with all first elements in each dimension | |
186 # so we assume null values correspond to a mask that is the same for | |
187 # all dimensions in the dataset. | |
188 dsel_frame = self.dset | |
189 for dim in self.dset.dims: | |
190 if dim != self.latname and dim != self.lonname: | |
191 dsel_frame = dsel_frame.isel({dim: 0}) | |
192 # transform to pandas dataframe | |
193 dff = dsel_frame.to_dataframe().dropna().reset_index() | |
194 # transform to geopandas to collocate | |
195 gdf = gdp.GeoDataFrame(dff, | |
196 geometry=gdp.points_from_xy(dff[self.lonname], | |
197 dff[self.latname])) | |
198 # Find nearest location where values are not null | |
199 point = Point(self.lonvalE, self.latvalN) | |
200 multipoint = gdf.geometry.unary_union | |
201 queried_geom, nearest_geom = nearest_points(point, multipoint) | |
202 self.nearest_latvalN = nearest_geom.y | |
203 self.nearest_lonvalE = nearest_geom.x | |
204 | |
205 def selection_from_coords(self): | |
206 fcoords = pd.read_csv(self.coords, sep='\t') | |
207 for row in fcoords.itertuples(): | |
208 self.latvalN = row[0] | |
209 self.lonvalE = row[1] | |
210 self.outfile = (self.outputdir + '/' + self.select + '_' | |
211 + str(row.Index) + '.tabular') | |
212 self.selection() | |
213 | |
214 | |
215 if __name__ == '__main__': | |
216 warnings.filterwarnings("ignore") | |
217 parser = argparse.ArgumentParser() | |
218 | |
219 parser.add_argument( | |
220 'infile', | |
221 help='netCDF input filename' | |
222 ) | |
223 parser.add_argument( | |
224 '--info', | |
225 help='Output filename where metadata information is stored' | |
226 ) | |
227 parser.add_argument( | |
228 '--summary', | |
229 help='Output filename where data summary information is stored' | |
230 ) | |
231 parser.add_argument( | |
232 '--select', | |
233 help='Variable name to select' | |
234 ) | |
235 parser.add_argument( | |
236 '--latname', | |
237 help='Latitude name' | |
238 ) | |
239 parser.add_argument( | |
240 '--latvalN', | |
241 help='North latitude value' | |
242 ) | |
243 parser.add_argument( | |
244 '--latvalS', | |
245 help='South latitude value' | |
246 ) | |
247 parser.add_argument( | |
248 '--lonname', | |
249 help='Longitude name' | |
250 ) | |
251 parser.add_argument( | |
252 '--lonvalE', | |
253 help='East longitude value' | |
254 ) | |
255 parser.add_argument( | |
256 '--lonvalW', | |
257 help='West longitude value' | |
258 ) | |
259 parser.add_argument( | |
260 '--coords', | |
261 help='Input file containing Latitude and Longitude' | |
262 'for geographical selection' | |
263 ) | |
264 parser.add_argument( | |
265 '--filter', | |
266 nargs="*", | |
267 help='Filter list variable#operator#value_s#value_e' | |
268 ) | |
269 parser.add_argument( | |
270 '--time', | |
271 help='select timeseries variable#operator#value_s[#value_e]' | |
272 ) | |
273 parser.add_argument( | |
274 '--outfile', | |
275 help='csv outfile for storing results of the selection' | |
276 '(valid only when --select)' | |
277 ) | |
278 parser.add_argument( | |
279 '--outputdir', | |
280 help='folder name for storing results with multiple selections' | |
281 '(valid only when --select)' | |
282 ) | |
283 parser.add_argument( | |
284 "-v", "--verbose", | |
285 help="switch on verbose mode", | |
286 action="store_true" | |
287 ) | |
288 args = parser.parse_args() | |
289 | |
290 p = XarrayTool(args.infile, args.info, args.summary, args.select, | |
291 args.outfile, args.outputdir, args.latname, | |
292 args.latvalN, args.latvalS, args.lonname, | |
293 args.lonvalE, args.lonvalW, args.filter, | |
294 args.coords, args.time, args.verbose) | |
295 if args.info: | |
296 p.info() | |
297 if args.summary: | |
298 p.summary() | |
299 if args.coords: | |
300 p.selection_from_coords() | |
301 elif args.select: | |
302 p.selection() |