comparison xarray_tool.py @ 1:7edbe5ae8b72 draft

"planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit f1455c158011dc4aab0fd469cf794be6f4142992"
author ecology
date Fri, 22 May 2020 05:18:55 -0400
parents
children e8650cdf092f
comparison
equal deleted inserted replaced
0:965fcab0cd9f 1:7edbe5ae8b72
1 # xarray tool for:
2 # - getting metadata information
3 # - select data and save results in csv file for further post-processing
4
5 import argparse
6 import csv
7 import warnings
8
9 import geopandas as gdp
10
11 import pandas as pd
12
13 from shapely.geometry import Point
14 from shapely.ops import nearest_points
15
16 import xarray as xr
17
18
19 class XarrayTool ():
20 def __init__(self, infile, outfile_info="", outfile_summary="",
21 select="", outfile="", outputdir="", latname="",
22 latvalN="", latvalS="", lonname="", lonvalE="",
23 lonvalW="", filter_list="", coords="", time="",
24 verbose=False
25 ):
26 self.infile = infile
27 self.outfile_info = outfile_info
28 self.outfile_summary = outfile_summary
29 self.select = select
30 self.outfile = outfile
31 self.outputdir = outputdir
32 self.latname = latname
33 if latvalN != "" and latvalN is not None:
34 self.latvalN = float(latvalN)
35 else:
36 self.latvalN = ""
37 if latvalS != "" and latvalS is not None:
38 self.latvalS = float(latvalS)
39 else:
40 self.latvalS = ""
41 self.lonname = lonname
42 if lonvalE != "" and lonvalE is not None:
43 self.lonvalE = float(lonvalE)
44 else:
45 self.lonvalE = ""
46 if lonvalW != "" and lonvalW is not None:
47 self.lonvalW = float(lonvalW)
48 else:
49 self.lonvalW = ""
50 self.filter = filter_list
51 self.time = time
52 self.coords = coords
53 self.verbose = verbose
54 # initialization
55 self.dset = None
56 self.gset = None
57 if self.verbose:
58 print("infile: ", self.infile)
59 print("outfile_info: ", self.outfile_info)
60 print("outfile_summary: ", self.outfile_summary)
61 print("outfile: ", self.outfile)
62 print("select: ", self.select)
63 print("outfile: ", self.outfile)
64 print("outputdir: ", self.outputdir)
65 print("latname: ", self.latname)
66 print("latvalN: ", self.latvalN)
67 print("latvalS: ", self.latvalS)
68 print("lonname: ", self.lonname)
69 print("lonvalE: ", self.lonvalE)
70 print("lonvalW: ", self.lonvalW)
71 print("filter: ", self.filter)
72 print("time: ", self.time)
73 print("coords: ", self.coords)
74
75 def info(self):
76 f = open(self.outfile_info, 'w')
77 ds = xr.open_dataset(self.infile)
78 ds.info(f)
79 f.close()
80
81 def summary(self):
82 f = open(self.outfile_summary, 'w')
83 ds = xr.open_dataset(self.infile)
84 writer = csv.writer(f, delimiter='\t')
85 header = ['VariableName', 'NumberOfDimensions']
86 for idx, val in enumerate(ds.dims.items()):
87 header.append('Dim'+str(idx)+'Name')
88 header.append('Dim'+str(idx)+'Size')
89 writer.writerow(header)
90 for name, da in ds.data_vars.items():
91 line = [name]
92 line.append(len(ds[name].shape))
93 for d, s in zip(da.shape, da.sizes):
94 line.append(s)
95 line.append(d)
96 writer.writerow(line)
97 for name, da in ds.coords.items():
98 line = [name]
99 line.append(len(ds[name].shape))
100 for d, s in zip(da.shape, da.sizes):
101 line.append(s)
102 line.append(d)
103 writer.writerow(line)
104 f.close()
105
106 def rowfilter(self, single_filter):
107 split_filter = single_filter.split('#')
108 filter_varname = split_filter[0]
109 op = split_filter[1]
110 ll = float(split_filter[2])
111 if (op == 'bi'):
112 rl = float(split_filter[3])
113 if filter_varname == self.select:
114 # filter on values of the selected variable
115 if op == 'bi':
116 self.dset = self.dset.where((self.dset <= rl) &
117 (self.dset >= ll))
118 elif op == 'le':
119 self.dset = self.dset.where(self.dset <= ll)
120 elif op == 'ge':
121 self.dset = self.dset.where(self.dset >= ll)
122 elif op == 'e':
123 self.dset = self.dset.where(self.dset == ll)
124 else: # filter on other dimensions of the selected variable
125 if op == 'bi':
126 self.dset = self.dset.sel({filter_varname: slice(ll, rl)})
127 elif op == 'le':
128 self.dset = self.dset.sel({filter_varname: slice(None, ll)})
129 elif op == 'ge':
130 self.dset = self.dset.sel({filter_varname: slice(ll, None)})
131 elif op == 'e':
132 self.dset = self.dset.sel({filter_varname: ll},
133 method='nearest')
134
135 def selection(self):
136 if self.dset is None:
137 self.ds = xr.open_dataset(self.infile)
138 self.dset = self.ds[self.select] # select variable
139 if self.time:
140 self.datetime_selection()
141 if self.filter:
142 self.filter_selection()
143
144 self.area_selection()
145 # convert to dataframe
146 self.gset = self.gset.to_dataframe().dropna(how='all').reset_index()
147 self.gset.to_csv(self.outfile, header=True, sep='\t')
148
149 def datetime_selection(self):
150 split_filter = self.time.split('#')
151 time_varname = split_filter[0]
152 op = split_filter[1]
153 ll = split_filter[2]
154 if (op == 'sl'):
155 rl = split_filter[3]
156 self.dset = self.dset.sel({time_varname: slice(ll, rl)})
157 elif (op == 'to'):
158 self.dset = self.dset.sel({time_varname: slice(None, ll)})
159 elif (op == 'from'):
160 self.dset = self.dset.sel({time_varname: slice(ll, None)})
161 elif (op == 'is'):
162 self.dset = self.dset.sel({time_varname: ll}, method='nearest')
163
164 def filter_selection(self):
165 for single_filter in self.filter:
166 self.rowfilter(single_filter)
167
168 def area_selection(self):
169 if self.latvalS != "" and self.lonvalW != "":
170 # Select geographical area
171 self.gset = self.dset.sel({self.latname:
172 slice(self.latvalS, self.latvalN),
173 self.lonname:
174 slice(self.lonvalW, self.lonvalE)})
175 elif self.latvalN != "" and self.lonvalE != "":
176 # select nearest location
177 self.nearest_location() # find nearest location without NaN values
178 self.gset = self.dset.sel({self.latname: self.nearest_latvalN,
179 self.lonname: self.nearest_lonvalE},
180 method='nearest')
181 else:
182 self.gset = self.dset
183
184 def nearest_location(self):
185 # Build a geopandas dataframe with all first elements in each dimension
186 # so we assume null values correspond to a mask that is the same for
187 # all dimensions in the dataset.
188 dsel_frame = self.dset
189 for dim in self.dset.dims:
190 if dim != self.latname and dim != self.lonname:
191 dsel_frame = dsel_frame.isel({dim: 0})
192 # transform to pandas dataframe
193 dff = dsel_frame.to_dataframe().dropna().reset_index()
194 # transform to geopandas to collocate
195 gdf = gdp.GeoDataFrame(dff,
196 geometry=gdp.points_from_xy(dff[self.lonname],
197 dff[self.latname]))
198 # Find nearest location where values are not null
199 point = Point(self.lonvalE, self.latvalN)
200 multipoint = gdf.geometry.unary_union
201 queried_geom, nearest_geom = nearest_points(point, multipoint)
202 self.nearest_latvalN = nearest_geom.y
203 self.nearest_lonvalE = nearest_geom.x
204
205 def selection_from_coords(self):
206 fcoords = pd.read_csv(self.coords, sep='\t')
207 for row in fcoords.itertuples():
208 self.latvalN = row[0]
209 self.lonvalE = row[1]
210 self.outfile = (self.outputdir + '/' + self.select + '_'
211 + str(row.Index) + '.tabular')
212 self.selection()
213
214
215 if __name__ == '__main__':
216 warnings.filterwarnings("ignore")
217 parser = argparse.ArgumentParser()
218
219 parser.add_argument(
220 'infile',
221 help='netCDF input filename'
222 )
223 parser.add_argument(
224 '--info',
225 help='Output filename where metadata information is stored'
226 )
227 parser.add_argument(
228 '--summary',
229 help='Output filename where data summary information is stored'
230 )
231 parser.add_argument(
232 '--select',
233 help='Variable name to select'
234 )
235 parser.add_argument(
236 '--latname',
237 help='Latitude name'
238 )
239 parser.add_argument(
240 '--latvalN',
241 help='North latitude value'
242 )
243 parser.add_argument(
244 '--latvalS',
245 help='South latitude value'
246 )
247 parser.add_argument(
248 '--lonname',
249 help='Longitude name'
250 )
251 parser.add_argument(
252 '--lonvalE',
253 help='East longitude value'
254 )
255 parser.add_argument(
256 '--lonvalW',
257 help='West longitude value'
258 )
259 parser.add_argument(
260 '--coords',
261 help='Input file containing Latitude and Longitude'
262 'for geographical selection'
263 )
264 parser.add_argument(
265 '--filter',
266 nargs="*",
267 help='Filter list variable#operator#value_s#value_e'
268 )
269 parser.add_argument(
270 '--time',
271 help='select timeseries variable#operator#value_s[#value_e]'
272 )
273 parser.add_argument(
274 '--outfile',
275 help='csv outfile for storing results of the selection'
276 '(valid only when --select)'
277 )
278 parser.add_argument(
279 '--outputdir',
280 help='folder name for storing results with multiple selections'
281 '(valid only when --select)'
282 )
283 parser.add_argument(
284 "-v", "--verbose",
285 help="switch on verbose mode",
286 action="store_true"
287 )
288 args = parser.parse_args()
289
290 p = XarrayTool(args.infile, args.info, args.summary, args.select,
291 args.outfile, args.outputdir, args.latname,
292 args.latvalN, args.latvalS, args.lonname,
293 args.lonvalE, args.lonvalW, args.filter,
294 args.coords, args.time, args.verbose)
295 if args.info:
296 p.info()
297 if args.summary:
298 p.summary()
299 if args.coords:
300 p.selection_from_coords()
301 elif args.select:
302 p.selection()