Mercurial > repos > ecology > xarray_select
diff xarray_tool.py @ 2:123a9a629bef draft
"planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/data_manipulation/xarray/ commit 57b6d23e3734d883e71081c78e77964d61be82ba"
author | ecology |
---|---|
date | Sun, 06 Jun 2021 08:51:41 +0000 |
parents | 6baac361495b |
children | bf595d613af4 |
line wrap: on
line diff
--- a/xarray_tool.py Sat Oct 31 11:00:25 2020 +0000 +++ b/xarray_tool.py Sun Jun 06 08:51:41 2021 +0000 @@ -4,6 +4,7 @@ import argparse import csv +import os import warnings import geopandas as gdp @@ -21,8 +22,8 @@ select="", outfile="", outputdir="", latname="", latvalN="", latvalS="", lonname="", lonvalE="", lonvalW="", filter_list="", coords="", time="", - verbose=False - ): + verbose=False, no_missing=False, coords_info=None, + tolerance=None): self.infile = infile self.outfile_info = outfile_info self.outfile_summary = outfile_summary @@ -30,6 +31,10 @@ self.outfile = outfile self.outputdir = outputdir self.latname = latname + if tolerance != "" and tolerance is not None: + self.tolerance = float(tolerance) + else: + self.tolerance = -1 if latvalN != "" and latvalN is not None: self.latvalN = float(latvalN) else: @@ -51,9 +56,11 @@ self.time = time self.coords = coords self.verbose = verbose + self.no_missing = no_missing # initialization self.dset = None self.gset = None + self.coords_info = coords_info if self.verbose: print("infile: ", self.infile) print("outfile_info: ", self.outfile_info) @@ -71,6 +78,7 @@ print("filter: ", self.filter) print("time: ", self.time) print("coords: ", self.coords) + print("coords_info: ", self.coords_info) def info(self): f = open(self.outfile_info, 'w') @@ -113,7 +121,9 @@ if filter_varname == self.select: # filter on values of the selected variable if op == 'bi': - self.dset = self.dset.where((self.dset <= rl) & (self.dset >= ll)) + self.dset = self.dset.where( + (self.dset <= rl) & (self.dset >= ll) + ) elif op == 'le': self.dset = self.dset.where(self.dset <= ll) elif op == 'ge': @@ -141,9 +151,21 @@ self.filter_selection() self.area_selection() - # convert to dataframe - self.gset = self.gset.to_dataframe().dropna(how='all').reset_index() - self.gset.to_csv(self.outfile, header=True, sep='\t') + if self.gset.count() > 1: + # convert to dataframe if several rows and cols + self.gset = self.gset.to_dataframe().dropna(how='all'). \ + reset_index() + self.gset.to_csv(self.outfile, header=True, sep='\t') + else: + data = { + self.latname: [self.gset[self.latname].values], + self.lonname: [self.gset[self.lonname].values], + self.select: [self.gset.values] + } + + df = pd.DataFrame(data, columns=[self.latname, self.lonname, + self.select]) + df.to_csv(self.outfile, header=True, sep='\t') def datetime_selection(self): split_filter = self.time.split('#') @@ -165,6 +187,7 @@ self.rowfilter(single_filter) def area_selection(self): + if self.latvalS != "" and self.lonvalW != "": # Select geographical area self.gset = self.dset.sel({self.latname: @@ -173,10 +196,21 @@ slice(self.lonvalW, self.lonvalE)}) elif self.latvalN != "" and self.lonvalE != "": # select nearest location - self.nearest_location() # find nearest location without NaN values - self.gset = self.dset.sel({self.latname: self.nearest_latvalN, - self.lonname: self.nearest_lonvalE}, - method='nearest') + if self.no_missing: + self.nearest_latvalN = self.latvalN + self.nearest_lonvalE = self.lonvalE + else: + # find nearest location without NaN values + self.nearest_location() + if self.tolerance > 0: + self.gset = self.dset.sel({self.latname: self.nearest_latvalN, + self.lonname: self.nearest_lonvalE}, + method='nearest', + tolerance=self.tolerance) + else: + self.gset = self.dset.sel({self.latname: self.nearest_latvalN, + self.lonname: self.nearest_lonvalE}, + method='nearest') else: self.gset = self.dset @@ -206,9 +240,21 @@ for row in fcoords.itertuples(): self.latvalN = row[0] self.lonvalE = row[1] - self.outfile = (self.outputdir + '/' + self.select + '_' + str(row.Index) + '.tabular') + self.outfile = (os.path.join(self.outputdir, + self.select + '_' + + str(row.Index) + '.tabular')) self.selection() + def get_coords_info(self): + ds = xr.open_dataset(self.infile) + for c in ds.coords: + filename = os.path.join(self.coords_info, + c.strip() + + '.tabular') + pd = ds.coords[c].to_pandas() + pd.index = range(len(pd)) + pd.to_csv(filename, header=False, sep='\t') + if __name__ == '__main__': warnings.filterwarnings("ignore") @@ -255,11 +301,21 @@ help='West longitude value' ) parser.add_argument( + '--tolerance', + help='Maximum distance between original and selected value for ' + ' inexact matches e.g. abs(index[indexer] - target) <= tolerance' + ) + parser.add_argument( '--coords', help='Input file containing Latitude and Longitude' 'for geographical selection' ) parser.add_argument( + '--coords_info', + help='output-folder where for each coordinate, coordinate values ' + ' are being printed in the corresponding outputfile' + ) + parser.add_argument( '--filter', nargs="*", help='Filter list variable#operator#value_s#value_e' @@ -283,13 +339,20 @@ help="switch on verbose mode", action="store_true" ) + parser.add_argument( + "--no_missing", + help="""Do not take into account possible null/missing values + (only valid for single location)""", + action="store_true" + ) args = parser.parse_args() p = XarrayTool(args.infile, args.info, args.summary, args.select, args.outfile, args.outputdir, args.latname, args.latvalN, args.latvalS, args.lonname, args.lonvalE, args.lonvalW, args.filter, - args.coords, args.time, args.verbose) + args.coords, args.time, args.verbose, + args.no_missing, args.coords_info, args.tolerance) if args.info: p.info() if args.summary: @@ -298,3 +361,5 @@ p.selection_from_coords() elif args.select: p.selection() + elif args.coords_info: + p.get_coords_info()