Source code for xbout.load

from copy import copy
from warnings import warn
from pathlib import Path
from functools import partial
from itertools import chain

from boutdata.data import BoutOptionsFile
import xarray as xr

from natsort import natsorted

from . import geometries
from .utils import (
    _set_attrs_on_all_vars,
    _separate_metadata,
    _check_filetype,
    _is_path,
    _is_dir,
)

_BOUT_GEOMETRY_VARS = [
    "ixseps1",
    "ixseps2",
    "jyseps1_1",
    "jyseps2_1",
    "jyseps1_2",
    "jyseps2_2",
    "nx",
    "ny",
    "ny_inner",
]


# This code should run whenever any function from this module is imported
# Set all attrs to survive all mathematical operations
# (see https://github.com/pydata/xarray/pull/2482)
try:
    xr.set_options(keep_attrs=True)
except ValueError:
    raise ImportError(
        "For dataset attributes to be permanent you need to be "
        "using the development version of xarray - found at "
        "https://github.com/pydata/xarray/"
    )
try:
    xr.set_options(file_cache_maxsize=256)
except ValueError:
    raise ImportError(
        "For open and closing of netCDF files correctly you need"
        " to be using the development version of xarray - found"
        " at https://github.com/pydata/xarray/"
    )
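

# Illustrative sketch, not part of xbout: shows the intended effect of the
# ``keep_attrs`` option set above, namely that variable attributes (here a
# hypothetical "units" attribute) survive arithmetic operations.  Exact
# behaviour depends on the xarray version.
def _example_keep_attrs_sketch():
    import numpy as np

    da = xr.DataArray(np.arange(3.0), dims="x", attrs={"units": "m"})
    result = da + 1
    # With keep_attrs=True the "units" attribute is expected to still be present
    return result.attrs.get("units")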


def open_boutdataset(
    datapath="./BOUT.dmp.*.nc",
    inputfilepath=None,
    geometry=None,
    gridfilepath=None,
    grid_mismatch="raise",
    chunks=None,
    keep_xboundaries=True,
    keep_yboundaries=False,
    run_name=None,
    info=True,
    is_restart=None,
    **kwargs,
):
    """Load a dataset from a set of BOUT output files, including the input options
    file. Can also load from a grid file or from restart files.

    Note that when reloading a Dataset that was saved by xBOUT, the state of the
    saved Dataset is restored, and the values of ``keep_xboundaries``,
    ``keep_yboundaries``, and ``run_name`` are ignored. ``geometry`` is treated
    specially, and can be passed when reloading a Dataset (along with
    ``gridfilepath`` if needed).

    Troubleshooting
    ---------------
    Variable conflicts: sometimes, for example when loading data from multiple
    restarts, some variables may have conflicts (e.g. a source term was changed
    between some of the restarts, but the source term is saved as
    time-independent, without a t-dimension). In this case one workaround is to
    pass a list of variable names to the keyword argument ``drop_variables`` to
    ignore the variables with conflicts, e.g. if ``"S1"`` and ``"S2"`` have
    conflicts::

        ds = open_boutdataset("data*/boutdata.nc", drop_variables=["S1", "S2"])

    will open a Dataset which is missing ``"S1"`` and ``"S2"``
    (``drop_variables`` is an argument of `xarray.open_dataset` that is passed
    down through ``kwargs``.)

    Parameters
    ----------
    datapath : str or (list or tuple of xr.Dataset), optional
        Path to the data to open. Can point to either a set of one or more dump
        files, or a single grid file.

        To specify multiple dump files you must enter the path to them as a
        single glob, e.g. './BOUT.dmp.*.nc', or for multiple consecutive runs in
        different directories (in order) then './run*/BOUT.dmp.*.nc'.

        If a list or tuple of xr.Dataset is passed, they will be combined with
        xr.combine_nested() instead of loading data from disk (intended for unit
        testing).
    chunks : dict, optional
    inputfilepath : str, optional
    geometry : str, optional
        The geometry type of the grid data. This will specify what type of
        coordinates to add to the dataset, e.g. 'toroidal' or 'cylindrical'.

        If not specified then will attempt to read it from the file attrs.
        If still not found then a warning will be thrown, which can be
        suppressed by passing ``info=False``.

        To define a new type of geometry you need to use the
        `register_geometry` decorator. You are encouraged to do this for your
        own BOUT++ physics module, to apply relevant normalisations.
    gridfilepath : str, optional
        The path to a grid file, containing any variables needed to apply the
        geometry specified by the 'geometry' option, which are not contained in
        the dump files. This may either be the path of the grid file itself, or
        the directory relative to which the grid named in the settings file can
        be found.
    grid_mismatch : str, optional
        How to handle the case where the loaded grid is not the grid that was
        used for the simulation. Can be "raise" to raise a ValueError, "warn" to
        issue a warning, or "ignore" to ignore the mismatch silently.
    keep_xboundaries : bool, optional
        If true, keep x-direction boundary cells (the cells past the physical
        edges of the grid, where boundary conditions are set); increases the
        size of the x dimension in the returned data-set. If false, trim these
        cells.
    keep_yboundaries : bool, optional
        If true, keep y-direction boundary cells (the cells past the physical
        edges of the grid, where boundary conditions are set); increases the
        size of the y dimension in the returned data-set. If false, trim these
        cells.
    run_name : str, optional
        Name to give to the whole dataset, e.g. 'JET_ELM_high_resolution'.
        Useful if you are going to open multiple simulations and compare the
        results.
    info : bool or "terse", optional
    is_restart : bool, optional
        Restart files require some special handling (e.g. working around
        variables that are not present in restart files). By default, this
        special handling is enabled if the files do not have a time dimension
        and ``restart`` is present in the file name in ``datapath``. This option
        can be set to True or False to explicitly enable or disable the restart
        file handling.
    kwargs : optional
        Keyword arguments are passed down to `xarray.open_mfdataset`, which in
        turn passes extra kwargs down to `xarray.open_dataset`.

    Returns
    -------
    ds : xarray.Dataset
    """

    if chunks is None:
        chunks = {}

    input_type = _check_dataset_type(datapath)
    if is_restart is None:
        is_restart = input_type == "restart"
    elif is_restart is True:
        input_type = "restart"

    if "reload" in input_type:
        if input_type == "reload":
            if isinstance(datapath, Path):
                # xr.open_mfdataset only accepts glob patterns as strings, not
                # Path objects
                datapath = str(datapath)
            ds = xr.open_mfdataset(
                datapath,
                chunks=chunks,
                combine="by_coords",
                data_vars="minimal",
                **kwargs,
            )
        elif input_type == "reload_fake":
            ds = xr.combine_by_coords(datapath, data_vars="minimal").chunk(chunks)
        else:
            raise ValueError(f"internal error: unexpected input_type={input_type}")

        def attrs_to_dict(obj, section):
            result = {}
            section = section + ":"
            sectionlength = len(section)
            for key in list(obj.attrs):
                if key[:sectionlength] == section:
                    val = obj.attrs.pop(key)
                    if isinstance(val, bytes):
                        val = val.decode()
                    result[key[sectionlength:]] = val
            return result

        def attrs_remove_section(obj, section):
            section = section + ":"
            sectionlength = len(section)
            has_metadata = False
            for key in list(obj.attrs):
                if key[:sectionlength] == section:
                    has_metadata = True
                    del obj.attrs[key]
            return has_metadata

        # Restore metadata from attrs
        metadata = attrs_to_dict(ds, "metadata")
        if "is_restart" not in metadata:
            # Loading data that was saved with a version of xbout from before
            # "is_restart" was added, so need to add it to the metadata.
            metadata["is_restart"] = int(is_restart)
        ds.attrs["metadata"] = metadata
        # Must do this for all variables and coordinates in dataset too
        for da in chain(ds.data_vars.values(), ds.coords.values()):
            if attrs_remove_section(da, "metadata"):
                da.attrs["metadata"] = metadata

        ds = _add_options(ds, inputfilepath)

        # If geometry was set, apply geometry again
        if geometry is not None:
            if geometry != ds.attrs.get("geometry", None):
                warn(
                    f'open_boutdataset() called with geometry="{geometry}", but we are '
                    f"reloading a Dataset that was saved after being loaded with "
                    f'geometry="{ds.attrs.get("geometry", None)}". Applying '
                    f'geometry="{geometry}" from the argument.'
                )

            if gridfilepath is not None:
                grid = _open_grid(
                    gridfilepath,
                    chunks=chunks,
                    keep_xboundaries=ds.metadata["keep_xboundaries"],
                    keep_yboundaries=ds.metadata["keep_yboundaries"],
                    mxg=ds.metadata["MXG"],
                )
            else:
                grid = None
            ds = geometries.apply_geometry(ds, geometry, grid=grid)
        elif "geometry" in ds.attrs:
            ds = geometries.apply_geometry(ds, ds.attrs["geometry"])
        else:
            ds = geometries.apply_geometry(ds, None)

        if info == "terse":
            print("Read in dataset from {}".format(str(Path(datapath))))
        elif info:
            print("Read in:\n{}".format(ds.bout))

        return ds

    # Determine if file is a grid file or data dump files
    remove_yboundaries = False
    if "dump" in input_type or "restart" in input_type:
        # Gather pointers to all numerical data from BOUT++ output files
        ds, remove_yboundaries = _auto_open_mfboutdataset(
            datapath=datapath,
            chunks=chunks,
            keep_xboundaries=keep_xboundaries,
            keep_yboundaries=keep_yboundaries,
            is_restart=is_restart,
            **kwargs,
        )
    elif "grid" in input_type:
        # It's a grid file
        ds = _open_grid(
            datapath,
            chunks=chunks,
            keep_xboundaries=keep_xboundaries,
            keep_yboundaries=keep_yboundaries,
            **kwargs,
        )
    else:
        raise ValueError(f"internal error: unexpected input_type={input_type}")

    ds, metadata = _separate_metadata(ds)
    # Store as ints because netCDF doesn't support bools, so we can't save bool
    # attributes
    metadata["keep_xboundaries"] = int(keep_xboundaries)
    metadata["keep_yboundaries"] = int(keep_yboundaries)
    metadata["is_restart"] = int(is_restart)
    ds = _set_attrs_on_all_vars(ds, "metadata", metadata)

    if remove_yboundaries:
        # If remove_yboundaries is True, we need to keep y-boundaries when opening
        # the grid file, as they will be removed from the full Dataset below
        keep_yboundaries = True

    ds = _add_options(ds, inputfilepath)

    if geometry is None:
        if "geometry" in ds.attrs:
            geometry = ds.attrs.get("geometry")

    if geometry:
        if info:
            print("Applying {} geometry conventions".format(geometry))

        if _is_dir(gridfilepath):
            if "grid" in ds.options:
                gridfilepath += "/" + ds.options["grid"]
            else:
                warn(
                    "gridfilepath set to a directory, but no grid used "
                    "in simulation. Continuing without grid."
                )

        if gridfilepath is not None:
            grid = _open_grid(
                gridfilepath,
                chunks=chunks,
                keep_xboundaries=keep_xboundaries,
                keep_yboundaries=keep_yboundaries,
                mxg=ds.metadata["MXG"],
            )
        else:
            grid = None
    else:
        grid = None
        if info:
            warn("No geometry type found, no physical coordinates will be added")

    if grid:
        grididoptions = ds.metadata.get("grid_id", None)
        grididfile = grid.get("grid_id", None)
        if grididoptions and grididfile and grididfile != grididoptions:
            msg = (
                f"The grid file loaded does not match the one used for the "
                f"simulation: the simulation used {grididoptions}, but "
                f"{grididfile} was loaded."
            )
            if grid_mismatch == "warn":
                warn(msg)
            elif grid_mismatch == "ignore":
                pass
            else:
                raise ValueError(msg)
        for v in _BOUT_GEOMETRY_VARS:
            if v not in ds.metadata and v in grid:
                ds.metadata[v] = grid[v].values

    # Update coordinates to match particular geometry of grid
    ds = geometries.apply_geometry(ds, geometry, grid=grid)

    if remove_yboundaries:
        ds = ds.bout.remove_yboundaries()

    # TODO read and store git commit hashes from output files

    if run_name:
        ds.name = run_name

    # Set some default settings that are only used in post-processing by xBOUT,
    # not by BOUT++
    ds.bout.fine_interpolation_factor = 8

    if ("dump" in input_type or "restart" in input_type) and ds.metadata[
        "BOUT_VERSION"
    ] < 4.0:
        # Add workarounds for missing information or different conventions in
        # data saved by BOUT++ v3.x.
        for v in ds:
            if ds.metadata["bout_zdim"] in ds[v].dims:
                # All fields saved on aligned grid for BOUT-3
                ds[v].attrs["direction_y"] = "Aligned"

            added_location = False
            if any(
                d in ds[v].dims
                for d in (
                    ds.metadata["bout_xdim"],
                    ds.metadata["bout_ydim"],
                    ds.metadata["bout_zdim"],
                )
            ):
                # zShift, etc. did not support staggered grids in BOUT++ v3
                # anyway, so just treat all variables as if they were at
                # CELL_CENTRE
                ds[v].attrs["cell_location"] = "CELL_CENTRE"
                added_location = True
        if added_location:
            warn(
                "Detected data from BOUT++ v3.x. Treating all variables"
                " as being at `CELL_CENTRE`. Should be similar to what"
                " BOUT++ v3.x did, but if your code uses staggered grids,"
                " this may produce unexpected effects in some places."
            )

        if "nz" not in ds.metadata:
            # `nz` used to be stored as `MZ` and `MZ` used to include an extra
            # buffer point that was not used for data.
            ds.metadata["nz"] = ds.metadata["MZ"] - 1

    if info == "terse":
        print("Read in dataset from {}".format(str(Path(datapath))))
    elif info:
        print("Read in:\n{}".format(ds.bout))

    return ds


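# Illustrative sketch, not part of the module: typical calls to
# open_boutdataset().  The paths ("run0/BOUT.dmp.*.nc", "run0/BOUT.inp",
# "bout.grd.nc") and the "toroidal" geometry are hypothetical examples.
def _example_open_boutdataset_usage():
    # Open a set of dump files, attaching the input options file
    ds = open_boutdataset(
        "run0/BOUT.dmp.*.nc",
        inputfilepath="run0/BOUT.inp",
    )

    # Open the same data with a grid file so that toroidal coordinates can be
    # constructed, keeping the y-boundary cells
    ds_toroidal = open_boutdataset(
        "run0/BOUT.dmp.*.nc",
        geometry="toroidal",
        gridfilepath="bout.grd.nc",
        keep_yboundaries=True,
    )
    return ds, ds_toroidal

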
def _add_options(ds, inputfilepath):
    if inputfilepath:
        # Use Ben's options class to store all input file options
        options = BoutOptionsFile(
            inputfilepath,
            nx=ds.metadata["nx"],
            ny=ds.metadata["ny"],
            nz=ds.metadata["nz"],
        )
    else:
        options = None
    ds = _set_attrs_on_all_vars(ds, "options", options)
    return ds


def collect(
    varname,
    xind=None,
    yind=None,
    zind=None,
    tind=None,
    path=".",
    yguards=False,
    xguards=True,
    info=True,
    prefix="BOUT.dmp",
):
    """Extract the data pertaining to a specified variable in a BOUT++ data set

    Parameters
    ----------
    varname : str
        Name of the variable
    xind, yind, zind, tind : int, slice or list of int, optional
        Range of X, Y, Z or time indices to collect. Either a single index to
        collect, a list containing [start, end] (inclusive end), or a slice
        object (usual python indexing). Default is to fetch all indices
    path : str, optional
        Path to data files (default: ".")
    prefix : str, optional
        File prefix (default: "BOUT.dmp")
    yguards : bool, optional
        Collect Y boundary guard cells? (default: False)
    xguards : bool, optional
        Collect X boundary guard cells? (default: True)
        (Set to True to be consistent with the definition of nx)
    info : bool, optional
        Print information about collect? (default: True)

    Notes
    -----
    strict : This option, found in boutdata.collect(), is not present in this
        function. It is assumed that the varname given is correct; if the
        variable does not exist, the function will fail.
    tind_auto : This option is not required when using
        _auto_open_mfboutdataset, as it automatically fails if the datasets
        have different lengths.

    Returns
    -------
    ds : numpy.ndarray
    """
    from os.path import join

    datapath = join(path, prefix + "*.nc")
    ds, _ = _auto_open_mfboutdataset(
        datapath, keep_xboundaries=xguards, keep_yboundaries=yguards, info=info
    )

    if varname not in ds:
        raise KeyError("No variable, {} was found in {}.".format(varname, datapath))

    dims = list(ds.dims)
    inds = [tind, xind, yind, zind]

    selection = {}

    # Convert indexing values to an isel suitable format
    for dim, ind in zip(dims, inds):
        if isinstance(ind, int):
            indexer = [ind]
        elif isinstance(ind, list):
            start, end = ind
            indexer = slice(start, end + 1)
        elif ind is not None:
            indexer = ind
        else:
            indexer = None

        if indexer:
            selection[dim] = indexer

    try:
        version = ds["BOUT_VERSION"]
    except KeyError:
        # If BOUT Version is not saved in the dataset
        version = 0

    # Subtraction of z-dimensional data occurs in boutdata.collect if BOUT++
    # version is old - same feature added here
    if (version < 3.5) and ("z" in dims):
        zsize = int(ds["nz"]) - 1
        ds = ds.isel(z=slice(zsize))

    if selection:
        ds = ds.isel(selection)

    result = ds[varname].values

    # Close netCDF files to ensure they are not locked if collect is called again
    ds.close()

    return result


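# Illustrative sketch, not part of the module: reading a single variable with
# collect().  The variable name "n" and the directory "data" are hypothetical.
def _example_collect_usage():
    # Read time indices 0..10 (inclusive) of "n" from ./data/BOUT.dmp*.nc,
    # keeping x guard cells but not y guard cells (the defaults)
    n = collect("n", tind=[0, 10], path="data", prefix="BOUT.dmp")
    return n

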
def _check_dataset_type(datapath):
    """
    Check what type of files we have. Could be:

    (i) produced by xBOUT - one or several files, include metadata attributes,
        e.g. 'metadata:keep_yboundaries'
    (ii) grid file - only one file, and no time dimension
    (iii) produced by BOUT++ - one or several files
    (iv) restart files produced by BOUT++ - one or several files, no time
         dimension, filenames include `restart`
    """
    if not _is_path(datapath):
        # not a filepath glob, so presumably Dataset or list of Datasets used
        # for testing
        if isinstance(datapath, xr.Dataset):
            if "metadata:keep_yboundaries" in datapath.attrs:
                # (i)
                return "reload_fake"
            elif "t" in datapath.dims:
                # (iii)
                return "dump_fake"
            else:
                # (ii)
                return "grid_fake"
        elif len(datapath) > 1:
            if "metadata:keep_yboundaries" in datapath[0].attrs:
                # (i)
                return "reload_fake"
            else:
                # (iii)
                return "dump_fake"
        else:
            # Single element list of Datasets, or nested list of Datasets
            return _check_dataset_type(datapath[0])

    filepaths, filetype = _expand_filepaths(datapath)

    ds = xr.open_dataset(filepaths[0], engine=filetype)
    ds.close()
    if "metadata:keep_yboundaries" in ds.attrs:
        # (i)
        return "reload"
    elif "t" in ds.dims:
        # (iii)
        return "dump"
    elif all(["restart" in Path(p).name for p in filepaths]):
        # (iv)
        return "restart"
    elif len(filepaths) == 1:
        # (ii)
        return "grid"
    else:
        # fall back to opening as dump files
        return "dump"


def _auto_open_mfboutdataset(
    datapath,
    chunks=None,
    info=True,
    keep_xboundaries=False,
    keep_yboundaries=False,
    is_restart=False,
    **kwargs,
):
    if chunks is None:
        chunks = {}

    if _is_path(datapath):
        filepaths, filetype = _expand_filepaths(datapath)

        # Open just one file to read processor splitting
        nxpe, nype, mxg, myg, mxsub, mysub, is_squashed_doublenull = _read_splitting(
            filepaths[0], info, keep_yboundaries
        )

        if is_squashed_doublenull:
            # Need to remove y-boundaries after loading: (i) in case we are
            # loading a squashed data-set, in which case we cannot easily remove
            # the upper boundary cells in _trim(); (ii) because using the
            # remove_yboundaries() method for non-squashed data-sets is simpler
            # than replicating that logic in _trim().
            remove_yboundaries = not keep_yboundaries
            keep_yboundaries = True
        else:
            remove_yboundaries = False

        # Create a partial application of _trim
        # Calls to _preprocess will call _trim to trim guard / boundary cells
        # from datasets before merging.
        _preprocess = partial(
            _trim,
            guards={"x": mxg, "y": myg},
            keep_boundaries={"x": keep_xboundaries, "y": keep_yboundaries},
            nxpe=nxpe,
            nype=nype,
            is_restart=is_restart,
        )

        paths_grid, concat_dims = _arrange_for_concatenation(filepaths, nxpe, nype)

        ds = xr.open_mfdataset(
            paths_grid,
            concat_dim=concat_dims,
            combine="nested",
            preprocess=_preprocess,
            engine=filetype,
            chunks=chunks,
            # Only data variables in which the dimension already appears are
            # concatenated.
            data_vars="minimal",
            # Only coordinates in which the dimension already appears are
            # concatenated.
            coords="minimal",
            # Duplicate data taken from first dataset
            compat="override",
            # Duplicate attributes taken from first dataset
            combine_attrs="override",
            # Don't align. Raise ValueError when indexes to be aligned are not
            # equal
            join="exact",
            **kwargs,
        )
    else:
        # datapath was nested list of Datasets
        if isinstance(datapath, xr.Dataset):
            # normalise as one-element list
            datapath = [datapath]

        mxg = int(datapath[0]["MXG"])
        myg = int(datapath[0]["MYG"])
        nxpe = int(datapath[0]["NXPE"])
        nype = int(datapath[0]["NYPE"])

        is_squashed_doublenull = (
            len(datapath) == 1
            and (datapath[0]["jyseps2_1"] != datapath[0]["jyseps1_2"]).values
        )
        if is_squashed_doublenull:
            # Need to remove y-boundaries after loading when loading a squashed
            # data-set, in which case we cannot easily remove the upper boundary
            # cells in _trim().
            remove_yboundaries = not keep_yboundaries
            keep_yboundaries = True
        else:
            remove_yboundaries = False

        _preprocess = partial(
            _trim,
            guards={"x": mxg, "y": myg},
            keep_boundaries={"x": keep_xboundaries, "y": keep_yboundaries},
            nxpe=nxpe,
            nype=nype,
            is_restart=is_restart,
        )

        datapath = [_preprocess(x) for x in datapath]

        ds_grid, concat_dims = _arrange_for_concatenation(datapath, nxpe, nype)

        ds = xr.combine_nested(
            ds_grid,
            concat_dim=concat_dims,
            join="exact",
            # Only data variables in which the dimension already appears are
            # concatenated.
            data_vars="minimal",
            # Only coordinates in which the dimension already appears are
            # concatenated.
            coords="minimal",
            # Duplicate data taken from first dataset
            compat="override",
            # Duplicate attributes taken from first dataset
            combine_attrs="override",
        )

    return ds, remove_yboundaries


def _expand_filepaths(datapath):
    """Determines filetypes and opens all dump files."""
    path = Path(datapath)

    filetype = _check_filetype(path)

    filepaths = _expand_wildcards(path)

    if not filepaths:
        raise IOError("No datafiles found matching datapath={}".format(datapath))

    if len(filepaths) > 128:
        warn(
            "Trying to open a large number of files - setting xarray's"
            " `file_cache_maxsize` global option to {} to accommodate this. "
            "Recommend using `xr.set_options(file_cache_maxsize=NUM)`"
            " to explicitly set this to a large enough value.".format(
                str(len(filepaths))
            )
        )
        xr.set_options(file_cache_maxsize=len(filepaths))

    return filepaths, filetype


def _expand_wildcards(path):
    """Return list of filepaths matching wildcard"""

    # Find first parent directory which does not contain a wildcard
    base_dir = Path(path.anchor)

    # Find path relative to parent
    search_pattern = str(path.relative_to(base_dir))

    # Search this relative path from the parent directory for all files
    # matching user input
    filepaths = list(base_dir.glob(search_pattern))

    # Sort by numbers in filepath before returning
    # Use "natural" sort to avoid lexicographic ordering of numbers
    # e.g. ['0', '1', '10', '11', etc.]
    return natsorted(filepaths, key=lambda filepath: str(filepath))


def _read_splitting(filepath, info, keep_yboundaries):
    ds = xr.open_dataset(str(filepath))

    # Account for case of no parallelisation, when nxpe etc won't be in dataset
    def get_nonnegative_scalar(ds, key, default=1, info=True):
        if key in ds:
            val = ds[key].values
            if val < 0:
                raise ValueError(
                    f"{key} read from dump files is {val}, but negative"
                    f" values are not valid"
                )
            else:
                return val
        else:
            if info is True:
                print(f"{key} not found, setting to {default}")
            if default < 0:
                raise ValueError(
                    f"Default for {key} is {default},"
                    f" but negative values are not valid"
                )
            return default

    nxpe = get_nonnegative_scalar(ds, "NXPE", default=1, info=info)
    nype = get_nonnegative_scalar(ds, "NYPE", default=1, info=info)
    mxg = get_nonnegative_scalar(ds, "MXG", default=2, info=info)
    myg = get_nonnegative_scalar(ds, "MYG", default=0, info=info)
    mxsub = get_nonnegative_scalar(
        ds, "MXSUB", default=ds.sizes["x"] - 2 * mxg, info=info
    )
    mysub = get_nonnegative_scalar(
        ds, "MYSUB", default=ds.sizes["y"] - 2 * myg, info=info
    )

    # Check whether this is a single file squashed from the multiple output
    # files of a parallel run (i.e. NXPE*NYPE > 1 even though there is only a
    # single file to read).
    if "nx" in ds:
        nx = ds["nx"].values
    else:
        # Workaround for older data files
        nx = ds["MXSUB"].values * ds["NXPE"].values + 2 * ds["MXG"].values
    if "ny" in ds:
        ny = ds["ny"].values
    else:
        # Workaround for older data files
        ny = ds["MYSUB"].values * ds["NYPE"].values
    nx_file = ds.sizes["x"]
    ny_file = ds.sizes["y"]
    is_squashed_doublenull = False
    if nxpe > 1 or nype > 1:
        # if nxpe = nype = 1, was only one process anyway, so no need to check
        # for squashing
        if nx_file == nx or nx_file == nx - 2 * mxg:
            has_xboundaries = nx_file == nx
            if not has_xboundaries:
                mxg = 0

            # Check if there are two divertor targets present
            # Note: if jyseps2_1 and jyseps1_2 are not in ds it probably
            # indicates older data and likely the upper target boundary cells
            # were not saved anyway, so continue as if they were not.
            if "jyseps2_1" in ds and ds["jyseps1_2"] > ds["jyseps2_1"]:
                upper_target_cells = myg
            else:
                upper_target_cells = 0
            if ny_file == ny or ny_file == ny + 2 * myg + 2 * upper_target_cells:
                # This file contains all the points, possibly including guard
                # cells
                has_yboundaries = not (ny_file == ny)
                if not has_yboundaries:
                    myg = 0

                nxpe = 1
                nype = 1

                if "jyseps2_1" in ds:
                    is_squashed_doublenull = (
                        ds["jyseps2_1"] != ds["jyseps1_2"]
                    ).values
                else:
                    # For older data with no jyseps2_1 or jyseps1_2 in the
                    # dataset, probably do not need to handle double null data
                    # squashed with upper target points.
                    is_squashed_doublenull = False
            elif ny_file == ny + 2 * myg:
                # Older squashed file from double-null grid but containing only
                # lower target boundary cells.
                if keep_yboundaries:
                    raise ValueError(
                        "Cannot keep y-boundary points: squashed file is "
                        "missing upper target boundary points."
                    )
                has_yboundaries = not (ny_file == ny)
                if not has_yboundaries:
                    myg = 0

                nxpe = 1
                nype = 1
                # For this case, do not need the special handling enabled by
                # is_squashed_doublenull=True, as keeping y-boundaries is not
                # allowed
                is_squashed_doublenull = False

    # Avoid trying to open this file twice
    ds.close()

    return nxpe, nype, mxg, myg, mxsub, mysub, is_squashed_doublenull


def _arrange_for_concatenation(filepaths, nxpe=1, nype=1):
    """
    Arrange filepaths into a nested list-of-lists which represents their
    ordering across different processors and consecutive simulation runs.

    Filepaths must be a sorted list.

    Uses the fact that BOUT's output files are numbered as num = nxpe*i + j,
    where i={0, ..., nype-1}, j={0, ..., nxpe-1}. Also assumes that any
    consecutive simulation runs are in directories which when sorted are in the
    correct order (e.g. /run0/*, /run1/*, ...).
    """
    nprocs = nxpe * nype
    n_runs = int(len(filepaths) / nprocs)
    runids = []

    def getrunid(fp):
        if _is_path(fp):
            try:
                with xr.open_dataset(fp) as tmp:
                    return tmp.get("run_id", None)
            except FileNotFoundError:
                return None
        return fp.get("run_id", None)

    for fp in filepaths:
        thisrunid = getrunid(fp)
        if thisrunid is None:
            runids = None
            break
        runids.append(thisrunid)

    if not runids:
        if len(filepaths) < nprocs:
            if len(filepaths) == 1:
                raise ValueError(
                    "A parallel simulation was loaded, but only a single "
                    "file was loaded. Please ensure you pass in all files "
                    "by specifying e.g. `BOUT.dmp.*.nc` rather than "
                    "`BOUT.dmp.0.nc`."
                )
            raise ValueError(
                f"A parallel simulation was loaded, but only {len(filepaths)} "
                "files were loaded. Please ensure you pass in all files "
                "by specifying e.g. `BOUT.dmp.*.nc`"
            )
        if len(filepaths) % nprocs != 0:
            raise ValueError(
                "Each run directory does not contain an equal number "
                "of output files. If the parallelization scheme of "
                "your simulation changed partway-through, then please "
                "load each directory separately and concatenate them "
                "along the time dimension with xarray.concat()."
            )

        # Create list of lists of filepaths, so that xarray knows how they
        # should be concatenated by xarray.open_mfdataset()
        paths = iter(filepaths)
        paths_grid = [
            [[next(paths) for x in range(nxpe)] for y in range(nype)]
            for t in range(n_runs)
        ]
    else:
        paths_sorted = []
        lastid = None
        for path, gid in zip(filepaths, runids):
            if lastid != gid:
                lastid = gid
                paths_sorted.append([])
            paths_sorted[-1].append(path)

        paths_grid = []
        for paths in paths_sorted:
            if len(paths) != nprocs:
                with xr.open_dataset(paths[0]) as tmp:
                    if tmp["PE_XIND"] != 0 or tmp["PE_YIND"] != 0:
                        # The first file is missing.
                        warn(
                            f"Ignoring {len(paths)} files as the first"
                            f" seems to be missing: {paths}"
                        )
                        continue
                    assert tmp["NXPE"] == nxpe
                    assert tmp["NYPE"] == nype
                raise ValueError(
                    f"Something is wrong. We expected {nprocs} files"
                    f" but found {len(paths)} files."
                )
            paths = iter(paths)
            paths_grid.append(
                [[next(paths) for x in range(nxpe)] for y in range(nype)]
            )

    # Dimensions along which no concatenation is needed are still present as
    # single-element lists, so we need to concatenate along dim=None for those
    concat_dims = [None, None, None]
    if len(filepaths) > nprocs:
        concat_dims[0] = "t"
    if nype > 1:
        concat_dims[1] = "y"
    if nxpe > 1:
        concat_dims[2] = "x"

    return paths_grid, concat_dims


def _trim(ds, *, guards, keep_boundaries, nxpe, nype, is_restart):
    """Trims all guard (and optionally boundary) cells off a single dataset read
    from a single BOUT dump file, to prepare for concatenation.

    Variables that store timing information, which are different for each
    process, are not trimmed but are taken from the first processor during
    concatenation.

    Parameters
    ----------
    guards : dict
        Number of guard cells along each dimension, e.g. {'x': 2, 't': 0}
    keep_boundaries : dict
        Whether or not to preserve the boundary cells along each dimension,
        e.g. {'x': True, 'y': False}
    nxpe : int
        Number of processors in x direction
    nype : int
        Number of processors in y direction
    is_restart : bool
        Is data being loaded from restart files?
""" if any(keep_boundaries.values()): # Work out if this particular dataset contains any boundary cells lower_boundaries, upper_boundaries = _infer_contains_boundaries(ds, nxpe, nype) else: lower_boundaries, upper_boundaries = {}, {} selection = {} for dim in ds.dims: lower = _get_limit("lower", dim, keep_boundaries, lower_boundaries, guards) upper = _get_limit("upper", dim, keep_boundaries, upper_boundaries, guards) selection[dim] = slice(lower, upper) trimmed_ds = ds.isel(**selection) # Ignore FieldPerps for now for name in trimmed_ds: if trimmed_ds[name].dims == ("x", "z") or trimmed_ds[name].dims == ( "t", "x", "z", ): trimmed_ds = trimmed_ds.drop_vars(name) return trimmed_ds def _infer_contains_boundaries(ds, nxpe, nype): """Uses the processor indices and BOUT++'s topology indices to work out whether this dataset contains boundary cells, and on which side. """ if nxpe * nype == 1: # single file, always contains boundaries return {"x": True, "y": True}, {"x": True, "y": True} try: xproc = int(ds["PE_XIND"]) yproc = int(ds["PE_YIND"]) except KeyError: # output file from BOUT++ earlier than 4.3 # Use knowledge that BOUT names its output files as # /folder/prefix.num.nc, with a numbering scheme # num = nxpe*i + j, where i={0, ..., nype}, j={0, ..., nxpe} filename = ds.encoding["source"] *prefix, filenum, extension = Path(filename).suffixes filenum = int(filenum.replace(".", "")) xproc = filenum % nxpe yproc = filenum // nxpe lower_boundaries, upper_boundaries = {}, {} lower_boundaries["x"] = xproc == 0 upper_boundaries["x"] = xproc == nxpe - 1 lower_boundaries["y"] = yproc == 0 upper_boundaries["y"] = yproc == nype - 1 jyseps2_1 = int(ds["jyseps2_1"]) jyseps1_2 = int(ds["jyseps1_2"]) if jyseps1_2 > jyseps2_1: # second divertor present ny_inner = int(ds["ny_inner"]) mysub = int(ds["MYSUB"]) if mysub * (yproc + 1) == ny_inner: upper_boundaries["y"] = True elif mysub * yproc == ny_inner: lower_boundaries["y"] = True return lower_boundaries, upper_boundaries def _get_limit(side, dim, keep_boundaries, boundaries, guards): # Check for boundary cells, otherwise use guard cells, else leave alone if keep_boundaries.get(dim, False): if boundaries.get(dim, False): limit = None else: limit = guards[dim] if side == "lower" else -guards[dim] elif guards.get(dim, False): limit = guards[dim] if side == "lower" else -guards[dim] else: limit = None if limit == 0: # 0 would give incorrect result as an upper limit limit = None return limit def _open_grid(datapath, chunks, keep_xboundaries, keep_yboundaries, mxg=2, **kwargs): """ Opens a single grid file. Implements slightly different logic for boundaries to deal with different conventions in a BOUT grid file. """ acceptable_dims = ["x", "y", "z"] # Passing 'chunks' with dimensions that are not present in the # dataset causes an error. A gridfile will be missing 't' and may # be missing 'z' dimensions that dump files have, so we must # remove them from 'chunks'. 
grid_chunks = copy(chunks) unrecognised_chunk_dims = list(set(grid_chunks.keys()) - set(acceptable_dims)) for dim in unrecognised_chunk_dims: del grid_chunks[dim] if _is_path(datapath): gridfilepath = Path(datapath) grid = xr.open_dataset( gridfilepath, engine=_check_filetype(gridfilepath), **kwargs ) else: grid = datapath unrecognised_dims = list(set(grid.dims) - set(acceptable_dims)) if len(unrecognised_dims) > 0: # Weird string formatting is a workaround to deal with possible bug in # pytest warnings capture - doesn't match strings containing brackets warn( "Will drop all variables containing the dimensions {} because " "they are not recognised".format(str(unrecognised_dims)[1:-1]) ) grid = grid.drop_dims(unrecognised_dims) if keep_xboundaries: # Set MXG so that it is picked up in metadata - needed for # applying geometry, etc. grid["MXG"] = mxg else: xboundaries = mxg if xboundaries > 0: grid = grid.isel(x=slice(xboundaries, -xboundaries, None)) # Set MXG so that it is picked up in metadata - needed for # applying geometry, etc. grid["MXG"] = 0 try: yboundaries = int(grid["y_boundary_guards"]) except KeyError: # y_boundary_guards variable not in grid file - older grid files # never had y-boundary cells yboundaries = 0 if keep_yboundaries: # Set MYG so that it is picked up in metadata # - needed for applying geometry, etc. grid["MYG"] = yboundaries else: if yboundaries > 0: # Remove y-boundary cells from first divertor target grid = grid.isel(y=slice(yboundaries, -yboundaries, None)) if grid["jyseps1_2"] > grid["jyseps2_1"]: # There is a second divertor target, remove y-boundary # cells there too nin = int(grid["ny_inner"]) grid_lower = grid.isel(y=slice(None, nin, None)) grid_upper = grid.isel(y=slice(nin + 2 * yboundaries, None, None)) grid = xr.concat( (grid_lower, grid_upper), dim="y", data_vars="minimal", compat="identical", join="exact", ) # Set MYG so that it is picked up in metadata # - needed for applying geometry, etc. grid["MYG"] = 0 if "z" in grid_chunks and "z" not in grid.dims: del grid_chunks["z"] grid = grid.chunk(grid_chunks) return grid
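# Illustrative sketch, not part of the module: the processor-to-filename
# convention that _arrange_for_concatenation() and _infer_contains_boundaries()
# rely on.  BOUT++ writes one file per processor, numbered num = nxpe*i + j for
# processor row i (y direction) and column j (x direction).
def _example_processor_file_layout(nxpe=2, nype=3, prefix="BOUT.dmp"):
    # Returns the nested [y][x] layout of filenames for a single run, matching
    # the inner structure produced by _arrange_for_concatenation()
    return [
        [f"{prefix}.{nxpe * i + j}.nc" for j in range(nxpe)] for i in range(nype)
    ]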