# Source code for scmdata.database._database
"""
Database for handling large datasets in a performant, but flexible way
"""
import os
import os.path

import pandas as pd
import six
import tqdm.autonotebook as tqdman

from scmdata import run_append
from scmdata.database._utils import _check_is_subdir
from scmdata.database.backends import BaseDatabaseBackend, backend_classes

class ScmDatabase:
    """
    On-disk database handler for outputs from SCMs

    Data is split into groups as specified by :attr:`levels`. This allows for
    fast reading and writing of new subsets of data when a single output file
    is no longer performant or data cannot all fit in memory.
    """

    def __init__(
        self,
        root_dir,
        levels=("climate_model", "variable", "region", "scenario"),
        backend="netcdf",
        backend_config=None,
    ):
        """
        Initialise the database

        .. note::

            Creating a new :class:`ScmDatabase` does not modify any existing data
            on disk. To load an existing database ensure that the
            :attr:`root_dir`, :attr:`levels` and backend settings are the same
            as the previous instance.

        Parameters
        ----------
        root_dir : str
            The root directory of the database

        levels : tuple of str
            Specifies how the runs should be stored on disk.

            The data will be grouped by ``levels``. These levels should be
            adapted to best match the input data and desired access pattern.
            If there are any additional varying dimensions, they will be
            stored as dimensions.

        backend: str or :class:`BaseDatabaseBackend<scmdata.database.backends.BaseDatabaseBackend>`
            Determine the backend to serialize and deserialize data

            Defaults to using
            :class:`NetCDFDatabaseBackend<scmdata.database.backends.NetCDFDatabaseBackend>`
            which reads and writes data as netCDF files. Note that this
            requires the optional dependency of netCDF4 to be installed.

            If a custom backend class is being used, it must extend the
            :class:`BaseDatabaseBackend<scmdata.database.backends.BaseDatabaseBackend>`
            class.

        backend_config: dict
            Additional configuration to pass to the backend

            See the documentation for the target backend to determine which
            configuration options are available.

        Raises
        ------
        ValueError
            ``backend_config`` contains the reserved keys ``"levels"`` or
            ``"root_dir"``, or ``backend`` is neither a known backend name nor
            a :class:`BaseDatabaseBackend` instance.
        """
        self._root_dir = root_dir
        self.levels = tuple(levels)
        backend_config = backend_config if backend_config else {}

        # "levels" and "root_dir" are always injected below, so the caller
        # must not also supply them in backend_config.
        for key in ["levels", "root_dir"]:
            if key in backend_config:
                raise ValueError("backend_config cannot contain key `{}`".format(key))
        backend_config["levels"] = self.levels
        backend_config["root_dir"] = root_dir

        self._backend = self._get_backend(backend, backend_config)

    def _get_backend(self, backend, backend_config):
        """
        Resolve ``backend`` to a backend instance

        Parameters
        ----------
        backend : str or :class:`BaseDatabaseBackend`
            Either the (case-insensitive) name of a registered backend or an
            already-constructed backend instance.

        backend_config : dict
            Keyword arguments used to construct the backend when ``backend``
            is a name. Ignored when an instance is passed.

        Returns
        -------
        :class:`BaseDatabaseBackend`

        Raises
        ------
        ValueError
            ``backend`` is an unknown name or not a
            :class:`BaseDatabaseBackend` instance.
        """
        if isinstance(backend, six.string_types):
            try:
                cls = backend_classes[backend.lower()]
                return cls(**backend_config)
            except KeyError:
                raise ValueError("Unknown database backend: {}".format(backend))
        else:
            if not isinstance(backend, BaseDatabaseBackend):
                raise ValueError(
                    "Backend must be an instance of scmdata.database.BaseDatabaseBackend"
                )
            return backend

    def __repr__(self):
        # NOTE(review): the spelling "SCMDatabase" differs from the class name
        # "ScmDatabase"; preserved as-is since downstream code may parse it.
        return "<scmdata.database.SCMDatabase (root_dir: {}, levels: {})>".format(
            self._root_dir, self.levels
        )

    @property
    def root_dir(self):
        """
        Root directory of the database.

        Returns
        -------
        str
        """
        return self._root_dir

    def _clean_filters(self, filters):
        """
        Validate filter levels and sanitise path separators in their values

        Any ``os.sep`` in a (string) filter value is replaced with ``"_"`` so
        the value cannot escape its directory level.

        NOTE(review): list-valued filters (which :meth:`load` documents as
        supported) only undergo a membership test here, so separators inside
        list elements are not sanitised — confirm this is intended.

        Parameters
        ----------
        filters : dict of str
            Mapping of level name to filter value; mutated in place.

        Returns
        -------
        dict
            The (mutated) ``filters`` mapping.

        Raises
        ------
        ValueError
            A key of ``filters`` is not in :attr:`levels`.
        """
        for level in filters:
            if level not in self.levels:
                raise ValueError("Unknown level: {}".format(level))
            if os.sep in filters[level]:
                filters[level] = filters[level].replace(os.sep, "_")
        return filters

    def save(self, scmrun, disable_tqdm=False):
        """
        Save data to the database

        The results are saved with one file for each unique combination of
        :attr:`levels` in a directory structure underneath ``root_dir``.

        Use :meth:`available_data` to see what data is available. Subsets of
        data can then be loaded as an :class:`scmdata.ScmRun <>` using
        :meth:`load`.

        Parameters
        ----------
        scmrun : :class:`scmdata.ScmRun <>`
            Data to save.

            The timeseries in this run should have valid metadata for each
            of the columns specified in ``levels``.

        disable_tqdm: bool
            If True, do not show the progress bar

        Raises
        ------
        KeyError
            If a filter for a level not in :attr:`levels` is specified
        """
        for r in tqdman.tqdm(
            scmrun.groupby(self.levels),
            leave=False,
            desc="Saving to database",
            disable=disable_tqdm,
        ):
            # NOTE(review): the loop body was truncated in the extracted
            # source; delegating each per-level group to the backend is the
            # reconstruction consistent with how load/delete use the backend.
            self._backend.save(r)

    def load(self, disable_tqdm=False, **filters):
        """
        Load data from the database

        Parameters
        ----------
        disable_tqdm: bool
            If True, do not show the progress bar

        filters: dict of str : [str, list[str]]
            Filters for the data to load.

            Defaults to loading all values for a level if it isn't specified.

            If a filter is a list then OR logic is applied within the level.
            For example, if we have ``scenario=["ssp119", "ssp126"]`` then
            both the ssp119 and ssp126 scenarios will be loaded.

        Returns
        -------
        :class:`scmdata.ScmRun`
            Loaded data

        Raises
        ------
        ValueError
            If a filter for a level not in :attr:`levels` is specified

            If no data matching ``filters`` is found
        """
        filters = self._clean_filters(filters)

        load_files = self._backend.get(filters)

        return run_append(
            [
                self._backend.load(f)
                for f in tqdman.tqdm(
                    load_files,
                    desc="Loading files",
                    leave=False,
                    disable=disable_tqdm,
                )
            ]
        )

    def delete(self, **filters):
        """
        Delete data from the database

        Parameters
        ----------
        filters: dict of str
            Filters for the data to load.

            Defaults to deleting all data if nothing is specified.

        Raises
        ------
        ValueError
            If a filter for a level not in :attr:`levels` is specified
        """
        filters = self._clean_filters(filters)
        targets = self._backend.get(filters)

        for t in targets:
            # Guard against a backend returning paths outside the database.
            _check_is_subdir(self._root_dir, t)
            self._backend.delete(t)

    def available_data(self):
        """
        Get all the data which is available to be loaded

        If metadata includes non-alphanumeric characters then it might appear
        modified in the returned table. The original metadata values can still
        be used to filter data.

        Returns
        -------
        :class:`pd.DataFrame`
        """
        all_files = self._backend.get({})

        file_meta = []
        for f in all_files:
            # Drop the filename, then keep the trailing per-level directories.
            dirnames = f.split(os.sep)[:-1]
            file_meta.append(dirnames[-len(self.levels) :])

        data = pd.DataFrame(file_meta, columns=self.levels)

        return data.sort_values(by=data.columns.to_list()).reset_index(drop=True)