# Source code for scmdata.database._database

"""
Database for handling large datasets in a performant, but flexible way
"""
import os
import os.path

import pandas as pd
import six
import tqdm.autonotebook as tqdman

from scmdata import run_append
from scmdata.database._utils import _check_is_subdir
from scmdata.database.backends import BaseDatabaseBackend, backend_classes


class ScmDatabase:
    """
    On-disk database handler for outputs from SCMs

    Data is split into groups as specified by :attr:`levels`. This allows for fast
    reading and writing of new subsets of data when a single output file is no longer
    performant or data cannot all fit in memory.

    All reading, writing and deleting of files is delegated to a backend object
    (see the ``backend`` argument of :meth:`__init__`).
    """

    def __init__(
        self,
        root_dir,
        levels=("climate_model", "variable", "region", "scenario"),
        backend="netcdf",
        backend_config=None,
    ):
        """
        Initialise the database

        .. note::

            Creating a new :class:`ScmDatabase` does not modify any existing data on
            disk. To load an existing database ensure that the :attr:`root_dir`,
            :attr:`levels` and backend settings are the same as the previous instance.

        Parameters
        ----------
        root_dir : str
            The root directory of the database

        levels : tuple of str
            Specifies how the runs should be stored on disk.

            The data will be grouped by ``levels``. These levels should be adapted to
            best match the input data and desired access pattern. If there are any
            additional varying dimensions, they will be stored as dimensions.

        backend: str or :class:`BaseDatabaseBackend<scmdata.database.backends.BaseDatabaseBackend>`
            Determine the backend to serialize and deserialize data

            Defaults to using :class:`NetCDFDatabaseBackend<scmdata.database.backends.NetCDFDatabaseBackend>`
            which reads and writes data as netCDF files. Note that this requires the
            optional dependency of netCDF4 to be installed.

            If a custom backend class is being used, it must extend the
            :class:`BaseDatabaseBackend<scmdata.database.backends.BaseDatabaseBackend>` class.

        backend_config: dict
            Additional configuration to pass to the backend

            See the documentation for the target backend to determine which configuration
            options are available. The dictionary passed in is not modified. It is
            only used when ``backend`` is given as a string; an already constructed
            backend instance is used as-is.

        Raises
        ------
        ValueError
            If ``backend_config`` contains the reserved keys ``levels`` or
            ``root_dir``, if ``backend`` is an unknown backend name, or if
            ``backend`` is neither a string nor a
            :class:`BaseDatabaseBackend<scmdata.database.backends.BaseDatabaseBackend>`
            instance.
        """
        self._root_dir = root_dir
        self.levels = tuple(levels)

        # Work on a copy: the reserved keys are injected below and must not leak
        # into the caller's dictionary (which would make it unusable for a
        # second database because of the reserved-key check).
        backend_config = dict(backend_config) if backend_config else {}
        for key in ["levels", "root_dir"]:
            if key in backend_config:
                raise ValueError("backend_config cannot contain key `{}`".format(key))
        backend_config["levels"] = self.levels
        backend_config["root_dir"] = root_dir

        self._backend = self._get_backend(backend, backend_config)

    def _get_backend(self, backend, backend_config):
        """
        Resolve ``backend`` into a backend instance

        Parameters
        ----------
        backend : str or BaseDatabaseBackend
            Either the (case-insensitive) name of a registered backend class or
            an already constructed backend instance.

        backend_config : dict
            Keyword arguments used to construct the backend when ``backend`` is
            a string. Ignored when ``backend`` is already an instance.

        Returns
        -------
        BaseDatabaseBackend

        Raises
        ------
        ValueError
            If ``backend`` is an unknown name or is not a ``BaseDatabaseBackend``
            instance
        """
        if isinstance(backend, six.string_types):
            # Only the registry lookup is guarded so that a ``KeyError`` raised
            # inside the backend's own constructor is not misreported as an
            # unknown backend name.
            try:
                backend_cls = backend_classes[backend.lower()]
            except KeyError:
                raise ValueError("Unknown database backend: {}".format(backend))
            return backend_cls(**backend_config)

        if not isinstance(backend, BaseDatabaseBackend):
            raise ValueError(
                "Backend must be an instance of scmdata.database.BaseDatabaseBackend"
            )
        return backend

    def __repr__(self):
        return "<scmdata.database.SCMDatabase (root_dir: {}, levels: {})>".format(
            self._root_dir, self.levels
        )

    @property
    def root_dir(self):
        """
        Root directory of the database.

        Returns
        -------
        str
        """
        return self._root_dir

    def _clean_filters(self, filters):
        """
        Validate filters and make their values safe to use as path components

        Every key must be one of :attr:`levels`. Any occurrence of ``os.sep`` in
        a filter value is replaced by ``"_"``. Values may be a single string or
        a list/tuple of strings, in which case each element is cleaned.

        Parameters
        ----------
        filters : dict of str : [str, list[str]]
            Filters keyed by level name

        Returns
        -------
        dict
            New dictionary holding the cleaned filters; ``filters`` itself is
            left untouched.

        Raises
        ------
        ValueError
            If a filter for a level not in :attr:`levels` is specified
        """

        def _clean_value(value):
            # List-like filters (OR logic within a level) are cleaned element by
            # element; the container type is preserved.
            if isinstance(value, (list, tuple)):
                return type(value)(_clean_value(v) for v in value)
            if os.sep in value:
                return value.replace(os.sep, "_")
            return value

        cleaned = {}
        for level, value in filters.items():
            if level not in self.levels:
                raise ValueError("Unknown level: {}".format(level))
            cleaned[level] = _clean_value(value)
        return cleaned

    def save(self, scmrun, disable_tqdm=False):
        """
        Save data to the database

        The results are saved with one file for each unique combination of
        :attr:`levels` in a directory structure underneath ``root_dir``.

        Use :meth:`available_data` to see what data is available. Subsets of
        data can then be loaded as an :class:`scmdata.ScmRun <scmdata.run.ScmRun>` using :meth:`load`.

        Parameters
        ----------
        scmrun : :class:`scmdata.ScmRun <scmdata.run.ScmRun>`
            Data to save.

            The timeseries in this run should have valid metadata for each
            of the columns specified in ``levels``.
        disable_tqdm: bool
            If True, do not show the progress bar

        Notes
        -----
        No exceptions are raised by this method directly; any error raised while
        grouping ``scmrun`` or by the backend's ``save`` propagates unchanged.
        """
        groups = tqdman.tqdm(
            scmrun.groupby(self.levels),
            leave=False,
            desc="Saving to database",
            disable=disable_tqdm,
        )
        # Each group is handed to the backend, which decides where it is stored.
        for group in groups:
            self._backend.save(group)

    def load(self, disable_tqdm=False, **filters):
        """
        Load data from the database

        Parameters
        ----------
        disable_tqdm: bool
            If True, do not show the progress bar
        filters: dict of str : [str, list[str]]
            Filters for the data to load.

            Defaults to loading all values for a level if it isn't specified.

            If a filter is a list then OR logic is applied within the level.
            For example, if we have ``scenario=["ssp119", "ssp126"]`` then
            both the ssp119 and ssp126 scenarios will be loaded.

        Returns
        -------
        :class:`scmdata.ScmRun`
            Loaded data

        Raises
        ------
        ValueError
            If a filter for a level not in :attr:`levels` is specified

            If no data matching ``filters`` is found
        """
        filters = self._clean_filters(filters)

        load_files = self._backend.get(filters)

        return run_append(
            [
                self._backend.load(f)
                for f in tqdman.tqdm(
                    load_files,
                    desc="Loading files",
                    leave=False,
                    disable=disable_tqdm,
                )
            ]
        )

    def delete(self, **filters):
        """
        Delete data from the database

        Parameters
        ----------
        filters: dict of str
            Filters for the data to delete.

            Defaults to deleting all data if nothing is specified.

        Raises
        ------
        ValueError
            If a filter for a level not in :attr:`levels` is specified
        """
        filters = self._clean_filters(filters)
        targets = self._backend.get(filters)

        for target in targets:
            # Check each target against the database root before the backend
            # is asked to remove it.
            _check_is_subdir(self._root_dir, target)
            self._backend.delete(target)

    def available_data(self):
        """
        Get all the data which is available to be loaded

        If metadata includes non-alphanumeric characters then it
        might appear modified in the returned table. The original
        metadata values can still be used to filter data.

        Returns
        -------
        :class:`pd.DataFrame`
            One row per file known to the backend and one column per level,
            sorted by all columns. The level values are taken from the
            trailing directory names of each file's path.
        """
        all_files = self._backend.get({})

        n_levels = len(self.levels)
        # Drop the file name, then keep the last ``n_levels`` directory names.
        file_meta = [f.split(os.sep)[:-1][-n_levels:] for f in all_files]

        data = pd.DataFrame(file_meta, columns=self.levels)

        return data.sort_values(by=data.columns.to_list()).reset_index(drop=True)