Source code for scmdata.groupby

"""
Functionality for grouping and filtering ScmRun objects
"""
from __future__ import annotations

import warnings
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterator, TypeVar, Union

import numpy as np
import pandas as pd
from numpy.typing import NDArray
from xarray.core import ops
from xarray.core.common import ImplementsArrayReduce

from scmdata._typing import MetadataValue
from scmdata.run import BaseScmRun, GenericRun

if TYPE_CHECKING:
    # Everything in this branch exists for static type checkers only; none of
    # these names is bound at runtime. That is safe for annotations because
    # the module uses ``from __future__ import annotations``.
    from pandas.core.groupby.generic import DataFrameGroupBy
    from typing_extensions import Concatenate, ParamSpec

    # Parameter specifications capturing the extra ``*args``/``**kwargs``
    # forwarded to user-supplied callables.
    P = ParamSpec("P")
    Q = ParamSpec("Q")
    # Any run type derived from BaseScmRun.
    RunLike = TypeVar("RunLike", bound=BaseScmRun)
    # What a per-group callable may return: a run, a DataFrame or None.
    ApplyCallableReturnType = Union[RunLike, pd.DataFrame, None]
    # Per-group callable: takes the group's run first, then the extra
    # arguments described by ``Q``.
    ApplyCallable = Callable[Concatenate[RunLike, Q], ApplyCallableReturnType[RunLike]]
    # Parallel processor: takes the per-group callable, the iterable of group
    # runs and the extra arguments, and returns the iterable of results.
    ParallelProcessor = Callable[
        Concatenate[
            ApplyCallable[RunLike, Q],
            Iterable[RunLike],
            Q,
        ],
        Iterable[ApplyCallableReturnType[RunLike]],
    ]


class RunGroupBy(ImplementsArrayReduce, Generic[GenericRun]):
    """
    GroupBy object specialized to grouping ScmRun objects
    """

    def __init__(
        self, run: GenericRun, groups: Iterable[str], na_fill_value: float = -10000
    ):
        """
        Initialise the grouper.

        Parameters
        ----------
        run
            Run whose timeseries are grouped.

        groups
            Names of the metadata columns to group by.

        na_fill_value
            Placeholder substituted for missing metadata values before
            grouping. It must not already appear in the metadata.

        Raises
        ------
        ValueError
            ``na_fill_value`` is already present in the metadata.
        """
        self.run = run
        # Materialise once: ``groups`` may be a one-shot iterator, yet the keys
        # are needed for the groupby below and again for every group yielded
        # by ``_iter_grouped``.
        self.group_keys = tuple(groups)
        m = run.meta.reset_index(drop=True)
        self.na_fill_value = float(na_fill_value)

        # Work around the bad handling of NaN values in groupbys
        if any(np.issubdtype(m[c].dtype, np.number) for c in m):
            if (m == na_fill_value).any(axis=None):
                raise ValueError(
                    "na_fill_value conflicts with data value. Choose a na_fill_value "
                    "not in meta"
                )
            m = m.fillna(na_fill_value)

        self._grouper: DataFrameGroupBy = m.groupby(
            list(self.group_keys), group_keys=True
        )

    def _iter_grouped(self) -> Iterator[GenericRun]:
        """
        Yield one filtered run per group.

        Raises
        ------
        ValueError
            Filtering the run with a group's key values returned no timeseries.
        """
        fill_value = self.na_fill_value

        def _try_fill_value(v: MetadataValue) -> MetadataValue:
            # The placeholder is only ever inserted as a number (via
            # ``fillna``), so a string that merely parses to the same number
            # is a genuine metadata value and must be left alone.
            if isinstance(v, str):
                return v
            try:
                if float(v) == fill_value:
                    return np.nan
            except (TypeError, ValueError):
                # Not convertible to float, hence cannot be the placeholder
                pass
            return v

        groups: Iterable[
            MetadataValue | tuple[MetadataValue, ...]
        ] = self._grouper.groups

        for indices in groups:
            # Grouping by a single key may give scalar group labels; normalise
            # them to tuples so they can be zipped with the group keys.
            if not isinstance(indices, Iterable) or isinstance(indices, str):
                indices_clean: tuple[MetadataValue, ...] = (indices,)
            else:
                indices_clean = indices

            # Map the placeholder back to NaN before filtering
            indices_clean = tuple(_try_fill_value(v) for v in indices_clean)
            filter_kwargs = dict(zip(self.group_keys, indices_clean))
            res = self.run.filter(**filter_kwargs)  # type: ignore
            if not len(res):
                raise ValueError(
                    f"Empty group for {list(zip(self.group_keys, indices_clean))}"
                )
            yield res

    def __iter__(self) -> Iterator[GenericRun]:
        """
        Iterate over the groups
        """
        return self._iter_grouped()

    def apply(
        self,
        func: Callable[Concatenate[GenericRun, P], GenericRun | (pd.DataFrame | None)],
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> GenericRun:
        """
        Apply a function to each group and append the results

        `func` is called like `func(ar, *args, **kwargs)` for each
        :class:`ScmRun <scmdata.run.ScmRun>` group. If the result of this
        function call is ``None``, then it is excluded from the results.

        The results are appended together using :func:`run_append`. The
        function can change the size of the input
        :class:`ScmRun <scmdata.run.ScmRun>` as long as :func:`run_append`
        can be applied to all results.

        Examples
        --------
        .. code:: python

            >>> from scmdata import ScmRun
            >>> def show_var_and_convert_unit(arr: ScmRun) -> ScmRun:
            ...     variable = arr.get_unique_meta("variable", True)
            ...     unit = arr.get_unique_meta("unit", True)
            ...     print(f"{variable}'s original unit was {unit}")
            ...
            ...     return arr.convert_unit("MtC")
            >>> df = ScmRun(
            ...     data=[[1, 2], [3, 4]],
            ...     index=[2010, 2020],
            ...     columns={
            ...         "variable": ["v1", "v2"],
            ...         "model": "model",
            ...         "scenario": "scenario",
            ...         "region": "World",
            ...         "unit": ["tC", "GtC"],
            ...     },
            ... )
            >>> df.groupby("variable").apply(show_var_and_convert_unit)
            v1's original unit was tC
            v2's original unit was GtC
            <ScmRun (timeseries: 2, timepoints: 2)>
            Time:
                Start: 2010-01-01T00:00:00
                End: 2020-01-01T00:00:00
            Meta:
                   model region  scenario unit variable
                0  model  World  scenario  MtC       v1
                1  model  World  scenario  MtC       v2

        Parameters
        ----------
        func
            Callable to apply to each group.

        *args
            Positional arguments passed to `func`.

        **kwargs
            Keyword arguments passed to `func`.

        Returns
        -------
        The result of applying and combining.
        """
        applied = [func(arr, *args, **kwargs) for arr in self._iter_grouped()]
        return self._combine(applied)

    def apply_parallel(
        self,
        func: ApplyCallable[GenericRun, P],
        parallel_processor: ParallelProcessor[GenericRun, P] | None = None,
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> GenericRun:
        """
        Apply a function to each group in parallel and append the results

        Provides the same functionality as :func:`~apply` except that parallel
        processing can be used via the ``parallel_processor`` argument. By
        default, :mod:`joblib` is used to apply `func` to each group in
        parallel. This can be slower than using :func:`~apply` for small
        numbers of groups or in the case where `func` is fast as there is
        overhead setting up the processing pool.

        See Also
        --------
        :func:`~apply`

        Parameters
        ----------
        func
            Callable to apply to each group.

        parallel_processor
            Parallel processor to use to process the groups. If not provided,
            a default joblib parallel processor is used (for details, see
            :func:`get_joblib_parallel_processor`).

        *args
            Positional arguments passed to `func`.

        **kwargs
            Keyword arguments passed to `func`.

        Returns
        -------
        The result of applying and combining.
        """
        if parallel_processor is None:
            parallel_processor = get_joblib_parallel_processor()

        applied = parallel_processor(func, self._iter_grouped(), *args, **kwargs)
        return self._combine(applied)

    def map(self, func, *args, **kwargs):
        """
        Apply a function to each group and append the results

        .. deprecated:: 0.14.2
            :func:`map` will be removed in scmdata 1.0.0, it is renamed to
            :func:`apply` with identical functionality.

        See Also
        --------
        :func:`apply`
        """
        # stacklevel=2 attributes the warning to the caller of ``map`` rather
        # than to this line.
        warnings.warn(
            "Use RunGroupby.apply instead", DeprecationWarning, stacklevel=2
        )
        return self.apply(func, *args, **kwargs)

    def _combine(
        self, applied: Iterable[GenericRun | (pd.DataFrame | None)]
    ) -> GenericRun:
        """
        Recombine the applied objects like the original.

        ``None`` results are discarded. If nothing remains, a new empty
        instance of the grouped run's class is returned.
        """
        from scmdata.run import run_append

        # Remove all None values
        applied_clean = [df for df in applied if df is not None]

        if not applied_clean:
            return self.run.__class__()
        return run_append(applied_clean)

    def reduce(
        self,
        func: Callable[Concatenate[NDArray[np.float64], P], NDArray[np.float64]],
        dim: str | Iterable[str] | None = None,
        axis: int | Iterable[int] | None = None,
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> GenericRun:
        """
        Reduce the items in this group by applying `func` along some dimension(s).

        Parameters
        ----------
        func : function
            Function which can be called in the form
            `func(x, axis=axis, **kwargs)` to return the result of collapsing
            an np.ndarray over an integer valued axis.

        dim : str, optional
            Only ``None`` or ``"time"`` is accepted. The value is forwarded
            to the ``reduce`` method of each group.

        axis : int or sequence of int, optional
            Axis(es) over which to apply `func`. Forwarded to the ``reduce``
            method of each group.

        *args
            Accepted, but not forwarded to `func` by this implementation.

        **kwargs : dict
            Additional keyword arguments passed on to `func`.

        Returns
        -------
        reduced : :class:`ScmRun <scmdata.run.ScmRun>`
            Array with summarized data and the indicated dimension(s) removed.

        Raises
        ------
        ValueError
            ``dim`` is neither ``None`` nor ``"time"``.
        """
        if dim is not None and dim != "time":
            raise ValueError("Only reduction along the time dimension is supported")

        def reduce_array(ar):
            return ar.reduce(func, dim, axis, **kwargs)

        # RuntimeWarnings raised while reducing the groups are suppressed
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            return self.apply(reduce_array)
def get_joblib_parallel_processor(
    n_jobs: int = -1,
    backend: str = "loky",
    *args: Any,
    **kwargs: Any,
) -> ParallelProcessor[RunLike, Q]:
    """
    Build a parallel processor backed by :mod:`joblib`.

    Parameters
    ----------
    n_jobs
        Number of jobs to run in parallel. If `-1` all CPUs are used.

    backend
        Backend used for parallelisation. Defaults to 'loky' which uses
        separate processes for each worker.

        See :class:`joblib.Parallel` for a more complete description of the
        available options.

    *args
        Passed to initialiser of :class:`joblib.Parallel`

    **kwargs
        Passed to initialiser of :class:`joblib.Parallel`

    Returns
    -------
    Function that can be used for parallel processing in
    :meth:`RunGroupBy.apply_parallel`

    Raises
    ------
    ImportError
        :mod:`joblib` is not installed.
    """
    # joblib is optional, so it is only imported when a processor is requested
    try:
        import joblib
    except ImportError as exc:  # pragma: no cover
        raise ImportError("joblib is not installed. Run 'pip install joblib'") from exc

    pool = joblib.Parallel(*args, n_jobs=n_jobs, backend=backend, **kwargs)

    def joblib_parallel_processor(
        func: ApplyCallable[RunLike, Q],
        groups: Iterable[RunLike],
        /,
        *args: Q.args,
        **kwargs: Q.kwargs,
    ) -> Iterable[ApplyCallableReturnType[RunLike]]:
        delayed_func = joblib.delayed(func)
        # Groups are handed over lazily, one delayed call per group
        return pool(delayed_func(group, *args, **kwargs) for group in groups)

    return joblib_parallel_processor
# Let xarray's ``ops`` helper add its reduction methods to RunGroupBy.
# NOTE(review): presumably these are thin wrappers that delegate to
# ``RunGroupBy.reduce`` -- confirm against the installed xarray version, as
# ``xarray.core.ops`` is not a public API.
ops.inject_reduce_methods(RunGroupBy)