import pathlib

NetCDF handling

NetCDF formatted files are much faster to read and write for large datasets. In order to make the most of this, the ScmRun objects have the ability to read and write netCDF files.

import traceback
from tempfile import TemporaryDirectory

import numpy as np
import seaborn as sns
import xarray as xr

from scmdata.netcdf import nc_to_run
from scmdata.run import ScmRun, run_append
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/database/_database.py:9: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  import tqdm.autonotebook as tqdman

Helper bits and pieces

# Seeded generator used for all the example data below
generator = np.random.default_rng(seed=0)

# Everything this notebook writes goes into one throwaway directory
temp_directory = TemporaryDirectory()
OUTPUT_DIR = pathlib.Path(temp_directory.name)
OUT_FNAME = OUTPUT_DIR.joinpath("out_runs.nc")
def new_timeseries(  # noqa: PLR0913
    n=100,
    count=1,
    model="example",
    scenario="ssp119",
    variable="Surface Temperature",
    unit="K",
    region="World",
    cls=ScmRun,
    **kwargs,
):
    """
    Build a run of random example timeseries

    Parameters
    ----------
    n
        Number of time points; the index runs from 2000 to ``2000 + n - 1``.
    count
        Number of data columns drawn for the run.
    model, scenario, variable, unit, region
        Metadata handed to ``cls`` through its ``columns`` argument.
    cls
        Class used to build the result, called as
        ``cls(data, columns=..., index=...)``.
    **kwargs
        Further metadata entries, added to ``columns`` after the named ones.

    Returns
    -------
    Instance of ``cls`` whose data is an ``(n, count)`` array drawn from the
    module-level ``generator``, with each row scaled by its position.
    """
    steps = np.arange(n)
    # Random draws whose amplitude grows linearly with the row position
    values = generator.random((n, count)) * steps[:, np.newaxis]

    metadata = {
        "model": model,
        "scenario": scenario,
        "variable": variable,
        "region": region,
        "unit": unit,
    }
    metadata.update(kwargs)

    return cls(values, columns=metadata, index=steps + 2000)

Let’s create an ScmRun which contains a few variables and a number of runs. Such a dataframe would be used to store the results from an ensemble of simple climate model runs.

# Ten ensemble members, each carrying the same three variables
ensemble_members = []
for run_id in range(10):
    member = new_timeseries(
        count=3,
        variable=[
            "Surface Temperature",
            "Atmospheric Concentrations|CO2",
            "Radiative Forcing",
        ],
        unit=["K", "ppm", "W/m^2"],
        run_id=run_id,
    )
    ensemble_members.append(member)

runs = run_append(ensemble_members)
runs.metadata["source"] = "fake data"
runs
<ScmRun (timeseries: 30, timepoints: 100)>
Time:
	Start: 2000-01-01T00:00:00
	End: 2099-01-01T00:00:00
Meta:
	      model region  run_id scenario   unit                        variable
	0   example  World       0   ssp119      K             Surface Temperature
	1   example  World       0   ssp119    ppm  Atmospheric Concentrations|CO2
	2   example  World       0   ssp119  W/m^2               Radiative Forcing
	3   example  World       1   ssp119      K             Surface Temperature
	4   example  World       1   ssp119    ppm  Atmospheric Concentrations|CO2
	5   example  World       1   ssp119  W/m^2               Radiative Forcing
	6   example  World       2   ssp119      K             Surface Temperature
	7   example  World       2   ssp119    ppm  Atmospheric Concentrations|CO2
	8   example  World       2   ssp119  W/m^2               Radiative Forcing
	9   example  World       3   ssp119      K             Surface Temperature
	10  example  World       3   ssp119    ppm  Atmospheric Concentrations|CO2
	11  example  World       3   ssp119  W/m^2               Radiative Forcing
	12  example  World       4   ssp119      K             Surface Temperature
	13  example  World       4   ssp119    ppm  Atmospheric Concentrations|CO2
	14  example  World       4   ssp119  W/m^2               Radiative Forcing
	15  example  World       5   ssp119      K             Surface Temperature
	16  example  World       5   ssp119    ppm  Atmospheric Concentrations|CO2
	17  example  World       5   ssp119  W/m^2               Radiative Forcing
	18  example  World       6   ssp119      K             Surface Temperature
	19  example  World       6   ssp119    ppm  Atmospheric Concentrations|CO2
	20  example  World       6   ssp119  W/m^2               Radiative Forcing
	21  example  World       7   ssp119      K             Surface Temperature
	22  example  World       7   ssp119    ppm  Atmospheric Concentrations|CO2
	23  example  World       7   ssp119  W/m^2               Radiative Forcing
	24  example  World       8   ssp119      K             Surface Temperature
	25  example  World       8   ssp119    ppm  Atmospheric Concentrations|CO2
	26  example  World       8   ssp119  W/m^2               Radiative Forcing
	27  example  World       9   ssp119      K             Surface Temperature
	28  example  World       9   ssp119    ppm  Atmospheric Concentrations|CO2
	29  example  World       9   ssp119  W/m^2               Radiative Forcing

Reading/Writing to NetCDF4

Basics

Writing the runs to disk is easy. The one trick is that each variable and dimension combination must have unique metadata. If they do not, you will receive an error message like the below.

try:
    # "region" is "World" for every timeseries, so it cannot tell them apart
    runs.to_nc(OUT_FNAME, dimensions=["region"])
except ValueError:
    # limit=0 / chain=False: print only the exception line, no stack frames
    traceback.print_exc(limit=0, chain=False)
ValueError: dimensions: `['region']` and extras: `[]` do not uniquely define the timeseries, please add extra dimensions and/or extras

In our dataset, there is more than one “run_id” per variable hence we need to use a different dimension, run_id, because this will result in each variable’s remaining metadata being unique.

runs.to_nc(OUT_FNAME, dimensions=["run_id"])
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)

The output netCDF file can be read using the from_nc method, nc_to_run function or directly using xarray.

runs_netcdf = ScmRun.from_nc(OUT_FNAME)
runs_netcdf
<ScmRun (timeseries: 30, timepoints: 100)>
Time:
	Start: 2000-01-01T00:00:00
	End: 2099-01-01T00:00:00
Meta:
	      model region  run_id scenario   unit                        variable
	0   example  World       0   ssp119      K             Surface Temperature
	1   example  World       0   ssp119    ppm  Atmospheric Concentrations|CO2
	2   example  World       0   ssp119  W/m^2               Radiative Forcing
	3   example  World       1   ssp119      K             Surface Temperature
	4   example  World       1   ssp119    ppm  Atmospheric Concentrations|CO2
	5   example  World       1   ssp119  W/m^2               Radiative Forcing
	6   example  World       2   ssp119      K             Surface Temperature
	7   example  World       2   ssp119    ppm  Atmospheric Concentrations|CO2
	8   example  World       2   ssp119  W/m^2               Radiative Forcing
	9   example  World       3   ssp119      K             Surface Temperature
	10  example  World       3   ssp119    ppm  Atmospheric Concentrations|CO2
	11  example  World       3   ssp119  W/m^2               Radiative Forcing
	12  example  World       4   ssp119      K             Surface Temperature
	13  example  World       4   ssp119    ppm  Atmospheric Concentrations|CO2
	14  example  World       4   ssp119  W/m^2               Radiative Forcing
	15  example  World       5   ssp119      K             Surface Temperature
	16  example  World       5   ssp119    ppm  Atmospheric Concentrations|CO2
	17  example  World       5   ssp119  W/m^2               Radiative Forcing
	18  example  World       6   ssp119      K             Surface Temperature
	19  example  World       6   ssp119    ppm  Atmospheric Concentrations|CO2
	20  example  World       6   ssp119  W/m^2               Radiative Forcing
	21  example  World       7   ssp119      K             Surface Temperature
	22  example  World       7   ssp119    ppm  Atmospheric Concentrations|CO2
	23  example  World       7   ssp119  W/m^2               Radiative Forcing
	24  example  World       8   ssp119      K             Surface Temperature
	25  example  World       8   ssp119    ppm  Atmospheric Concentrations|CO2
	26  example  World       8   ssp119  W/m^2               Radiative Forcing
	27  example  World       9   ssp119      K             Surface Temperature
	28  example  World       9   ssp119    ppm  Atmospheric Concentrations|CO2
	29  example  World       9   ssp119  W/m^2               Radiative Forcing
nc_to_run(ScmRun, OUT_FNAME)
<ScmRun (timeseries: 30, timepoints: 100)>
Time:
	Start: 2000-01-01T00:00:00
	End: 2099-01-01T00:00:00
Meta:
	      model region  run_id scenario   unit                        variable
	0   example  World       0   ssp119      K             Surface Temperature
	1   example  World       0   ssp119    ppm  Atmospheric Concentrations|CO2
	2   example  World       0   ssp119  W/m^2               Radiative Forcing
	3   example  World       1   ssp119      K             Surface Temperature
	4   example  World       1   ssp119    ppm  Atmospheric Concentrations|CO2
	5   example  World       1   ssp119  W/m^2               Radiative Forcing
	6   example  World       2   ssp119      K             Surface Temperature
	7   example  World       2   ssp119    ppm  Atmospheric Concentrations|CO2
	8   example  World       2   ssp119  W/m^2               Radiative Forcing
	9   example  World       3   ssp119      K             Surface Temperature
	10  example  World       3   ssp119    ppm  Atmospheric Concentrations|CO2
	11  example  World       3   ssp119  W/m^2               Radiative Forcing
	12  example  World       4   ssp119      K             Surface Temperature
	13  example  World       4   ssp119    ppm  Atmospheric Concentrations|CO2
	14  example  World       4   ssp119  W/m^2               Radiative Forcing
	15  example  World       5   ssp119      K             Surface Temperature
	16  example  World       5   ssp119    ppm  Atmospheric Concentrations|CO2
	17  example  World       5   ssp119  W/m^2               Radiative Forcing
	18  example  World       6   ssp119      K             Surface Temperature
	19  example  World       6   ssp119    ppm  Atmospheric Concentrations|CO2
	20  example  World       6   ssp119  W/m^2               Radiative Forcing
	21  example  World       7   ssp119      K             Surface Temperature
	22  example  World       7   ssp119    ppm  Atmospheric Concentrations|CO2
	23  example  World       7   ssp119  W/m^2               Radiative Forcing
	24  example  World       8   ssp119      K             Surface Temperature
	25  example  World       8   ssp119    ppm  Atmospheric Concentrations|CO2
	26  example  World       8   ssp119  W/m^2               Radiative Forcing
	27  example  World       9   ssp119      K             Surface Temperature
	28  example  World       9   ssp119    ppm  Atmospheric Concentrations|CO2
	29  example  World       9   ssp119  W/m^2               Radiative Forcing
xr.load_dataset(OUT_FNAME)
<xarray.Dataset>
Dimensions:                          (time: 100, run_id: 10)
Coordinates:
  * time                             (time) datetime64[ns] 2000-01-01 ... 209...
  * run_id                           (run_id) int64 0 1 2 3 4 5 6 7 8 9
Data variables:
    Surface_Temperature              (run_id, time) float64 0.0 ... 2.459
    Atmospheric_Concentrations__CO2  (run_id, time) float64 0.0 0.8133 ... 20.56
    Radiative_Forcing                (run_id, time) float64 0.0 0.9128 ... 29.75
Attributes:
    scmdata_metadata_scenario:  ssp119
    scmdata_metadata_model:     example
    scmdata_metadata_region:    World
    created_at:                 2024-01-29T07:18:01.855754
    _scmdata_version:           1.0.0
    source:                     fake data

The additional metadata in runs is also serialized and deserialized in the netCDF files. The metadata of the loaded ScmRun will also contain some additional fields about the file creation.

# The "source" entry set on `runs` before writing must survive the round trip
assert "source" in runs_netcdf.metadata
runs_netcdf.metadata
{'created_at': '2024-01-29T07:18:01.855754',
 '_scmdata_version': '1.0.0',
 'source': 'fake data'}

Splitting your data

Sometimes if you have complicated ensemble runs it might be more efficient to split the data into smaller subsets.

In the below example we iterate over scenarios to produce a netCDF file per scenario.

# 10 parameter sets for each of the three scenarios, three variables apiece
large_run = run_append(
    [
        new_timeseries(
            count=3,
            scenario=sce,
            variable=[
                "Surface Temperature",
                "Atmospheric Concentrations|CO2",
                "Radiative Forcing",
            ],
            unit=["K", "ppm", "W/m^2"],
            paraset_id=paraset_id,
        )
        for sce in ["ssp119", "ssp370", "ssp585"]
        for paraset_id in range(10)
    ]
)

# Also give every timeseries its own run_id. Often we'd have both ids:
# paraset_id to keep track of the parameter set we've run, and run_id to
# keep track of the run in a large ensemble.
large_run["run_id"] = large_run.meta.index.values
large_run
<ScmRun (timeseries: 90, timepoints: 100)>
Time:
	Start: 2000-01-01T00:00:00
	End: 2099-01-01T00:00:00
Meta:
	      model  paraset_id region  run_id scenario   unit  \
	0   example           0  World       0   ssp119      K   
	1   example           0  World       1   ssp119    ppm   
	2   example           0  World       2   ssp119  W/m^2   
	3   example           1  World       3   ssp119      K   
	4   example           1  World       4   ssp119    ppm   
	..      ...         ...    ...     ...      ...    ...   
	85  example           8  World      85   ssp585    ppm   
	86  example           8  World      86   ssp585  W/m^2   
	87  example           9  World      87   ssp585      K   
	88  example           9  World      88   ssp585    ppm   
	89  example           9  World      89   ssp585  W/m^2   
	
	                          variable  
	0              Surface Temperature  
	1   Atmospheric Concentrations|CO2  
	2                Radiative Forcing  
	3              Surface Temperature  
	4   Atmospheric Concentrations|CO2  
	..                             ...  
	85  Atmospheric Concentrations|CO2  
	86               Radiative Forcing  
	87             Surface Temperature  
	88  Atmospheric Concentrations|CO2  
	89               Radiative Forcing  
	
	[90 rows x 7 columns]

Data for each scenario can then be loaded independently, instead of having to load all the data and then filter it.

# One file per scenario, with both id columns stored as dimensions
for subset in large_run.groupby("scenario"):
    name = subset.get_unique_meta("scenario", no_duplicates=True)
    sparse_path = OUTPUT_DIR / f"out-{name}-sparse.nc"
    subset.to_nc(sparse_path, dimensions=["run_id", "paraset_id"])
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)
# Only the ssp585 file is read; the other scenarios stay on disk
ssp585_sparse = ScmRun.from_nc(OUTPUT_DIR / "out-ssp585-sparse.nc")
ssp585_sparse.filter(variable="Surface Temperature").line_plot()
../_images/1b5fb09684f52f8d5b2c0e7c794b80b1ddff94c63e9622dedc33c194115e9f00.png

For such a data set, since both run_id and paraset_id vary, both could be added as dimensions in the file.

The one problem with this approach is that you get very sparse arrays: in each scenario’s file, every variable is written on a 100 x 30 x 10 (time points x run_id x paraset_id) grid, but the file contains only 30 timeseries, so most of the grid is filled with nans (although this is a relatively small problem because the netCDF files use compression to minimise the impact of the extra nan values).

xr.load_dataset(OUTPUT_DIR / "out-ssp585-sparse.nc")
<xarray.Dataset>
Dimensions:                          (time: 100, run_id: 30, paraset_id: 10)
Coordinates:
  * time                             (time) datetime64[ns] 2000-01-01 ... 209...
  * run_id                           (run_id) int64 60 61 62 63 ... 86 87 88 89
  * paraset_id                       (paraset_id) int64 0 1 2 3 4 5 6 7 8 9
Data variables:
    Surface_Temperature              (run_id, paraset_id, time) float64 0.0 ....
    Atmospheric_Concentrations__CO2  (run_id, paraset_id, time) float64 nan ....
    Radiative_Forcing                (run_id, paraset_id, time) float64 nan ....
Attributes:
    scmdata_metadata_scenario:  ssp585
    scmdata_metadata_model:     example
    scmdata_metadata_region:    World
    created_at:                 2024-01-29T07:18:02.481132
    _scmdata_version:           1.0.0
# Load all scenarios: read every per-scenario sparse file and join the results
sparse_files = OUTPUT_DIR.glob("out-ssp*-sparse.nc")
run_append([ScmRun.from_nc(fname) for fname in sparse_files])
<ScmRun (timeseries: 90, timepoints: 100)>
Time:
	Start: 2000-01-01T00:00:00
	End: 2099-01-01T00:00:00
Meta:
	      model  paraset_id region  run_id scenario   unit  \
	0   example           0  World      30   ssp370      K   
	1   example           0  World      31   ssp370    ppm   
	2   example           0  World      32   ssp370  W/m^2   
	3   example           1  World      33   ssp370      K   
	4   example           1  World      34   ssp370    ppm   
	..      ...         ...    ...     ...      ...    ...   
	85  example           8  World      85   ssp585    ppm   
	86  example           8  World      86   ssp585  W/m^2   
	87  example           9  World      87   ssp585      K   
	88  example           9  World      88   ssp585    ppm   
	89  example           9  World      89   ssp585  W/m^2   
	
	                          variable  
	0              Surface Temperature  
	1   Atmospheric Concentrations|CO2  
	2                Radiative Forcing  
	3              Surface Temperature  
	4   Atmospheric Concentrations|CO2  
	..                             ...  
	85  Atmospheric Concentrations|CO2  
	86               Radiative Forcing  
	87             Surface Temperature  
	88  Atmospheric Concentrations|CO2  
	89               Radiative Forcing  
	
	[90 rows x 7 columns]

An alternative to the sparse arrays is to specify the variables in the extras attribute. If possible, this adds the metadata to the netCDF file as an extra co-ordinate, which uses one of the dimensions as its co-ordinate. If using one of the dimensions as a co-ordinate would not specify the metadata uniquely, we add the extra as an additional co-ordinate, which itself has co-ordinates of _id. This _id co-ordinate provides a unique mapping between the extra metadata and the timeseries.

# One file per scenario: run_id is the only dimension, paraset_id an extra
for subset in large_run.groupby("scenario"):
    name = subset.get_unique_meta("scenario", no_duplicates=True)
    extras_path = OUTPUT_DIR / f"out-{name}-extras.nc"
    subset.to_nc(extras_path, dimensions=["run_id"], extras=["paraset_id"])
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:201: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  max_count = checker.groupby(col2).count().max()[0]
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:201: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  max_count = checker.groupby(col2).count().max()[0]
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:201: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  max_count = checker.groupby(col2).count().max()[0]
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)

paraset_id is uniquely defined by run_id so we don’t end up with an extra _id co-ordinate.

xr.load_dataset(OUTPUT_DIR / "out-ssp585-extras.nc")
<xarray.Dataset>
Dimensions:                          (time: 100, run_id: 30)
Coordinates:
  * time                             (time) datetime64[ns] 2000-01-01 ... 209...
  * run_id                           (run_id) int64 60 61 62 63 ... 86 87 88 89
    paraset_id                       (run_id) int64 0 0 0 1 1 1 ... 8 8 8 9 9 9
Data variables:
    Surface_Temperature              (run_id, time) float64 0.0 0.6824 ... nan
    Atmospheric_Concentrations__CO2  (run_id, time) float64 nan nan ... nan nan
    Radiative_Forcing                (run_id, time) float64 nan nan ... 9.579
Attributes:
    scmdata_metadata_scenario:  ssp585
    scmdata_metadata_model:     example
    scmdata_metadata_region:    World
    created_at:                 2024-01-29T07:18:03.360352
    _scmdata_version:           1.0.0
# Read the ssp585 file written with extras and plot its temperature series
ssp585_extras = ScmRun.from_nc(OUTPUT_DIR / "out-ssp585-extras.nc")
ssp585_extras.filter(variable="Surface Temperature").line_plot()
../_images/1b5fb09684f52f8d5b2c0e7c794b80b1ddff94c63e9622dedc33c194115e9f00.png

If we use dimensions and extras such that our extra co-ordinates are not uniquely defined by the dimensions, an _id dimension is automatically added to ensure we don’t lose any information.

# All scenarios in one file; scenario alone does not pin down paraset_id or run_id
extras_sparse_path = OUTPUT_DIR / "out-extras-sparse.nc"
large_run.to_nc(
    extras_sparse_path,
    dimensions=["scenario"],
    extras=["paraset_id", "run_id"],
)
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:201: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  max_count = checker.groupby(col2).count().max()[0]
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:201: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  max_count = checker.groupby(col2).count().max()[0]
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:234: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  timeseries.T.stack([*dimensions, "_id"])
xr.load_dataset(OUTPUT_DIR / "out-extras-sparse.nc")
<xarray.Dataset>
Dimensions:                          (time: 100, scenario: 3, _id: 90)
Coordinates:
  * time                             (time) datetime64[ns] 2000-01-01 ... 209...
  * scenario                         (scenario) <U6 'ssp119' 'ssp370' 'ssp585'
  * _id                              (_id) int64 0 1 2 3 4 5 ... 85 86 87 88 89
    paraset_id                       (_id) int64 0 0 0 1 1 1 2 ... 7 8 8 8 9 9 9
    run_id                           (_id) int64 0 1 2 3 4 5 ... 85 86 87 88 89
Data variables:
    Surface_Temperature              (scenario, time, _id) float64 0.0 ... nan
    Atmospheric_Concentrations__CO2  (scenario, time, _id) float64 nan ... nan
    Radiative_Forcing                (scenario, time, _id) float64 nan ... 9.579
Attributes:
    scmdata_metadata_model:   example
    scmdata_metadata_region:  World
    created_at:               2024-01-29T07:18:04.000759
    _scmdata_version:         1.0.0

Multi-dimensional data

scmdata can also handle having more than one dimension. This can be especially helpful if you have output from a number of models (IAMs), scenarios, regions and runs.

# Every combination of model, scenario, region and parameter set,
# generated in that nesting order (model outermost, paraset_id innermost)
multi_dimensional_run = run_append(
    [
        new_timeseries(
            count=3,
            model=model,
            scenario=sce,
            region=region,
            variable=[
                "Surface Temperature",
                "Atmospheric Concentrations|CO2",
                "Radiative Forcing",
            ],
            unit=["K", "ppm", "W/m^2"],
            paraset_id=paraset_id,
        )
        for model in ["AIM", "GCAM", "MESSAGE", "REMIND"]
        for sce in ["ssp119", "ssp370", "ssp585"]
        for region in ["World", "R5LAM", "R5MAF", "R5ASIA", "R5OECD", "R5REF"]
        for paraset_id in range(10)
    ]
)

multi_dimensional_run
<ScmRun (timeseries: 2160, timepoints: 100)>
Time:
	Start: 2000-01-01T00:00:00
	End: 2099-01-01T00:00:00
Meta:
	       model  paraset_id region scenario   unit  \
	0        AIM           0  World   ssp119      K   
	1        AIM           0  World   ssp119    ppm   
	2        AIM           0  World   ssp119  W/m^2   
	3        AIM           1  World   ssp119      K   
	4        AIM           1  World   ssp119    ppm   
	...      ...         ...    ...      ...    ...   
	2155  REMIND           8  R5REF   ssp585    ppm   
	2156  REMIND           8  R5REF   ssp585  W/m^2   
	2157  REMIND           9  R5REF   ssp585      K   
	2158  REMIND           9  R5REF   ssp585    ppm   
	2159  REMIND           9  R5REF   ssp585  W/m^2   
	
	                            variable  
	0                Surface Temperature  
	1     Atmospheric Concentrations|CO2  
	2                  Radiative Forcing  
	3                Surface Temperature  
	4     Atmospheric Concentrations|CO2  
	...                              ...  
	2155  Atmospheric Concentrations|CO2  
	2156               Radiative Forcing  
	2157             Surface Temperature  
	2158  Atmospheric Concentrations|CO2  
	2159               Radiative Forcing  
	
	[2160 rows x 6 columns]
multi_dim_outfile = OUTPUT_DIR / "out-multi-dimensional.nc"
# Four dimensions besides time: 6 regions x 4 models x 3 scenarios x 10
# parameter sets, i.e. one grid cell per timeseries of each variable
multi_dimensional_run.to_nc(
    multi_dim_outfile,
    dimensions=("region", "model", "scenario", "paraset_id"),
)
/home/docs/checkouts/readthedocs.org/user_builds/scmdata/checkouts/stable/src/scmdata/_xarray.py:236: FutureWarning: The previous implementation of stack is deprecated and will be removed in a future version of pandas. See the What's New notes for pandas 2.1.0 for details. Specify future_stack=True to adopt the new implementation and silence this warning.
  else timeseries.T.stack(dimensions)
xr.load_dataset(multi_dim_outfile)
<xarray.Dataset>
Dimensions:                          (time: 100, scenario: 3, region: 6,
                                      paraset_id: 10, model: 4)
Coordinates:
  * time                             (time) datetime64[ns] 2000-01-01 ... 209...
  * scenario                         (scenario) <U6 'ssp119' 'ssp370' 'ssp585'
  * region                           (region) <U6 'R5ASIA' 'R5LAM' ... 'World'
  * paraset_id                       (paraset_id) int64 0 1 2 3 4 5 6 7 8 9
  * model                            (model) <U7 'AIM' 'GCAM' 'MESSAGE' 'REMIND'
Data variables:
    Surface_Temperature              (region, model, scenario, paraset_id, time) float64 ...
    Atmospheric_Concentrations__CO2  (region, model, scenario, paraset_id, time) float64 ...
    Radiative_Forcing                (region, model, scenario, paraset_id, time) float64 ...
Attributes:
    created_at:        2024-01-29T07:18:09.510062
    _scmdata_version:  1.0.0
# Read the file back and keep only the CO2 concentration timeseries
multi_dim_loaded = ScmRun.from_nc(multi_dim_outfile)
multi_dim_loaded_co2_conc = multi_dim_loaded.filter(
    variable="Atmospheric Concentrations|CO2"
)

# Long format (one row per timeseries and time point) for plotting with seaborn
seaborn_df = multi_dim_loaded_co2_conc.long_data()
seaborn_df.head()
model paraset_id region scenario unit variable time value
0 AIM 0 R5ASIA ssp119 ppm Atmospheric Concentrations|CO2 2000-01-01 0.000000
1 AIM 0 R5ASIA ssp119 ppm Atmospheric Concentrations|CO2 2001-01-01 0.730551
2 AIM 0 R5ASIA ssp119 ppm Atmospheric Concentrations|CO2 2002-01-01 0.142061
3 AIM 0 R5ASIA ssp119 ppm Atmospheric Concentrations|CO2 2003-01-01 2.977609
4 AIM 0 R5ASIA ssp119 ppm Atmospheric Concentrations|CO2 2004-01-01 0.785949
sns.relplot(
    data=seaborn_df,
    x="time",
    y="value",
    # units + estimator=None: draw each paraset_id as its own line instead of
    # aggregating the parameter sets into a single summary line
    units="paraset_id",
    estimator=None,
    hue="scenario",  # colour by scenario
    style="model",  # line style by model
    col="region",  # one panel per region ...
    col_wrap=3,  # ... three panels per row
    kind="line",
)
<seaborn.axisgrid.FacetGrid at 0x7f353ebbc490>
../_images/4bb1339d0487ea8f7538738be0a5eb0e9b549dda7cf92ee68003be2aa72d6bb0.png