Skip to content

Commit 196c6e4

Browse files
committed
Merge branch 'main' into add-intake-esgf-support
2 parents 8ae8f43 + 60b4f8d commit 196c6e4

File tree

32 files changed

+661
-8
lines changed

32 files changed

+661
-8
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,13 @@ repos:
3333
- id: codespell
3434
additional_dependencies: [tomli] # required for Python 3.10
3535
- repo: https://github.com/astral-sh/ruff-pre-commit
36-
rev: "v0.12.4"
36+
rev: "v0.12.7"
3737
hooks:
3838
- id: ruff-check
3939
args: [--fix]
4040
- id: ruff-format
4141
- repo: https://github.com/pre-commit/mirrors-mypy
42-
rev: 'v1.17.0'
42+
rev: 'v1.17.1'
4343
hooks:
4444
- id: mypy
4545
additional_dependencies:

environment.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
- nodefaults
66

77
dependencies:
8+
- aiohttp
89
- cartopy
910
- cf-units
1011
- cftime
@@ -13,12 +14,14 @@ dependencies:
1314
- distributed
1415
- esgf-pyclient >=0.3.1
1516
- esmpy
17+
- esmvaltool-sample-data
1618
- filelock
1719
- fiona
1820
- fire
1921
- geopy
2022
- humanfriendly
2123
- intake-esgf
24+
- intake-esm
2225
- iris >=3.12.2 # https://github.com/SciTools/iris/issues/6417
2326
- iris-esmf-regrid >=0.11.0
2427
- iris-grib >=0.20.0 # github.com/ESMValGroup/ESMValCore/issues/2535
@@ -47,6 +50,7 @@ dependencies:
4750
- shapely >=2.0.0
4851
- xarray
4952
- yamale
53+
- zarr >3
5054
# Python packages needed for building docs
5155
- autodocsumm >=0.2.2
5256
- ipython <9.0 # github.com/ESMValGroup/ESMValCore/issues/2680
@@ -67,5 +71,3 @@ dependencies:
6771
- pydocstyle
6872
- pylint
6973
# Not on conda forge - vprof
70-
- pip:
71-
- ESMValTool_sample_data

esmvalcore/preprocessor/_io.py

Lines changed: 89 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
from itertools import groupby
1010
from pathlib import Path
1111
from typing import TYPE_CHECKING, Any
12+
from urllib.parse import urlparse
1213

14+
import fsspec
1315
import iris
1416
import ncdata
1517
import xarray as xr
@@ -82,6 +84,7 @@ def load(
8284
| xr.Dataset
8385
| ncdata.NcData,
8486
ignore_warnings: list[dict[str, Any]] | None = None,
87+
backend_kwargs: dict[str, Any] | None = None,
8588
) -> CubeList:
8689
"""Load Iris cubes.
8790
@@ -90,10 +93,19 @@ def load(
9093
file:
9194
File to be loaded. If ``file`` is already a loaded dataset, return it
9295
as a :class:`~iris.cube.CubeList`.
96+
A ``file`` given as a ``Path`` object may also be a Zarr store.
9397
ignore_warnings:
9498
Keyword arguments passed to :func:`warnings.filterwarnings` used to
9599
ignore warnings issued by :func:`iris.load_raw`. Each list element
96100
corresponds to one call to :func:`warnings.filterwarnings`.
101+
backend_kwargs:
102+
Dict to hold info needed by storage backend e.g. to access
103+
a PRIVATE S3 bucket containing object stores (e.g. netCDF4 files);
104+
needed by ``fsspec`` and its extensions e.g. ``s3fs``, so
105+
most of the time this will include ``storage_options``. Note that Zarr
106+
files are opened via ``http`` extension of ``fsspec``, so no need
107+
for ``storage_options`` in that case (i.e. anonymous access). Currently only used
108+
in Zarr file opening.
97109
98110
Returns
99111
-------
@@ -108,10 +120,22 @@ def load(
108120
Invalid type for ``file``.
109121
110122
"""
111-
if isinstance(file, DataElement):
123+
if isinstance(file, (str, Path)):
124+
extension = (
125+
file.suffix
126+
if isinstance(file, Path)
127+
else os.path.splitext(file)[1]
128+
)
129+
if "zarr" not in extension:
130+
cubes = _load_from_file(file, ignore_warnings=ignore_warnings)
131+
else:
132+
cubes = _load_zarr(
133+
file,
134+
ignore_warnings=ignore_warnings,
135+
backend_kwargs=backend_kwargs,
136+
)
137+
elif isinstance(file, DataElement):
112138
cubes = file.to_iris()
113-
elif isinstance(file, (str, Path)):
114-
cubes = _load_from_file(file, ignore_warnings=ignore_warnings)
115139
elif isinstance(file, Cube):
116140
cubes = CubeList([file])
117141
elif isinstance(file, CubeList):
@@ -143,6 +167,68 @@ def load(
143167
return cubes
144168

145169

170+
def _load_zarr(
171+
file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData,
172+
ignore_warnings: list[dict[str, Any]] | None = None,
173+
backend_kwargs: dict[str, Any] | None = None,
174+
) -> CubeList:
175+
# note on ``chunks`` kwarg to ``xr.open_dataset()``
176+
# docs.xarray.dev/en/stable/generated/xarray.open_dataset.html
177+
# this is very important because with ``chunks=None`` (default)
178+
# data will be realized as Numpy arrays and transferred in memory;
179+
# ``chunks={}`` loads the data with dask using the engine preferred
180+
# chunk size, generally identical to the formats chunk size. If not
181+
# available, a single chunk for all arrays; testing shows this is the
182+
# "best guess" compromise for typically CMIP-like chunked data.
183+
# see https://github.com/pydata/xarray/issues/10612 and
184+
# https://github.com/pp-mo/ncdata/issues/139
185+
186+
time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)
187+
open_kwargs = {
188+
"consolidated": False,
189+
"decode_times": time_coder,
190+
"engine": "zarr",
191+
"chunks": {},
192+
"backend_kwargs": backend_kwargs,
193+
}
194+
195+
# case 1: Zarr store is on remote object store
196+
# file's URI will always be either http or https
197+
if urlparse(str(file)).scheme in ["http", "https"]:
198+
# basic test that opens the Zarr/.zmetadata file for Zarr2
199+
# or Zarr/zarr.json for Zarr3
200+
fs = fsspec.filesystem("http")
201+
valid_zarr = True
202+
try:
203+
fs.open(str(file) + "/zarr.json", "rb") # Zarr3
204+
except Exception: # noqa: BLE001
205+
try:
206+
fs.open(str(file) + "/.zmetadata", "rb") # Zarr2
207+
except Exception: # noqa: BLE001
208+
valid_zarr = False
209+
# we don't want to catch any specific aiohttp/fsspec exception
210+
# bottom line is that that file has issues, so raise
211+
if not valid_zarr:
212+
msg = (
213+
f"File '{file}' can not be opened as Zarr file at the moment."
214+
)
215+
raise ValueError(msg)
216+
217+
open_kwargs["consolidated"] = True
218+
zarr_xr = xr.open_dataset(file, **open_kwargs)
219+
# case 2: Zarr store is local to the file system
220+
else:
221+
zarr_xr = xr.open_dataset(file, **open_kwargs)
222+
223+
# avoid possible
224+
# ValueError: Object has inconsistent chunks along dimension time.
225+
# This can be fixed by calling unify_chunks().
226+
# when trying to access the ``chunks`` store
227+
zarr_xr = zarr_xr.unify_chunks()
228+
229+
return dataset_to_iris(zarr_xr, ignore_warnings=ignore_warnings)
230+
231+
146232
def _load_from_file(
147233
file: str | Path,
148234
ignore_warnings: list[dict[str, Any]] | None = None,

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ dynamic = [
3232
"version",
3333
]
3434
dependencies = [
35+
"aiohttp",
3536
"cartopy",
3637
"cf-units",
3738
"dask[array,distributed]>=2025", # Core/issues/2503
@@ -45,6 +46,7 @@ dependencies = [
4546
"geopy",
4647
"humanfriendly",
4748
"intake-esgf",
49+
"intake-esm",
4850
"iris-grib>=0.20.0", # github.com/ESMValGroup/ESMValCore/issues/2535
4951
"isodate>=0.7.0",
5052
"jinja2",
@@ -69,6 +71,7 @@ dependencies = [
6971
"stratify>=0.3",
7072
"xarray",
7173
"yamale",
74+
"zarr>3",
7275
]
7376
description = "A community tool for pre-processing data from Earth system models in CMIP and running analysis scripts"
7477
license = {text = "Apache License, Version 2.0"}
@@ -84,7 +87,7 @@ test = [
8487
"pytest-metadata>=1.5.1",
8588
"pytest-mock",
8689
"pytest-xdist",
87-
"ESMValTool_sample_data==0.0.3",
90+
"ESMValTool_sample_data==0.0.4",
8891
]
8992
doc = [
9093
"autodocsumm>=0.2.2",

0 commit comments

Comments
 (0)