From ffe0ebf7b4a31e1867945507db9c3504e3865403 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:15:04 +0200 Subject: [PATCH 01/69] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 45b79435b..6f07f7d82 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ out-* *.pyc **/*.zarr/* .DS_Store +*.parquet .vscode .env From 9ebb653cca1164009a7ad03de71cc727d3fc2c12 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:17:27 +0200 Subject: [PATCH 02/69] Disable zarr writing --- pyproject.toml | 1 + src/parcels/_core/particlefile.py | 27 ++++++------- tests/test_advection.py | 1 + tests/test_fieldset.py | 1 + tests/test_particlefile.py | 63 +++++++++++++++++++++++++++++++ tests/test_uxadvection.py | 1 + 6 files changed, 81 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85aba3a67..0d4d56784 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ markers = [ # can be skipped by doing `pytest -m "not slow"` etc. "v4alpha: failing tests that should work for v4alpha", "v4future: failing tests that should work for a future release of v4", "v4remove: failing tests that should probably be removed later", + "uses_old_zarr: tests that need to be migrated to the new particleset format" ] filterwarnings = [ diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 788c6e572..0d4e0d53b 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -222,21 +222,22 @@ def _write_particle_data(self, *, particle_data, pclass, time_interval, time, in dims = ["trajectory", "obs"] ds[var.name] = xr.DataArray(data=data, dims=dims, attrs=attrs[var.name]) ds[var.name].encoding["chunks"] = self.chunks[0] if var.to_write == "once" else self.chunks - ds.to_zarr(store, mode="w") + # ds.to_zarr(store, mode="w") self._create_new_zarrfile = False else: - Z = zarr.group(store=store, overwrite=False) - obs = particle_data["obs_written"][indices_to_write] - for var in vars_to_write: - if self._maxids > Z[var.name].shape[0]: - self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=0) - if var.to_write == "once": - if len(once_ids) > 0: - Z[var.name].vindex[ids_once] = particle_data[var.name][indices_to_write_once] - else: - if max(obs) >= Z[var.name].shape[1]: - self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=1) - Z[var.name].vindex[ids, obs] = particle_data[var.name][indices_to_write] + pass + # Z = zarr.group(store=store, overwrite=False) + # obs = particle_data["obs_written"][indices_to_write] + # for var in vars_to_write: + # if self._maxids > Z[var.name].shape[0]: + # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=0) + # if var.to_write == "once": + # if len(once_ids) > 0: + # Z[var.name].vindex[ids_once] = particle_data[var.name][indices_to_write_once] + # else: + # if max(obs) >= Z[var.name].shape[1]: + # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=1) + # Z[var.name].vindex[ids, obs] = particle_data[var.name][indices_to_write] particle_data["obs_written"][indices_to_write] = obs + 1 diff --git a/tests/test_advection.py b/tests/test_advection.py index d8c6d2a45..06e46e231 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -60,6 +60,7 @@ def test_advection_zonal(mesh, npart=10): np.testing.assert_allclose(pset.lat, startlat, atol=1e-5) 
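Context for the commit above and the test changes below: the `uses_old_zarr` marker registered in pyproject.toml tags every test that still reads zarr output, so they can be deselected with `pytest -m "not uses_old_zarr"` while the parquet backend lands, and the new `test_particle_schema` test added below exercises `_get_schema` (implemented in the next commit), which maps each writable particle Variable to a pyarrow field whose metadata carries the Variable's attrs. A minimal standalone sketch of that metadata round-trip, assuming nothing beyond stock pyarrow (the field name and attrs here are illustrative, not the library's):

    import numpy as np
    import pyarrow as pa

    # Build a field the way _get_schema does: numpy dtype -> arrow type,
    # with the Variable's attrs attached as field-level metadata.
    field = pa.field("lon", pa.from_numpy_dtype(np.float32), metadata={"units": "degrees_east"})
    schema = pa.schema([field], metadata={"feature_type": "trajectory"})

    # pyarrow stores metadata as bytes -> bytes, which is why the test
    # below decodes keys and values before comparing against Variable.attrs.
    assert {k.decode(): v.decode() for k, v in schema.field("lon").metadata.items()} == {
        "units": "degrees_east"
    }
    assert schema.metadata[b"feature_type"] == b"trajectory"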
+@pytest.mark.uses_old_zarr def test_advection_zonal_with_particlefile(tmp_store): """Particles at high latitude move geographically faster due to the pole correction.""" npart = 10 diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index b2b05d33f..e51d13f38 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -95,6 +95,7 @@ def test_fieldset_gridset(fieldset): assert len(fieldset.gridset) == 2 +@pytest.mark.uses_old_zarr def test_fieldset_no_UV(tmp_zarrfile): grid = XGrid.from_dataset(ds, mesh="flat") fieldset = FieldSet([Field("P", ds["U_A_grid"], grid, interp_method=XLinear)]) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 84cb90ffa..5c5f6d566 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta import numpy as np +import pyarrow as pa import pytest import xarray as xr from zarr.storage import MemoryStore @@ -21,6 +22,7 @@ XGrid, ) from parcels._core.particle import Particle, create_particle_data, get_default_particle +from parcels._core.particlefile import _get_schema from parcels._core.utils.time import TimeInterval, timedelta_to_float from parcels._datasets.structured.generated import peninsula_dataset from parcels._datasets.structured.generic import datasets @@ -44,6 +46,7 @@ def fieldset() -> FieldSet: # TODO v4: Move into a `conftest.py` file and remov ) +@pytest.mark.uses_old_zarr def test_metadata(fieldset, tmp_zarrfile): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) @@ -54,6 +57,7 @@ def test_metadata(fieldset, tmp_zarrfile): assert ds.attrs["parcels_kernels"].lower() == "DoNothing".lower() +@pytest.mark.uses_old_zarr def test_pfile_array_write_zarr_memorystore(fieldset): """Check that writing to a Zarr MemoryStore works.""" npart = 10 @@ -72,6 +76,7 @@ def test_pfile_array_write_zarr_memorystore(fieldset): assert ds.sizes["trajectory"] == npart +@pytest.mark.uses_old_zarr def test_write_fieldset_without_time(tmp_zarrfile): ds = peninsula_dataset() # DataSet without time assert "time" not in ds.dims @@ -87,6 +92,7 @@ def test_write_fieldset_without_time(tmp_zarrfile): assert ds.time.values[0, 1] == np.timedelta64(1, "s") +@pytest.mark.uses_old_zarr def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): npart = 10 pset = ParticleSet( @@ -108,6 +114,7 @@ def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) +@pytest.mark.uses_old_zarr @pytest.mark.parametrize("chunks_obs", [1, None]) def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_zarrfile): npart = 10 @@ -135,6 +142,7 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_zarrfile): assert np.all(np.isnan(ds["time"][:, 1:])) +@pytest.mark.uses_old_zarr def test_variable_write_double(fieldset, tmp_zarrfile): def Update_lon(particles, fieldset): # pragma: no cover particles.dlon += 0.1 @@ -150,6 +158,7 @@ def Update_lon(particles, fieldset): # pragma: no cover assert isinstance(lons.values[0, 0], np.float64) +@pytest.mark.uses_old_zarr def test_write_dtypes_pfile(fieldset, tmp_zarrfile): dtypes = [ np.float32, @@ -226,6 +235,7 @@ def IncrLon(particles, fieldset): # pragma: no cover assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB +@pytest.mark.uses_old_zarr def test_file_warnings(fieldset, tmp_zarrfile): pset = ParticleSet(fieldset, lon=[0, 0], lat=[0, 0], time=[np.timedelta64(0, "s"), np.timedelta64(1, "s")]) pfile = 
ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(2, "s")) @@ -250,6 +260,7 @@ def test_outputdt_types(outputdt, expectation, tmp_zarrfile): assert pfile.outputdt == timedelta_to_float(outputdt) +@pytest.mark.uses_old_zarr def test_write_timebackward(fieldset, tmp_zarrfile): release_time = fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(3)] pset = ParticleSet(fieldset, lat=[0, 1, 2], lon=[0, 0, 0], time=release_time) @@ -322,6 +333,7 @@ def SampleP(particles, fieldset): # pragma: no cover assert fieldset.U.grid.lat[yi] <= lat < fieldset.U.grid.lat[yi + 1] +@pytest.mark.uses_old_zarr @pytest.mark.parametrize("outputdt", [np.timedelta64(1, "s"), np.timedelta64(2, "s"), np.timedelta64(3, "s")]) def test_time_is_age(fieldset, tmp_zarrfile, outputdt): # Test that particle age is same as time - initial_time @@ -346,6 +358,7 @@ def IncreaseAge(particles, fieldset): # pragma: no cover np.testing.assert_equal(age, ds_timediff) +@pytest.mark.uses_old_zarr def test_reset_dt(fieldset, tmp_zarrfile): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions @@ -362,6 +375,7 @@ def Update_lon(particles, fieldset): # pragma: no cover assert np.allclose(pset.lon, 0.6) +@pytest.mark.uses_old_zarr def test_correct_misaligned_outputdt_dt(fieldset, tmp_zarrfile): """Testing that outputdt does not need to be a multiple of dt.""" @@ -398,6 +412,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg return ds +@pytest.mark.uses_old_zarr def test_pset_execute_outputdt_forwards(fieldset): """Testing output data dt matches outputdt in forward time.""" outputdt = timedelta(hours=1) @@ -409,6 +424,7 @@ def test_pset_execute_outputdt_forwards(fieldset): assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) +@pytest.mark.uses_old_zarr def test_pset_execute_output_time_forwards(fieldset): """Testing output times start at initial time and end at initial time + runtime.""" outputdt = np.timedelta64(1, "h") @@ -423,6 +439,7 @@ def test_pset_execute_output_time_forwards(fieldset): ) +@pytest.mark.uses_old_zarr def test_pset_execute_outputdt_backwards(fieldset): """Testing output data dt matches outputdt in backwards time.""" outputdt = timedelta(hours=1) @@ -434,6 +451,7 @@ def test_pset_execute_outputdt_backwards(fieldset): assert np.all(file_outputdt == np.timedelta64(-outputdt)) +@pytest.mark.uses_old_zarr def test_pset_execute_outputdt_backwards_fieldset_timevarying(): """test_pset_execute_outputdt_backwards() still passed despite #1722 as it doesn't account for time-varying fields, which for some reason #1722 @@ -469,6 +487,7 @@ def test_particlefile_init_invalid(tmp_store): # TODO: Add test for read only s ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=1) +@pytest.mark.uses_old_zarr def test_particlefile_write_particle_data(tmp_store): nparticles = 100 @@ -535,3 +554,47 @@ def Update_lon(particles, fieldset): # pragma: no cover # For pytest purposes, we need to reset to original status pset.set_variable_write_status("z", True) pset.set_variable_write_status("lat", True) + + +@pytest.mark.parametrize( + "particle", + [ + Particle, + parcels.ParticleClass( + variables=[ + Variable( + "lon", + dtype=np.float32, + attrs={"standard_name": "longitude", "units": "degrees_east", "axis": "X"}, + ), + Variable( + "lat", + dtype=np.float32, + attrs={"standard_name": 
"latitude", "units": "degrees_north", "axis": "Y"}, + ), + Variable( + "z", + dtype=np.float32, + attrs={"standard_name": "vertical coordinate", "units": "m", "positive": "down"}, + ), + ] + ), + ], +) +def test_particle_schema(particle): + s = _get_schema(particle, {}) + + written_variables = [v for v in particle.variables if v.to_write] + + assert len(s.names) == len(written_variables), ( + "Number of particles in the output schema should be the same as the writable variables in the ParticleClass object." + ) + + for variable, pyarrow_field in zip( + written_variables, + s, + strict=False, + ): + assert variable.name == pyarrow_field.name + assert variable.attrs == {k.decode(): v.decode() for k, v in pyarrow_field.metadata.items()} + assert pa.from_numpy_dtype(variable.dtype) == pyarrow_field.type diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 3f27536f8..95e517140 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -11,6 +11,7 @@ ) +@pytest.mark.uses_old_zarr @pytest.mark.parametrize("integrator", [AdvectionEE, AdvectionRK2, AdvectionRK4]) def test_ux_constant_flow_face_centered_2D(integrator, tmp_zarrfile): ds = datasets_unstructured["ux_constant_flow_face_centered_2D"] From 0218c28f4d8f13af0ce9946c52915693322ab3d7 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 15:41:28 +0200 Subject: [PATCH 03/69] Fix parquet writing --- pyproject.toml | 2 +- src/parcels/_core/particlefile.py | 225 ++++++------------------------ src/parcels/_core/particleset.py | 9 +- src/parcels/_reprs.py | 7 +- tests/conftest.py | 5 + tests/test_advection.py | 11 +- 6 files changed, 64 insertions(+), 195 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0d4d56784..d0e6f7ba1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ markers = [ # can be skipped by doing `pytest -m "not slow"` etc. 
"v4alpha: failing tests that should work for v4alpha", "v4future: failing tests that should work for a future release of v4", "v4remove: failing tests that should probably be removed later", - "uses_old_zarr: tests that need to be migrated to the new particleset format" + "uses_old_zarr: tests that need to be migrated to the new particleset format", ] filterwarnings = [ diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 0d4e0d53b..125a925c7 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -2,16 +2,14 @@ from __future__ import annotations -import os from datetime import datetime, timedelta from pathlib import Path -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal import cftime import numpy as np -import xarray as xr -import zarr -from zarr.storage import DirectoryStore +import pyarrow as pa +import pyarrow.parquet as pq import parcels from parcels._core.particle import ParticleClass @@ -25,20 +23,19 @@ __all__ = ["ParticleFile"] -_DATATYPES_TO_FILL_VALUES = { - np.dtype(np.float16): np.nan, - np.dtype(np.float32): np.nan, - np.dtype(np.float64): np.nan, - np.dtype(np.bool_): np.iinfo(np.int8).max, - np.dtype(np.int8): np.iinfo(np.int8).max, - np.dtype(np.int16): np.iinfo(np.int16).max, - np.dtype(np.int32): np.iinfo(np.int32).max, - np.dtype(np.int64): np.iinfo(np.int64).min, - np.dtype(np.uint8): np.iinfo(np.uint8).max, - np.dtype(np.uint16): np.iinfo(np.uint16).max, - np.dtype(np.uint32): np.iinfo(np.uint32).max, - np.dtype(np.uint64): np.iinfo(np.uint64).max, -} + +def _get_schema(particle: parcels.ParticleClass, file_metadata: dict[Any, Any]) -> pa.Schema: + return pa.schema( + [ + pa.field( + v.name, + pa.from_numpy_dtype(v.dtype), + metadata=v.attrs, + ) + for v in _get_vars_to_write(particle) + ], + metadata=file_metadata.copy(), + ) class ParticleFile: @@ -54,10 +51,6 @@ class ParticleFile: Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a numpy.timedelta64, a datimetime.timedelta object or a positive float (in seconds). - chunks : - Tuple (trajs, obs) to control the size of chunks in the zarr output. - create_new_zarrfile : bool - Whether to create a new file. Default is True Returns ------- @@ -65,7 +58,7 @@ class ParticleFile: ParticleFile object that can be used to write particle data to file """ - def __init__(self, store, outputdt, chunks=None, create_new_zarrfile=True): + def __init__(self, path: Path, outputdt): if not isinstance(outputdt, (np.timedelta64, timedelta, float)): raise ValueError( f"Expected outputdt to be a np.timedelta64, datetime.timedelta or float (in seconds), got {type(outputdt)}" @@ -78,21 +71,19 @@ def __init__(self, store, outputdt, chunks=None, create_new_zarrfile=True): self._outputdt = outputdt - _assert_valid_chunks_tuple(chunks) - self._chunks = chunks + self._path = Path( + path + ) # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? + self._writer: pq.ParquetWriter | None = None + if path.exists(): + # TODO: Add logic for recovering/appending to existing parquet file + raise ValueError(f"{path=!r} already exists. Either delete this file or use a path that doesn't exist.") + if not path.parent.exists(): + raise ValueError(f"Folder location for {path=!r} does not exist. 
Create the folder location first.") + self._maxids = 0 self._pids_written = {} - self.metadata = {} - self._create_new_zarrfile = create_new_zarrfile - - if not isinstance(store, zarr.storage.Store): - store = _get_store_from_pathlike(store) - - self._store = store - - # TODO v4: Enable once updating to zarr v3 - # if store.read_only: - # raise ValueError(f"Store {store} is read-only. Please provide a writable store.") + self.extra_metadata = {} # TODO v4: Add check that if create_new_zarrfile is False, the store already exists @@ -100,7 +91,7 @@ def __repr__(self) -> str: return particlefile_repr(self) def set_metadata(self, parcels_grid_mesh: Literal["spherical", "flat"]): - self.metadata.update( + self.extra_metadata.update( { "feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", @@ -115,31 +106,8 @@ def outputdt(self): return self._outputdt @property - def chunks(self): - return self._chunks - - @property - def store(self): - return self._store - - @property - def create_new_zarrfile(self): - return self._create_new_zarrfile - - def _extend_zarr_dims(self, Z, store, dtype, axis): # noqa: N803 - if axis == 1: - a = np.full((Z.shape[0], self.chunks[1]), _DATATYPES_TO_FILL_VALUES[dtype], dtype=dtype) - obs = zarr.group(store=store, overwrite=False)["obs"] - if len(obs) == Z.shape[1]: - obs.append(np.arange(self.chunks[1]) + obs[-1] + 1) - else: - extra_trajs = self._maxids - Z.shape[0] - if len(Z.shape) == 2: - a = np.full((extra_trajs, Z.shape[1]), _DATATYPES_TO_FILL_VALUES[dtype], dtype=dtype) - else: - a = np.full((extra_trajs,), _DATATYPES_TO_FILL_VALUES[dtype], dtype=dtype) - Z.append(a, axis=axis) - zarr.consolidate_metadata(store) + def path(self): + return self._path def write(self, pset: ParticleSet, time, indices=None): """Write all data from one time step to the zarr file, @@ -156,125 +124,35 @@ def write(self, pset: ParticleSet, time, indices=None): time_interval = pset.fieldset.time_interval particle_data = pset._data - self._write_particle_data( - particle_data=particle_data, pclass=pclass, time_interval=time_interval, time=time, indices=indices - ) + if self._writer is None: + assert not self.path.exists(), "If the file exists, the writer should already be set" + self._writer = pq.ParquetWriter(self.path, _get_schema(pclass, self.extra_metadata)) - def _write_particle_data(self, *, particle_data, pclass, time_interval, time, indices=None): - # if pset._data._ncount == 0: - # warnings.warn( - # f"ParticleSet is empty on writing as array at time {time:g}", - # RuntimeWarning, - # stacklevel=2, - # ) - # return if isinstance(time, (np.timedelta64, np.datetime64)): time = timedelta_to_float(time - time_interval.left) - nparticles = len(particle_data["trajectory"]) vars_to_write = _get_vars_to_write(pclass) if indices is None: indices_to_write = _to_write_particles(particle_data, time) else: indices_to_write = indices + + self._writer.write_table( + pa.table({v.name: pa.array(particle_data[v.name][indices_to_write]) for v in vars_to_write}), + ) - if len(indices_to_write) == 0: - return - - pids = particle_data["trajectory"][indices_to_write] - to_add = sorted(set(pids) - set(self._pids_written.keys())) - for i, pid in enumerate(to_add): - self._pids_written[pid] = self._maxids + i - ids = np.array([self._pids_written[p] for p in pids], dtype=int) - self._maxids = len(self._pids_written) - - once_ids = np.where(particle_data["obs_written"][indices_to_write] == 0)[0] - if len(once_ids) > 0: - ids_once = ids[once_ids] - indices_to_write_once = indices_to_write[once_ids] - 
- store = self.store - if self.create_new_zarrfile: - if self.chunks is None: - self._chunks = (nparticles, 1) - if (self._maxids > len(ids)) or (self._maxids > self.chunks[0]): - arrsize = (self._maxids, self.chunks[1]) - else: - arrsize = (len(ids), self.chunks[1]) - ds = xr.Dataset( - attrs=self.metadata, - coords={"trajectory": ("trajectory", pids), "obs": ("obs", np.arange(arrsize[1], dtype=np.int32))}, - ) - attrs = _create_variables_attribute_dict(pclass, time_interval) - obs = np.zeros((self._maxids), dtype=np.int32) - for var in vars_to_write: - if var.name not in ["trajectory"]: # because 'trajectory' is written as coordinate - if var.to_write == "once": - data = np.full( - (arrsize[0],), - _DATATYPES_TO_FILL_VALUES[var.dtype], - dtype=var.dtype, - ) - data[ids_once] = particle_data[var.name][indices_to_write_once] - dims = ["trajectory"] - else: - data = np.full(arrsize, _DATATYPES_TO_FILL_VALUES[var.dtype], dtype=var.dtype) - data[ids, 0] = particle_data[var.name][indices_to_write] - dims = ["trajectory", "obs"] - ds[var.name] = xr.DataArray(data=data, dims=dims, attrs=attrs[var.name]) - ds[var.name].encoding["chunks"] = self.chunks[0] if var.to_write == "once" else self.chunks - # ds.to_zarr(store, mode="w") - self._create_new_zarrfile = False - else: - pass - # Z = zarr.group(store=store, overwrite=False) - # obs = particle_data["obs_written"][indices_to_write] - # for var in vars_to_write: - # if self._maxids > Z[var.name].shape[0]: - # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=0) - # if var.to_write == "once": - # if len(once_ids) > 0: - # Z[var.name].vindex[ids_once] = particle_data[var.name][indices_to_write_once] - # else: - # if max(obs) >= Z[var.name].shape[1]: - # self._extend_zarr_dims(Z[var.name], store, dtype=var.dtype, axis=1) - # Z[var.name].vindex[ids, obs] = particle_data[var.name][indices_to_write] - - particle_data["obs_written"][indices_to_write] = obs + 1 - - -def _get_store_from_pathlike(path: Path | str) -> DirectoryStore: - path = str(Path(path)) # Ensure valid path, and convert to string - extension = os.path.splitext(path)[1] - if extension != ".zarr": - raise ValueError(f"ParticleFile name must end with '.zarr' extension. Got path {path!r}.") + # if len(indices_to_write) == 0: # TODO: Remove this? + # return - return DirectoryStore(path) + def close(self): + if self._writer is not None: + self._writer.close() + self._writer = None def _get_vars_to_write(particle: ParticleClass) -> list[Variable]: return [v for v in particle.variables if v.to_write is not False] -def _create_variables_attribute_dict(particle: ParticleClass, time_interval: TimeInterval) -> dict: - """Creates the dictionary with variable attributes. - - Notes - ----- - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. 
- """ - attrs = {} - - vars = [var for var in particle.variables if var.to_write is not False] - for var in vars: - fill_value = {"_FillValue": _DATATYPES_TO_FILL_VALUES[var.dtype]} - - attrs[var.name] = {**var.attrs, **fill_value} - - attrs["time"].update(_get_calendar_and_units(time_interval)) - - return attrs - - def _to_write_particles(particle_data, time): """Return the Particles that need to be written at time: if particle.time is between time-dt/2 and time+dt (/2)""" return np.where( @@ -299,7 +177,7 @@ def _to_write_particles(particle_data, time): )[0] -def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: +def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # TODO: Remove? calendar = None units = "seconds" if time_interval: @@ -316,16 +194,3 @@ def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: attrs["calendar"] = calendar return attrs - - -def _assert_valid_chunks_tuple(chunks): - e = ValueError(f"chunks must be a tuple of integers with length 2, got {chunks=!r} instead.") - if chunks is None: - return - - if not isinstance(chunks, tuple): - raise e - if len(chunks) != 2: - raise e - if not all(isinstance(c, int) for c in chunks): - raise e diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index 5483ffbe4..e4ecb252b 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -20,7 +20,7 @@ ) from parcels._core.warnings import ParticleSetWarning from parcels._logger import logger -from parcels._reprs import _format_zarr_output_location, particleset_repr +from parcels._reprs import particleset_repr __all__ = ["ParticleSet"] @@ -395,7 +395,7 @@ def execute( if output_file is not None: output_file.set_metadata(self.fieldset.gridset[0]._mesh) - output_file.metadata["parcels_kernels"] = self._kernel.funcname + output_file.extra_metadata["parcels_kernels"] = self._kernel.funcname dt, sign_dt = _convert_dt_to_float(dt) self._data["dt"][:] = dt @@ -415,7 +415,7 @@ def execute( # Set up pbar if output_file: - logger.info(f"Output files are stored in {_format_zarr_output_location(output_file.store)}") + logger.info(f"Output files are stored in {output_file.path}") if verbose_progress: pbar = tqdm(total=end_time - start_time, file=sys.stdout) @@ -451,6 +451,9 @@ def execute( time = next_time + if output_file is not None: + output_file.close() + if verbose_progress: pbar.close() diff --git a/src/parcels/_reprs.py b/src/parcels/_reprs.py index ad6d0cca2..34b6814a0 100644 --- a/src/parcels/_reprs.py +++ b/src/parcels/_reprs.py @@ -128,7 +128,7 @@ def timeinterval_repr(ti: Any) -> str: def particlefile_repr(pfile: Any) -> str: """Return a pretty repr for ParticleFile""" out = f"""<{type(pfile).__name__}> - store : {_format_zarr_output_location(pfile.store)} + path : {pfile.path} outputdt : {pfile.outputdt!r} chunks : {pfile.chunks!r} create_new_zarrfile : {pfile.create_new_zarrfile!r} @@ -178,11 +178,6 @@ def _format_list_items_multiline(items: list[str] | dict, level: int = 1, with_b return "\n".join([textwrap.indent(e, indentation_str) for e in entries]) -def _format_zarr_output_location(zarr_obj): - if isinstance(zarr_obj, DirectoryStore): - return zarr_obj.path - return repr(zarr_obj) - def is_builtin_object(obj): return obj.__class__.__module__ == "builtins" diff --git a/tests/conftest.py b/tests/conftest.py index 82020c37e..3308d7e3e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,3 +11,8 @@ def tmp_zarrfile(tmp_path, request): @pytest.fixture def 
tmp_store(): return MemoryStore() + +@pytest.fixture +def tmp_parquet(tmp_path): + return tmp_path / 'tmp.parquet' + diff --git a/tests/test_advection.py b/tests/test_advection.py index 06e46e231..365e9e6f4 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -4,6 +4,7 @@ import parcels import parcels.tutorial +import pandas as pd from parcels import ( Field, FieldSet, @@ -60,8 +61,7 @@ def test_advection_zonal(mesh, npart=10): np.testing.assert_allclose(pset.lat, startlat, atol=1e-5) -@pytest.mark.uses_old_zarr -def test_advection_zonal_with_particlefile(tmp_store): +def test_advection_zonal_with_particlefile(tmp_parquet): """Particles at high latitude move geographically faster due to the pole correction.""" npart = 10 ds = simple_UV_dataset(mesh="flat") @@ -69,12 +69,13 @@ def test_advection_zonal_with_particlefile(tmp_store): fieldset = FieldSet.from_sgrid_conventions(ds, mesh="flat") pset = ParticleSet(fieldset, lon=np.zeros(npart) + 20.0, lat=np.linspace(0, 80, npart)) - pfile = ParticleFile(tmp_store, outputdt=np.timedelta64(30, "m")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(30, "m")) pset.execute(AdvectionRK4, runtime=np.timedelta64(2, "h"), dt=np.timedelta64(15, "m"), output_file=pfile) assert (np.diff(pset.lon) < 1.0e-4).all() - ds = xr.open_zarr(tmp_store) - np.testing.assert_allclose(ds.isel(obs=-1).lon.values, pset.lon) + df = pd.read_parquet(tmp_parquet) + final_time = df["time"].max() + np.testing.assert_allclose(df[df["time"] == final_time]["lon"].values, pset.lon, atol=1e-5) def periodicBC(particles, fieldset): From 4e7de3eab2af85c3c33e2d38fc1872606db5d601 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:48:53 +0200 Subject: [PATCH 04/69] Remove test_variable_write_double From bc653f1ec5ac2691572aa22d2c802a4885224a71 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:08:16 +0200 Subject: [PATCH 05/69] Fix all "uses_old_zarr" tests --- tests-v3/test_advection.py | 6 +- tests-v3/test_fieldset_sampling.py | 6 +- tests-v3/test_particlesets.py | 6 +- tests/conftest.py | 2 +- tests/test_fieldset.py | 10 +-- tests/test_particlefile.py | 112 ++++++++++++++--------------- tests/test_uxadvection.py | 10 +-- 7 files changed, 76 insertions(+), 76 deletions(-) diff --git a/tests-v3/test_advection.py b/tests-v3/test_advection.py index 3d8f06bac..77abee900 100644 --- a/tests-v3/test_advection.py +++ b/tests-v3/test_advection.py @@ -79,7 +79,7 @@ def test_analyticalAgrid(): @pytest.mark.parametrize("v", [1, -0.3, 0, -1]) @pytest.mark.parametrize("w", [None, 1, -0.3, 0, -1]) @pytest.mark.parametrize("direction", [1, -1]) -def test_uniform_analytical(u, v, w, direction, tmp_zarrfile): +def test_uniform_analytical(u, v, w, direction, tmp_parquet): lon = np.arange(0, 15, dtype=np.float32) lat = np.arange(0, 15, dtype=np.float32) if w is not None: @@ -99,14 +99,14 @@ def test_uniform_analytical(u, v, w, direction, tmp_zarrfile): x0, y0, z0 = 6.1, 6.2, 20 pset = ParticleSet(fieldset, pclass=Particle, lon=x0, lat=y0, depth=z0) - outfile = pset.ParticleFile(name=tmp_zarrfile, outputdt=1, chunks=(1, 1)) + outfile = pset.ParticleFile(name=tmp_parquet, outputdt=1, chunks=(1, 1)) pset.execute(AdvectionAnalytical, runtime=4, dt=direction, output_file=outfile) assert np.abs(pset.lon - x0 - pset.time * u) < 1e-6 assert np.abs(pset.lat - y0 - pset.time * v) < 1e-6 if w is not None: assert np.abs(pset.depth - z0 - pset.time * w) < 1e-4 - ds = 
xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) times = (direction * ds["time"][:]).values.astype("timedelta64[s]")[0] timeref = np.arange(1, 5).astype("timedelta64[s]") assert np.allclose(times, timeref, atol=np.timedelta64(1, "ms")) diff --git a/tests-v3/test_fieldset_sampling.py b/tests-v3/test_fieldset_sampling.py index 291c27b88..5f25e355c 100644 --- a/tests-v3/test_fieldset_sampling.py +++ b/tests-v3/test_fieldset_sampling.py @@ -773,7 +773,7 @@ def test_multiple_grid_addlater_error(): assert fail -def test_fieldset_sampling_updating_order(tmp_zarrfile): +def test_fieldset_sampling_updating_order(tmp_parquet): def calc_p(t, y, x): return 10 * t + x + 0.2 * y @@ -805,10 +805,10 @@ def SampleP(particle, fieldset, time): # pragma: no cover kernels = [AdvectionRK4, SampleP] - pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) + pfile = pset.ParticleFile(tmp_parquet, outputdt=1) pset.execute(kernels, endtime=1, dt=1, output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) for t in range(len(ds["obs"])): for i in range(len(ds["trajectory"])): assert np.isclose( diff --git a/tests-v3/test_particlesets.py b/tests-v3/test_particlesets.py index ed884f595..5c0f2495f 100644 --- a/tests-v3/test_particlesets.py +++ b/tests-v3/test_particlesets.py @@ -39,7 +39,7 @@ def test_pset_create_list_with_customvariable(fieldset): @pytest.mark.parametrize("restart", [True, False]) -def test_pset_create_fromparticlefile(fieldset, restart, tmp_zarrfile): +def test_pset_create_fromparticlefile(fieldset, restart, tmp_parquet): lon = np.linspace(0, 1, 10, dtype=np.float32) lat = np.linspace(1, 0, 10, dtype=np.float32) @@ -48,7 +48,7 @@ def test_pset_create_fromparticlefile(fieldset, restart, tmp_zarrfile): TestParticle = TestParticle.add_variable("p3", np.float64, to_write="once") pset = ParticleSet(fieldset, lon=lon, lat=lat, depth=[4] * len(lon), pclass=TestParticle, p3=np.arange(len(lon))) - pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) + pfile = pset.ParticleFile(tmp_parquet, outputdt=1) def Kernel(particle, fieldset, time): # pragma: no cover particle.p = 2.0 @@ -58,7 +58,7 @@ def Kernel(particle, fieldset, time): # pragma: no cover pset.execute(Kernel, runtime=2, dt=1, output_file=pfile) pset_new = ParticleSet.from_particlefile( - fieldset, pclass=TestParticle, filename=tmp_zarrfile, restart=restart, repeatdt=1 + fieldset, pclass=TestParticle, filename=tmp_parquet, restart=restart, repeatdt=1 ) for var in ["lon", "lat", "depth", "time", "p", "p2", "p3"]: diff --git a/tests/conftest.py b/tests/conftest.py index 3308d7e3e..56bbfa480 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,7 @@ @pytest.fixture() -def tmp_zarrfile(tmp_path, request): +def tmp_parquet(tmp_path, request): test_name = request.node.name yield tmp_path / f"{test_name}-output.zarr" diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index e51d13f38..d9b8d6000 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -95,8 +95,8 @@ def test_fieldset_gridset(fieldset): assert len(fieldset.gridset) == 2 -@pytest.mark.uses_old_zarr -def test_fieldset_no_UV(tmp_zarrfile): + +def test_fieldset_no_UV(tmp_parquet): grid = XGrid.from_dataset(ds, mesh="flat") fieldset = FieldSet([Field("P", ds["U_A_grid"], grid, interp_method=XLinear)]) @@ -104,11 +104,11 @@ def SampleP(particles, fieldset): particles.dlon += fieldset.P[particles] pset = ParticleSet(fieldset, lon=0, lat=0) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + ofile = 
ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(SampleP, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds_out = xr.open_zarr(tmp_zarrfile) - assert ds_out["lon"].shape == (1, 2) + df = xr.open_zarr(tmp_parquet) + assert len(df["lon"]) == 2 @pytest.mark.parametrize("ds", [pytest.param(ds, id=k) for k, ds in datasets_structured.items()]) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 5c5f6d566..67775fa30 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -46,18 +46,18 @@ def fieldset() -> FieldSet: # TODO v4: Move into a `conftest.py` file and remov ) -@pytest.mark.uses_old_zarr -def test_metadata(fieldset, tmp_zarrfile): + +def test_metadata(fieldset, tmp_parquet): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert ds.attrs["parcels_kernels"].lower() == "DoNothing".lower() -@pytest.mark.uses_old_zarr + def test_pfile_array_write_zarr_memorystore(fieldset): """Check that writing to a Zarr MemoryStore works.""" npart = 10 @@ -76,8 +76,8 @@ def test_pfile_array_write_zarr_memorystore(fieldset): assert ds.sizes["trajectory"] == npart -@pytest.mark.uses_old_zarr -def test_write_fieldset_without_time(tmp_zarrfile): + +def test_write_fieldset_without_time(tmp_parquet): ds = peninsula_dataset() # DataSet without time assert "time" not in ds.dims grid = XGrid.from_dataset(ds, mesh="flat") @@ -85,15 +85,15 @@ def test_write_fieldset_without_time(tmp_zarrfile): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert ds.time.values[0, 1] == np.timedelta64(1, "s") -@pytest.mark.uses_old_zarr -def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): + +def test_pfile_array_remove_particles(fieldset, tmp_parquet): npart = 10 pset = ParticleSet( fieldset, @@ -102,21 +102,21 @@ def test_pfile_array_remove_particles(fieldset, tmp_zarrfile): lat=0.5 * np.ones(npart), time=fieldset.time_interval.left, ) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset._data["time"][:] = 0 pfile.write(pset, time=fieldset.time_interval.left) pset.remove_indices(3) new_time = 86400 # s in a day pset._data["time"][:] = new_time pfile.write(pset, new_time) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) timearr = ds["time"][:] assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) -@pytest.mark.uses_old_zarr + @pytest.mark.parametrize("chunks_obs", [1, None]) -def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_zarrfile): +def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): npart = 10 pset = ParticleSet( fieldset, @@ -126,14 +126,14 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_zarrfile): time=fieldset.time_interval.left, ) chunks = (npart, chunks_obs) if chunks_obs else None - pfile = 
ParticleFile(tmp_zarrfile, chunks=chunks, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, chunks=chunks, outputdt=np.timedelta64(1, "s")) pfile.write(pset, time=0) for _ in range(npart): pset.remove_indices(-1) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(1, "D")) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(2, "D")) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) if chunks_obs is not None: assert ds["time"][:].shape == chunks @@ -158,8 +158,8 @@ def Update_lon(particles, fieldset): # pragma: no cover assert isinstance(lons.values[0, 0], np.float64) -@pytest.mark.uses_old_zarr -def test_write_dtypes_pfile(fieldset, tmp_zarrfile): + +def test_write_dtypes_pfile(fieldset, tmp_parquet): dtypes = [ np.float32, np.float64, @@ -178,11 +178,11 @@ def test_write_dtypes_pfile(fieldset, tmp_zarrfile): MyParticle = Particle.add_variable(extra_vars) pset = ParticleSet(fieldset, pclass=MyParticle, lon=0, lat=0, time=fieldset.time_interval.left) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pfile.write(pset, time=fieldset.time_interval.left) ds = xr.open_zarr( - tmp_zarrfile, mask_and_scale=False + tmp_parquet, mask_and_scale=False ) # Note masking issue at https://stackoverflow.com/questions/68460507/xarray-loading-int-data-as-float for d in dtypes: assert ds[f"v_{d.__name__}"].dtype == d @@ -196,7 +196,7 @@ def test_variable_written_once(): @pytest.mark.skip(reason="Pending ParticleFile refactor; see issue #2386") @pytest.mark.parametrize("dt", [-np.timedelta64(1, "s"), np.timedelta64(1, "s")]) @pytest.mark.parametrize("maxvar", [2, 4, 10]) -def test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_zarrfile, dt, maxvar): +def test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_parquet, dt, maxvar): """Tests that if particles are released and deleted based on age that resulting output file is correct.""" npart = 10 fieldset.add_constant("maxvar", maxvar) @@ -212,7 +212,7 @@ def test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_zarrfile, d pclass=MyParticle, time=fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(npart)], ) - pfile = ParticleFile(tmp_zarrfile, outputdt=abs(dt), chunks=(1, 1)) + pfile = ParticleFile(tmp_parquet, outputdt=abs(dt), chunks=(1, 1)) def IncrLon(particles, fieldset): # pragma: no cover particles.sample_var += 1.0 @@ -225,20 +225,20 @@ def IncrLon(particles, fieldset): # pragma: no cover for _ in range(npart): pset.execute(IncrLon, dt=dt, runtime=np.timedelta64(1, "s"), output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) samplevar = ds["sample_var"][:] assert samplevar.shape == (npart, min(maxvar, npart + 1)) # test whether samplevar[:, k] = k for k in range(samplevar.shape[1]): assert np.allclose([p for p in samplevar[:, k] if np.isfinite(p)], k + 1) - filesize = os.path.getsize(str(tmp_zarrfile)) + filesize = os.path.getsize(str(tmp_parquet)) assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB -@pytest.mark.uses_old_zarr -def test_file_warnings(fieldset, tmp_zarrfile): + +def test_file_warnings(fieldset, tmp_parquet): pset = ParticleSet(fieldset, lon=[0, 0], lat=[0, 0], time=[np.timedelta64(0, "s"), np.timedelta64(1, "s")]) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(2, 
"s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(2, "s")) with pytest.warns(ParticleSetWarning, match="Some of the particles have a start time difference.*"): pset.execute(AdvectionRK4, runtime=3, dt=1, output_file=pfile) @@ -254,20 +254,20 @@ def test_file_warnings(fieldset, tmp_zarrfile): (-np.timedelta64(5, "s"), pytest.raises(ValueError)), ], ) -def test_outputdt_types(outputdt, expectation, tmp_zarrfile): +def test_outputdt_types(outputdt, expectation, tmp_parquet): with expectation: - pfile = ParticleFile(tmp_zarrfile, outputdt=outputdt) + pfile = ParticleFile(tmp_parquet, outputdt=outputdt) assert pfile.outputdt == timedelta_to_float(outputdt) -@pytest.mark.uses_old_zarr -def test_write_timebackward(fieldset, tmp_zarrfile): + +def test_write_timebackward(fieldset, tmp_parquet): release_time = fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(3)] pset = ParticleSet(fieldset, lat=[0, 1, 2], lon=[0, 0, 0], time=release_time) - pfile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(3, "s"), dt=-np.timedelta64(1, "s"), output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) trajs = ds["trajectory"][:] output_time = ds["time"][:].values @@ -280,7 +280,7 @@ def test_write_timebackward(fieldset, tmp_zarrfile): @pytest.mark.xfail @pytest.mark.v4alpha -def test_write_xiyi(fieldset, tmp_zarrfile): +def test_write_xiyi(fieldset, tmp_parquet): fieldset.U.data[:] = 1 # set a non-zero zonal velocity fieldset.add_field( Field(name="P", data=np.zeros((3, 20)), lon=np.linspace(0, 1, 20), lat=[-2, 0, 2], interp_method=XLinear) @@ -311,10 +311,10 @@ def SampleP(particles, fieldset): # pragma: no cover _ = fieldset.P[particles] # To trigger sampling of the P field pset = ParticleSet(fieldset, pclass=XiYiParticle, lon=[0, 0.2], lat=[0.2, 1]) - pfile = ParticleFile(tmp_zarrfile, outputdt=dt) + pfile = ParticleFile(tmp_parquet, outputdt=dt) pset.execute([SampleP, Get_XiYi, AdvectionRK4], endtime=10 * dt, dt=dt, output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) pxi0 = ds["pxi0"][:].values.astype(np.int32) pxi1 = ds["pxi1"][:].values.astype(np.int32) lons = ds["lon"][:].values @@ -333,9 +333,9 @@ def SampleP(particles, fieldset): # pragma: no cover assert fieldset.U.grid.lat[yi] <= lat < fieldset.U.grid.lat[yi + 1] -@pytest.mark.uses_old_zarr + @pytest.mark.parametrize("outputdt", [np.timedelta64(1, "s"), np.timedelta64(2, "s"), np.timedelta64(3, "s")]) -def test_time_is_age(fieldset, tmp_zarrfile, outputdt): +def test_time_is_age(fieldset, tmp_parquet, outputdt): # Test that particle age is same as time - initial_time npart = 10 @@ -346,11 +346,11 @@ def IncreaseAge(particles, fieldset): # pragma: no cover time = fieldset.time_interval.left + np.arange(npart) * np.timedelta64(1, "s") pset = ParticleSet(fieldset, pclass=AgeParticle, lon=npart * [0], lat=npart * [0], time=time) - ofile = ParticleFile(tmp_zarrfile, outputdt=outputdt) + ofile = ParticleFile(tmp_parquet, outputdt=outputdt) pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) age = ds["age"][:].values.astype("timedelta64[s]") ds_timediff = np.zeros_like(age) for i in range(npart): @@ -358,8 +358,8 @@ def IncreaseAge(particles, fieldset): # pragma: no cover np.testing.assert_equal(age, 
ds_timediff) -@pytest.mark.uses_old_zarr -def test_reset_dt(fieldset, tmp_zarrfile): + +def test_reset_dt(fieldset, tmp_parquet): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions dt = np.timedelta64(20, "s") @@ -369,14 +369,14 @@ def Update_lon(particles, fieldset): # pragma: no cover particle = get_default_particle(np.float64) pset = ParticleSet(fieldset, pclass=particle, lon=[0], lat=[0]) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(50, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(50, "s")) pset.execute(Update_lon, runtime=5 * dt, dt=dt, output_file=ofile) assert np.allclose(pset.lon, 0.6) -@pytest.mark.uses_old_zarr -def test_correct_misaligned_outputdt_dt(fieldset, tmp_zarrfile): + +def test_correct_misaligned_outputdt_dt(fieldset, tmp_parquet): """Testing that outputdt does not need to be a multiple of dt.""" def Update_lon(particles, fieldset): # pragma: no cover @@ -384,10 +384,10 @@ def Update_lon(particles, fieldset): # pragma: no cover particle = get_default_particle(np.float64) pset = ParticleSet(fieldset, pclass=particle, lon=[0], lat=[0]) - ofile = ParticleFile(tmp_zarrfile, outputdt=np.timedelta64(3, "s")) + ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(3, "s")) pset.execute(Update_lon, runtime=np.timedelta64(11, "s"), dt=np.timedelta64(2, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert np.allclose(ds.lon.values, [0, 3, 6, 9]) assert np.allclose(timedelta_to_float(ds.time.values - ds.time.values[0, 0]), [0, 3, 6, 9]) @@ -412,7 +412,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg return ds -@pytest.mark.uses_old_zarr + def test_pset_execute_outputdt_forwards(fieldset): """Testing output data dt matches outputdt in forward time.""" outputdt = timedelta(hours=1) @@ -424,7 +424,7 @@ def test_pset_execute_outputdt_forwards(fieldset): assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) -@pytest.mark.uses_old_zarr + def test_pset_execute_output_time_forwards(fieldset): """Testing output times start at initial time and end at initial time + runtime.""" outputdt = np.timedelta64(1, "h") @@ -439,7 +439,7 @@ def test_pset_execute_output_time_forwards(fieldset): ) -@pytest.mark.uses_old_zarr + def test_pset_execute_outputdt_backwards(fieldset): """Testing output data dt matches outputdt in backwards time.""" outputdt = timedelta(hours=1) @@ -451,7 +451,7 @@ def test_pset_execute_outputdt_backwards(fieldset): assert np.all(file_outputdt == np.timedelta64(-outputdt)) -@pytest.mark.uses_old_zarr + def test_pset_execute_outputdt_backwards_fieldset_timevarying(): """test_pset_execute_outputdt_backwards() still passed despite #1722 as it doesn't account for time-varying fields, which for some reason #1722 @@ -487,7 +487,7 @@ def test_particlefile_init_invalid(tmp_store): # TODO: Add test for read only s ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=1) -@pytest.mark.uses_old_zarr + def test_particlefile_write_particle_data(tmp_store): nparticles = 100 @@ -533,19 +533,19 @@ def test_pfile_write_custom_particle(): @pytest.mark.xfail( reason="set_variable_write_status should be removed - with Particle writing defined on the particle level. 
GH2186" ) -def test_pfile_set_towrite_False(fieldset, tmp_zarrfile): +def test_pfile_set_towrite_False(fieldset, tmp_parquet): npart = 10 pset = ParticleSet(fieldset, pclass=Particle, lon=np.linspace(0, 1, npart), lat=0.5 * np.ones(npart)) pset.set_variable_write_status("z", False) pset.set_variable_write_status("lat", False) - pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) + pfile = pset.ParticleFile(tmp_parquet, outputdt=1) def Update_lon(particles, fieldset): # pragma: no cover particles.dlon += 0.1 pset.execute(Update_lon, runtime=10, output_file=pfile) - ds = xr.open_zarr(tmp_zarrfile) + ds = xr.open_zarr(tmp_parquet) assert "time" in ds assert "z" not in ds assert "lat" not in ds diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 95e517140..04591019b 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -11,19 +11,19 @@ ) -@pytest.mark.uses_old_zarr + @pytest.mark.parametrize("integrator", [AdvectionEE, AdvectionRK2, AdvectionRK4]) -def test_ux_constant_flow_face_centered_2D(integrator, tmp_zarrfile): +def test_ux_constant_flow_face_centered_2D(integrator, tmp_parquet): ds = datasets_unstructured["ux_constant_flow_face_centered_2D"] T = np.timedelta64(3600, "s") dt = np.timedelta64(300, "s") fieldset = parcels.FieldSet.from_ugrid_conventions(ds, mesh="flat") pset = parcels.ParticleSet(fieldset, lon=[5.0], lat=[5.0]) - pfile = parcels.ParticleFile(store=tmp_zarrfile, outputdt=dt) + pfile = parcels.ParticleFile(store=tmp_parquet, outputdt=dt) pset.execute(integrator, runtime=T, dt=dt, output_file=pfile, verbose_progress=False) expected_lon = 8.6 np.testing.assert_allclose(pset.lon, expected_lon, atol=1e-5) - ds_out = xr.open_zarr(tmp_zarrfile) - np.testing.assert_allclose(ds_out["lon"][:, -1], expected_lon, atol=1e-5) + df = xr.open_zarr(tmp_parquet) + np.testing.assert_allclose(df["lon"][:, -1], expected_lon, atol=1e-5) From 5daec3bc131dfb1f2ee43e299c0d2b40b52cdb8a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:33:49 +0200 Subject: [PATCH 06/69] Remove test_variable_write_double Covered by test_write_dtypes_pfile --- tests/test_particlefile.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 67775fa30..eeade646f 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -142,22 +142,6 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): assert np.all(np.isnan(ds["time"][:, 1:])) -@pytest.mark.uses_old_zarr -def test_variable_write_double(fieldset, tmp_zarrfile): - def Update_lon(particles, fieldset): # pragma: no cover - particles.dlon += 0.1 - - dt = np.timedelta64(1, "s") - particle = get_default_particle(np.float64) - pset = ParticleSet(fieldset, pclass=particle, lon=[0], lat=[0]) - ofile = ParticleFile(tmp_zarrfile, outputdt=dt) - pset.execute(Update_lon, runtime=np.timedelta64(10, "s"), dt=dt, output_file=ofile) - - ds = xr.open_zarr(tmp_zarrfile) - lons = ds["lon"][:] - assert isinstance(lons.values[0, 0], np.float64) - - def test_write_dtypes_pfile(fieldset, tmp_parquet): dtypes = [ From 2a07ced6996d5966a68ab005ca403128a8b6d0e5 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 17 Apr 2026 19:25:35 +0200 Subject: [PATCH 07/69] Fixing tests --- src/parcels/_core/particlefile.py | 11 ++- tests-v3/test_advection.py | 1 + tests-v3/test_fieldset_sampling.py | 1 + tests/test_fieldset.py | 1 + 
tests/test_particlefile.py | 116 ++++++++--------------------- 5 files changed, 39 insertions(+), 91 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 125a925c7..8d121e55d 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -9,6 +9,7 @@ import cftime import numpy as np import pyarrow as pa +from parcels._typing import PathLike import pyarrow.parquet as pq import parcels @@ -58,22 +59,24 @@ class ParticleFile: ParticleFile object that can be used to write particle data to file """ - def __init__(self, path: Path, outputdt): + def __init__(self, path: PathLike, outputdt): if not isinstance(outputdt, (np.timedelta64, timedelta, float)): raise ValueError( f"Expected outputdt to be a np.timedelta64, datetime.timedelta or float (in seconds), got {type(outputdt)}" ) outputdt = timedelta_to_float(outputdt) + path = Path(path) + + if path.suffix != ".parquet": + raise ValueError(f"ParticleFile data is stored in Parquet files - file extension must be '.parquet'. Got {path.suffix=!r}.") if outputdt <= 0: raise ValueError(f"outputdt must be positive/non-zero. Got {outputdt=!r}") self._outputdt = outputdt - self._path = Path( - path - ) # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? + self._path = path # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? self._writer: pq.ParquetWriter | None = None if path.exists(): # TODO: Add logic for recovering/appending to existing parquet file diff --git a/tests-v3/test_advection.py b/tests-v3/test_advection.py index 77abee900..bdd7a4221 100644 --- a/tests-v3/test_advection.py +++ b/tests-v3/test_advection.py @@ -1,5 +1,6 @@ import numpy as np import pytest +import pandas as pd import xarray as xr from parcels import ( diff --git a/tests-v3/test_fieldset_sampling.py b/tests-v3/test_fieldset_sampling.py index 5f25e355c..176eedab1 100644 --- a/tests-v3/test_fieldset_sampling.py +++ b/tests-v3/test_fieldset_sampling.py @@ -3,6 +3,7 @@ from math import cos, pi import numpy as np +import pandas as pd import pytest import xarray as xr diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index d9b8d6000..8a64546f8 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -4,6 +4,7 @@ import cftime import numpy as np import pytest +import pandas as pd import xarray as xr from parcels import Field, ParticleFile, ParticleSet, VectorField, XGrid, convert diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index eeade646f..397ec6218 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -2,7 +2,8 @@ import tempfile from contextlib import nullcontext as does_not_raise from datetime import datetime, timedelta - +import pyarrow.parquet as pq +import pandas as pd import numpy as np import pyarrow as pa import pytest @@ -53,28 +54,8 @@ def test_metadata(fieldset, tmp_parquet): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_parquet) - assert ds.attrs["parcels_kernels"].lower() == "DoNothing".lower() - - - -def test_pfile_array_write_zarr_memorystore(fieldset): - """Check that writing to a Zarr 
MemoryStore works.""" - npart = 10 - zarr_store = MemoryStore() - pset = ParticleSet( - fieldset, - pclass=Particle, - lon=np.linspace(0, 1, npart), - lat=0.5 * np.ones(npart), - time=fieldset.time_interval.left, - ) - pfile = ParticleFile(zarr_store, outputdt=np.timedelta64(1, "s")) - pfile.write(pset, time=fieldset.time_interval.left) - - ds = xr.open_zarr(zarr_store) - assert ds.sizes["trajectory"] == npart - + tab = pq.read_table(tmp_parquet) + assert tab.schema.metadata[b"parcels_kernels"].decode().lower() == "DoNothing".lower() def test_write_fieldset_without_time(tmp_parquet): @@ -88,12 +69,14 @@ def test_write_fieldset_without_time(tmp_parquet): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_parquet) - assert ds.time.values[0, 1] == np.timedelta64(1, "s") - + df = pd.read_parquet(tmp_parquet) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") + assert df['time'][1] == np.timedelta64(1, "s") +@pytest.mark.xfail("Keep or remove? Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") def test_pfile_array_remove_particles(fieldset, tmp_parquet): + """If a particle from the middle of a particleset is removed, that writing doesn't crash""" npart = 10 pset = ParticleSet( fieldset, @@ -115,8 +98,7 @@ def test_pfile_array_remove_particles(fieldset, tmp_parquet): -@pytest.mark.parametrize("chunks_obs", [1, None]) -def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): +def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): npart = 10 pset = ParticleSet( fieldset, @@ -125,21 +107,17 @@ def test_pfile_array_remove_all_particles(fieldset, chunks_obs, tmp_parquet): lat=0.5 * np.ones(npart), time=fieldset.time_interval.left, ) - chunks = (npart, chunks_obs) if chunks_obs else None - pfile = ParticleFile(tmp_parquet, chunks=chunks, outputdt=np.timedelta64(1, "s")) + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pfile.write(pset, time=0) for _ in range(npart): pset.remove_indices(-1) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(1, "D")) pfile.write(pset, fieldset.time_interval.left + np.timedelta64(2, "D")) + pfile.close() - ds = xr.open_zarr(tmp_parquet) - np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) - if chunks_obs is not None: - assert ds["time"][:].shape == chunks - else: - assert ds["time"][:].shape[0] == npart - assert np.all(np.isnan(ds["time"][:, 1:])) + df = pd.read_parquet(tmp_parquet) + # np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) # TODO: Need to figure out how times work with parquet output (#2386) + assert df['trajectory'].nunique() == npart @@ -164,12 +142,11 @@ def test_write_dtypes_pfile(fieldset, tmp_parquet): pset = ParticleSet(fieldset, pclass=MyParticle, lon=0, lat=0, time=fieldset.time_interval.left) pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pfile.write(pset, time=fieldset.time_interval.left) + pfile.close() - ds = xr.open_zarr( - tmp_parquet, mask_and_scale=False - ) # Note masking issue at https://stackoverflow.com/questions/68460507/xarray-loading-int-data-as-float + tab = pq.read_table(tmp_parquet) for d in dtypes: - assert ds[f"v_{d.__name__}"].dtype == d + assert tab[f"v_{d.__name__}"].type == pa.from_numpy_dtype(d) def test_variable_written_once(): @@ -196,7 +173,7 @@ def 
test_pset_repeated_release_delayed_adding_deleting(fieldset, tmp_parquet, dt pclass=MyParticle, time=fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(npart)], ) - pfile = ParticleFile(tmp_parquet, outputdt=abs(dt), chunks=(1, 1)) + pfile = ParticleFile(tmp_parquet, outputdt=abs(dt)) def IncrLon(particles, fieldset): # pragma: no cover particles.sample_var += 1.0 @@ -387,7 +364,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg ) with tempfile.TemporaryDirectory() as dir: - name = f"{dir}/test.zarr" + name = f"{dir}/tmp.parquet" output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) @@ -455,59 +432,24 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) -def test_particlefile_init(tmp_store): - ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=(1, 3)) +def test_particlefile_init(tmp_parquet): + ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) -@pytest.mark.parametrize("name", ["store", "outputdt", "chunks", "create_new_zarrfile"]) -def test_particlefile_readonly_attrs(tmp_store, name): - pfile = ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=(1, 3)) +@pytest.mark.parametrize("name", ["path", "outputdt"]) +def test_particlefile_readonly_attrs(tmp_parquet, name): + pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) with pytest.raises(AttributeError, match="property .* of 'ParticleFile' object has no setter"): setattr(pfile, name, "something") -def test_particlefile_init_invalid(tmp_store): # TODO: Add test for read only store - with pytest.raises(ValueError, match="chunks must be a tuple"): - ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=1) +def test_particlefile_init_invalid(tmp_path): + path = tmp_path / 'file.not-parquet' + with pytest.raises(ValueError, match="file extension must be '.parquet'"): + ParticleFile(path, outputdt=np.timedelta64(1, "s")) -def test_particlefile_write_particle_data(tmp_store): - nparticles = 100 - - pfile = ParticleFile(tmp_store, outputdt=np.timedelta64(1, "s"), chunks=(nparticles, 40)) - pclass = Particle - - left, right = np.datetime64("2019-05-30T12:00:00.000000000", "ns"), np.datetime64("2020-01-02", "ns") - time_interval = TimeInterval(left=left, right=right) - - initial_lon = np.linspace(0, 1, nparticles) - data = create_particle_data( - pclass=pclass, - nparticles=nparticles, - ngrids=4, - time_interval=time_interval, - initial={ - "time": np.full(nparticles, fill_value=0), - "lon": initial_lon, - "dt": np.full(nparticles, fill_value=1.0), - "trajectory": np.arange(nparticles), - }, - ) - np.testing.assert_array_equal(data["time"], 0) - pfile._write_particle_data( - particle_data=data, - pclass=pclass, - time_interval=time_interval, - time=left, - ) - ds = xr.open_zarr(tmp_store) - assert ds.time.dtype == "datetime64[ns]" - np.testing.assert_equal(ds["time"].isel(obs=0).values, left) - assert ds.sizes["trajectory"] == nparticles - np.testing.assert_allclose(ds["lon"].isel(obs=0).values, initial_lon) - - def test_pfile_write_custom_particle(): # Test the writing of a custom particle with variables that are to_write, some to_write once, and some not to_write # ? This is more of an integration test... Should it be housed here? 
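The read-back pattern these migrated tests converge on uses pyarrow for schema-level assertions and pandas for row-level ones. A minimal standalone sketch of the two entry points, assuming a Parquet file already written by ParticleFile (the "output.parquet" path and the printed values are illustrative only):

    import pandas as pd
    import pyarrow.parquet as pq

    # Schema level: pyarrow exposes file- and field-level metadata as
    # bytes keys/values; this is where ParticleFile records e.g. the
    # kernel names under b"parcels_kernels".
    table = pq.read_table("output.parquet")
    kernels = (table.schema.metadata or {}).get(b"parcels_kernels", b"").decode()

    # Row level: the output is long-format (one row per particle per
    # write), so per-particle properties come via the trajectory column.
    df = pd.read_parquet("output.parquet")
    print(kernels, df["trajectory"].nunique())
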
From b8c547761a729ce0503a5986b7f5307ac472d737 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:11:50 +0200 Subject: [PATCH 08/69] More test fixing --- tests/test_particlefile.py | 64 +++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 397ec6218..a3fe47b00 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -47,7 +47,6 @@ def fieldset() -> FieldSet: # TODO v4: Move into a `conftest.py` file and remov ) - def test_metadata(fieldset, tmp_parquet): pset = ParticleSet(fieldset, pclass=Particle, lon=0, lat=0) @@ -71,10 +70,10 @@ def test_write_fieldset_without_time(tmp_parquet): df = pd.read_parquet(tmp_parquet) pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert df['time'][1] == np.timedelta64(1, "s") + assert df["time"][1] == np.timedelta64(1, "s") -@pytest.mark.xfail("Keep or remove? Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") +@pytest.mark.skip("Keep or remove? Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") def test_pfile_array_remove_particles(fieldset, tmp_parquet): """If a particle from the middle of a particleset is removed, that writing doesn't crash""" npart = 10 @@ -97,7 +96,6 @@ def test_pfile_array_remove_particles(fieldset, tmp_parquet): assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) - def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): npart = 10 pset = ParticleSet( @@ -117,8 +115,7 @@ def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): df = pd.read_parquet(tmp_parquet) # np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) # TODO: Need to figure out how times work with parquet output (#2386) - assert df['trajectory'].nunique() == npart - + assert df["trajectory"].nunique() == npart def test_write_dtypes_pfile(fieldset, tmp_parquet): @@ -196,7 +193,6 @@ def IncrLon(particles, fieldset): # pragma: no cover assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB - def test_file_warnings(fieldset, tmp_parquet): pset = ParticleSet(fieldset, lon=[0, 0], lat=[0, 0], time=[np.timedelta64(0, "s"), np.timedelta64(1, "s")]) pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(2, "s")) @@ -221,22 +217,22 @@ def test_outputdt_types(outputdt, expectation, tmp_parquet): assert pfile.outputdt == timedelta_to_float(outputdt) - def test_write_timebackward(fieldset, tmp_parquet): release_time = fieldset.time_interval.left + [np.timedelta64(i + 1, "s") for i in range(3)] pset = ParticleSet(fieldset, lat=[0, 1, 2], lon=[0, 0, 0], time=release_time) pfile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(3, "s"), dt=-np.timedelta64(1, "s"), output_file=pfile) - ds = xr.open_zarr(tmp_parquet) - trajs = ds["trajectory"][:] - - output_time = ds["time"][:].values + df = pd.read_parquet(tmp_parquet) - assert trajs.values.dtype == "int64" - assert np.all(np.diff(trajs.values) < 0) # all particles written in order of release - doutput_time = np.diff(output_time, axis=1) - assert np.all(doutput_time[~np.isnan(doutput_time)] < 0) # all times written in decreasing order + assert df["trajectory"].dtype == "int64" + assert bool( + df.groupby("trajectory") + .apply( + lambda x: (np.diff(x["time"]) < 0).all() # for each particle - set True if it has decreasing time + ) + 
.all() # ensure for all particles + ) @pytest.mark.xfail @@ -294,7 +290,6 @@ def SampleP(particles, fieldset): # pragma: no cover assert fieldset.U.grid.lat[yi] <= lat < fieldset.U.grid.lat[yi + 1] - @pytest.mark.parametrize("outputdt", [np.timedelta64(1, "s"), np.timedelta64(2, "s"), np.timedelta64(3, "s")]) def test_time_is_age(fieldset, tmp_parquet, outputdt): # Test that particle age is same as time - initial_time @@ -311,6 +306,7 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") ds = xr.open_zarr(tmp_parquet) age = ds["age"][:].values.astype("timedelta64[s]") ds_timediff = np.zeros_like(age) @@ -319,7 +315,6 @@ def IncreaseAge(particles, fieldset): # pragma: no cover np.testing.assert_equal(age, ds_timediff) - def test_reset_dt(fieldset, tmp_parquet): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions @@ -336,7 +331,6 @@ def Update_lon(particles, fieldset): # pragma: no cover assert np.allclose(pset.lon, 0.6) - def test_correct_misaligned_outputdt_dt(fieldset, tmp_parquet): """Testing that outputdt does not need to be a multiple of dt.""" @@ -348,9 +342,10 @@ def Update_lon(particles, fieldset): # pragma: no cover ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(3, "s")) pset.execute(Update_lon, runtime=np.timedelta64(11, "s"), dt=np.timedelta64(2, "s"), output_file=ofile) - ds = xr.open_zarr(tmp_parquet) - assert np.allclose(ds.lon.values, [0, 3, 6, 9]) - assert np.allclose(timedelta_to_float(ds.time.values - ds.time.values[0, 0]), [0, 3, 6, 9]) + df = pd.read_parquet(tmp_parquet) + assert np.allclose(df['lon'].values, [0, 3, 6, 9]) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") + assert np.allclose(timedelta_to_float(df.time.values - df.time.values[0, 0]), [0, 3, 6, 9]) def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwargs, particle_class=Particle): @@ -368,10 +363,9 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) - ds = xr.open_zarr(name).load() - - return ds + df = pd.read_parquet(name) + return df def test_pset_execute_outputdt_forwards(fieldset): @@ -380,39 +374,37 @@ def test_pset_execute_outputdt_forwards(fieldset): runtime = timedelta(hours=5) dt = timedelta(minutes=5) - ds = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) - + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) - def test_pset_execute_output_time_forwards(fieldset): """Testing output times start at initial time and end at initial time + runtime.""" outputdt = np.timedelta64(1, "h") runtime = np.timedelta64(5, "h") dt = np.timedelta64(5, "m") - ds = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) - + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, 
execute_kwargs=dict(runtime=runtime, dt=dt)) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert ( ds.time[0, 0].values == fieldset.time_interval.left and ds.time[0, -1].values == fieldset.time_interval.left + runtime ) - def test_pset_execute_outputdt_backwards(fieldset): """Testing output data dt matches outputdt in backwards time.""" outputdt = timedelta(hours=1) runtime = timedelta(days=2) dt = -timedelta(minutes=5) - ds = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values assert np.all(file_outputdt == np.timedelta64(-outputdt)) - def test_pset_execute_outputdt_backwards_fieldset_timevarying(): """test_pset_execute_outputdt_backwards() still passed despite #1722 as it doesn't account for time-varying fields, which for some reason #1722 @@ -427,7 +419,8 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): ds_fset = copernicusmarine_to_sgrid(fields=fields) fieldset = FieldSet.from_sgrid_conventions(ds_fset) - ds = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) @@ -444,12 +437,11 @@ def test_particlefile_readonly_attrs(tmp_parquet, name): def test_particlefile_init_invalid(tmp_path): - path = tmp_path / 'file.not-parquet' + path = tmp_path / "file.not-parquet" with pytest.raises(ValueError, match="file extension must be '.parquet'"): ParticleFile(path, outputdt=np.timedelta64(1, "s")) - def test_pfile_write_custom_particle(): # Test the writing of a custom particle with variables that are to_write, some to_write once, and some not to_write # ? This is more of an integration test... Should it be housed here? 
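Because the Parquet output is long-format rather than a (trajectory, obs) array, per-trajectory assertions like the one in test_write_timebackward reduce to a groupby. A self-contained sketch of that idiom with made-up data (the inline frame stands in for a pd.read_parquet result):

    import numpy as np
    import pandas as pd

    # Two trajectories, each written with strictly decreasing times,
    # as happens when executing with a negative dt.
    df = pd.DataFrame(
        {
            "trajectory": [0, 0, 0, 1, 1, 1],
            "time": [3.0, 2.0, 1.0, 4.0, 3.0, 2.0],
        }
    )

    # Per trajectory: is the time column strictly decreasing?
    decreasing = df.groupby("trajectory")["time"].apply(
        lambda t: bool((np.diff(t) < 0).all())
    )

    # Then require it for every trajectory, as the test does.
    assert bool(decreasing.all())
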
From 2d438c5f7b9cad3e27264b51f30334d571328af3 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:41:07 +0200 Subject: [PATCH 09/69] Fix last tests --- tests/test_fieldset.py | 2 +- tests/test_uxadvection.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index 8a64546f8..c7131608d 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -108,7 +108,7 @@ def SampleP(particles, fieldset): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(SampleP, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - df = xr.open_zarr(tmp_parquet) + df = pd.read_parquet(tmp_parquet) assert len(df["lon"]) == 2 diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 04591019b..52509ba28 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -9,6 +9,7 @@ AdvectionRK2, AdvectionRK4, ) +import pandas as pd @@ -20,10 +21,10 @@ def test_ux_constant_flow_face_centered_2D(integrator, tmp_parquet): fieldset = parcels.FieldSet.from_ugrid_conventions(ds, mesh="flat") pset = parcels.ParticleSet(fieldset, lon=[5.0], lat=[5.0]) - pfile = parcels.ParticleFile(store=tmp_parquet, outputdt=dt) + pfile = parcels.ParticleFile(path=tmp_parquet, outputdt=dt) pset.execute(integrator, runtime=T, dt=dt, output_file=pfile, verbose_progress=False) expected_lon = 8.6 np.testing.assert_allclose(pset.lon, expected_lon, atol=1e-5) - df = xr.open_zarr(tmp_parquet) - np.testing.assert_allclose(df["lon"][:, -1], expected_lon, atol=1e-5) + df = pd.read_parquet(tmp_parquet) + np.testing.assert_allclose(df["lon"].iloc[-1], expected_lon, atol=1e-5) From 32a82fac4236a3824a7190d65b31b39f78cbbc00 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 20 Apr 2026 09:48:43 +0000 Subject: [PATCH 10/69] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/parcels/_core/particlefile.py | 10 ++++++---- src/parcels/_core/particleset.py | 2 +- src/parcels/_reprs.py | 2 -- tests/conftest.py | 4 ++-- tests/test_advection.py | 2 +- tests/test_fieldset.py | 3 +-- tests/test_particlefile.py | 12 ++++++------ tests/test_uxadvection.py | 4 +--- 8 files changed, 18 insertions(+), 21 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 8d121e55d..c9cdfa62a 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -9,13 +9,13 @@ import cftime import numpy as np import pyarrow as pa -from parcels._typing import PathLike import pyarrow.parquet as pq import parcels from parcels._core.particle import ParticleClass from parcels._core.utils.time import timedelta_to_float from parcels._reprs import particlefile_repr +from parcels._typing import PathLike if TYPE_CHECKING: from parcels._core.particle import Variable @@ -69,14 +69,16 @@ def __init__(self, path: PathLike, outputdt): path = Path(path) if path.suffix != ".parquet": - raise ValueError(f"ParticleFile data is stored in Parquet files - file extension must be '.parquet'. Got {path.suffix=!r}.") + raise ValueError( + f"ParticleFile data is stored in Parquet files - file extension must be '.parquet'. Got {path.suffix=!r}." + ) if outputdt <= 0: raise ValueError(f"outputdt must be positive/non-zero. 
Got {outputdt=!r}") self._outputdt = outputdt - self._path = path # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? + self._path = path # TODO v4: Consider https://arrow.apache.org/docs/python/getstarted.html#working-with-large-data - though a significant question becomes how to partition, perhaps using a particle variable "partition"? self._writer: pq.ParquetWriter | None = None if path.exists(): # TODO: Add logic for recovering/appending to existing parquet file @@ -138,7 +140,7 @@ def write(self, pset: ParticleSet, time, indices=None): indices_to_write = _to_write_particles(particle_data, time) else: indices_to_write = indices - + self._writer.write_table( pa.table({v.name: pa.array(particle_data[v.name][indices_to_write]) for v in vars_to_write}), ) diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index e4ecb252b..f2e74112f 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -453,7 +453,7 @@ def execute( if output_file is not None: output_file.close() - + if verbose_progress: pbar.close() diff --git a/src/parcels/_reprs.py b/src/parcels/_reprs.py index 34b6814a0..d27eee379 100644 --- a/src/parcels/_reprs.py +++ b/src/parcels/_reprs.py @@ -7,7 +7,6 @@ import numpy as np import xarray as xr -from zarr.storage import DirectoryStore if TYPE_CHECKING: from parcels import Field, FieldSet, ParticleSet @@ -178,6 +177,5 @@ def _format_list_items_multiline(items: list[str] | dict, level: int = 1, with_b return "\n".join([textwrap.indent(e, indentation_str) for e in entries]) - def is_builtin_object(obj): return obj.__class__.__module__ == "builtins" diff --git a/tests/conftest.py b/tests/conftest.py index 56bbfa480..1853f9774 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,7 +12,7 @@ def tmp_parquet(tmp_path, request): def tmp_store(): return MemoryStore() + @pytest.fixture def tmp_parquet(tmp_path): - return tmp_path / 'tmp.parquet' - + return tmp_path / "tmp.parquet" diff --git a/tests/test_advection.py b/tests/test_advection.py index 365e9e6f4..95eca30f3 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -1,10 +1,10 @@ import numpy as np +import pandas as pd import pytest import xarray as xr import parcels import parcels.tutorial -import pandas as pd from parcels import ( Field, FieldSet, diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index c7131608d..6eeef20a6 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -3,8 +3,8 @@ import cf_xarray # noqa: F401 import cftime import numpy as np -import pytest import pandas as pd +import pytest import xarray as xr from parcels import Field, ParticleFile, ParticleSet, VectorField, XGrid, convert @@ -96,7 +96,6 @@ def test_fieldset_gridset(fieldset): assert len(fieldset.gridset) == 2 - def test_fieldset_no_UV(tmp_parquet): grid = XGrid.from_dataset(ds, mesh="flat") fieldset = FieldSet([Field("P", ds["U_A_grid"], grid, interp_method=XLinear)]) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index a3fe47b00..70ef5b505 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -2,13 +2,13 @@ import tempfile from contextlib import nullcontext as does_not_raise from datetime import datetime, timedelta -import pyarrow.parquet as pq -import pandas as pd + import numpy as np +import pandas as pd import pyarrow as pa +import pyarrow.parquet as pq 
import pytest import xarray as xr -from zarr.storage import MemoryStore import parcels.tutorial from parcels import ( @@ -22,9 +22,9 @@ VectorField, XGrid, ) -from parcels._core.particle import Particle, create_particle_data, get_default_particle +from parcels._core.particle import Particle, get_default_particle from parcels._core.particlefile import _get_schema -from parcels._core.utils.time import TimeInterval, timedelta_to_float +from parcels._core.utils.time import timedelta_to_float from parcels._datasets.structured.generated import peninsula_dataset from parcels._datasets.structured.generic import datasets from parcels.convert import copernicusmarine_to_sgrid @@ -343,7 +343,7 @@ def Update_lon(particles, fieldset): # pragma: no cover pset.execute(Update_lon, runtime=np.timedelta64(11, "s"), dt=np.timedelta64(2, "s"), output_file=ofile) df = pd.read_parquet(tmp_parquet) - assert np.allclose(df['lon'].values, [0, 3, 6, 9]) + assert np.allclose(df["lon"].values, [0, 3, 6, 9]) pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert np.allclose(timedelta_to_float(df.time.values - df.time.values[0, 0]), [0, 3, 6, 9]) diff --git a/tests/test_uxadvection.py b/tests/test_uxadvection.py index 52509ba28..d3db9aecd 100644 --- a/tests/test_uxadvection.py +++ b/tests/test_uxadvection.py @@ -1,6 +1,6 @@ import numpy as np +import pandas as pd import pytest -import xarray as xr import parcels from parcels._datasets.unstructured.generic import datasets as datasets_unstructured @@ -9,8 +9,6 @@ AdvectionRK2, AdvectionRK4, ) -import pandas as pd - @pytest.mark.parametrize("integrator", [AdvectionEE, AdvectionRK2, AdvectionRK4]) From 19fbd8dddac5272915a4d9a1d343ecb47a779b41 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:51:15 +0200 Subject: [PATCH 11/69] Remove old fixtures --- tests/conftest.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 1853f9774..0fd949880 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,4 @@ import pytest -from zarr.storage import MemoryStore - - -@pytest.fixture() -def tmp_parquet(tmp_path, request): - test_name = request.node.name - yield tmp_path / f"{test_name}-output.zarr" - - -@pytest.fixture -def tmp_store(): - return MemoryStore() @pytest.fixture From e74672fefe40d7e75d67b256748df0ea47746f22 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 11:52:04 +0200 Subject: [PATCH 12/69] Fix pre-commit errors --- tests/test_particlefile.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 70ef5b505..f2a3c4553 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -374,9 +374,9 @@ def test_pset_execute_outputdt_forwards(fieldset): runtime = timedelta(hours=5) dt = timedelta(minutes=5) - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) + assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) # noqa: F821 def 
test_pset_execute_output_time_forwards(fieldset): @@ -385,11 +385,11 @@ def test_pset_execute_output_time_forwards(fieldset): runtime = np.timedelta64(5, "h") dt = np.timedelta64(5, "m") - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") assert ( - ds.time[0, 0].values == fieldset.time_interval.left - and ds.time[0, -1].values == fieldset.time_interval.left + runtime + ds.time[0, 0].values == fieldset.time_interval.left # noqa: F821 + and ds.time[0, -1].values == fieldset.time_interval.left + runtime # noqa: F821 ) @@ -399,9 +399,9 @@ def test_pset_execute_outputdt_backwards(fieldset): runtime = timedelta(days=2) dt = -timedelta(minutes=5) - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values + file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 assert np.all(file_outputdt == np.timedelta64(-outputdt)) @@ -419,9 +419,9 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): ds_fset = copernicusmarine_to_sgrid(fields=fields) fieldset = FieldSet.from_sgrid_conventions(ds_fset) - df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) # noqa: F841 pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values + file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) From db9f9834296dfab2804a782a0ad28e936cfe0188 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 12:08:54 +0200 Subject: [PATCH 13/69] Cleanup This mark was only introduced during refactoring --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d0e6f7ba1..85aba3a67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,6 @@ markers = [ # can be skipped by doing `pytest -m "not slow"` etc. 
"v4alpha: failing tests that should work for v4alpha", "v4future: failing tests that should work for a future release of v4", "v4remove: failing tests that should probably be removed later", - "uses_old_zarr: tests that need to be migrated to the new particleset format", ] filterwarnings = [ From ac2a8309f197f4a98121d62b3ea8447791685086 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 20 Apr 2026 13:56:37 +0200 Subject: [PATCH 14/69] Add pandas and pyarrow as explicit dependencies --- .github/ci/recipe.yaml | 2 ++ pixi.toml | 4 ++++ pyproject.toml | 2 ++ 3 files changed, 8 insertions(+) diff --git a/.github/ci/recipe.yaml b/.github/ci/recipe.yaml index 2a887e8b0..388c60b2d 100644 --- a/.github/ci/recipe.yaml +++ b/.github/ci/recipe.yaml @@ -36,6 +36,8 @@ requirements: - numpy >=2.1.0 - tqdm >=4.50.0 - xarray >=2025.8.0,<2026.4.0 # TODO: remove upper pin when https://github.com/UXARRAY/uxarray/issues/1490 is resolved + - pandas >=2.2 + - pyarrow >=20.0.0 - cf_xarray >=0.8.6 - xgcm >=0.9.0 - zarr >=2.15.0,!=2.18.0,<3 diff --git a/pixi.toml b/pixi.toml index a71be98ff..286d7f28c 100644 --- a/pixi.toml +++ b/pixi.toml @@ -24,6 +24,8 @@ netcdf4 = ">=1.6.0" numpy = ">=2.1.0" tqdm = ">=4.50.0" xarray = ">=2024.5.0,<2026.4.0" # TODO: remove upper pin when https://github.com/UXARRAY/uxarray/issues/1490 is resolved +pandas = ">=2.2" +pyarrow = ">=20.0.0" holoviews = ">=1.22.0" # https://github.com/prefix-dev/rattler-build/issues/2326 uxarray = ">=2025.3.0" dask = ">=2024.5.1" @@ -51,6 +53,8 @@ netcdf4 = "1.6.*" numpy = "2.1.*" tqdm = "4.50.*" xarray = "2025.8.*" +pandas = "2.2.*" +pyarrow = "20.0.*" uxarray = "2025.3.*" dask = "2024.6.*" zarr = "2.18.*" diff --git a/pyproject.toml b/pyproject.toml index 85aba3a67..072da3b77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,8 @@ dependencies = [ "zarr >=2.15.0,!=2.18.0,<3", "tqdm >=4.50.0", "xarray >=2024.5.0,<2026.4.0", # TODO: remove upper pin when https://github.com/UXARRAY/uxarray/issues/1490 is resolved + "pandas >= 2.2", + "pyarrow >=20.0.0", "uxarray >=2025.3.0", "pooch >=1.8.0", "xgcm >=0.9.0", From de464e561ca97627b0003cb54a1a112e44ffb707 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 13:47:35 +0200 Subject: [PATCH 15/69] Add assert_cftime_like_particlefile Remove temporary test_cftime.py file --- tests/test_utils.py | 29 +++++++++++++++++++++++++++++ tests/utils.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/tests/test_utils.py b/tests/test_utils.py index b42e13330..65ce4ea7a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,9 @@ +import json + import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq +import pytest from tests import utils @@ -17,3 +22,27 @@ def test_round_and_hash_float_array(): arr_test = arr + 0.51 * delta h3 = utils.round_and_hash_float_array(arr_test, decimals=decimals) assert h3 != h + + +@pytest.mark.parametrize("cal", ["julian", "proleptic_gregorian", "365_day", "366_day", "360_day"]) +def test_assert_cftime_like_particlefile(tmp_path, cal): + path = tmp_path / "test.parquet" + attrs = {"units": "seconds since 2000-01-01 17:00:00", "calendar": cal} + field = pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + schema = pa.schema([field]) + table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) + pq.write_table(table, path) + + utils.assert_cftime_like_particlefile(path) + + +def 
test_assert_cftime_like_particlefile_broken_parquet(tmp_path): + path = tmp_path / "test.parquet" + attrs = {"units": "broken-units", "calendar": "365_day"} + field = pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + schema = pa.schema([field]) + table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) + pq.write_table(table, path) + + with pytest.raises(Exception, match="CF-time values in Parquet did not get properly decoded"): + utils.assert_cftime_like_particlefile(path) diff --git a/tests/utils.py b/tests/utils.py index 3213abd31..d20eaf845 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -151,3 +151,34 @@ def round_and_hash_float_array(arr, decimals=6): # Mimic Java's HashMap hash transformation h ^= (h >> 20) ^ (h >> 12) return h ^ (h >> 7) ^ (h >> 4) + + +def assert_cftime_like_particlefile(parquet_path: Path) -> None: + import json + + import cftime + import pyarrow as pa + import pyarrow.parquet as pq + + assert parquet_path.suffix == ".parquet", "Path must be a parquet file" + + table = pq.read_table(parquet_path) + assert "time" in table.schema.names, "Parquet file must have a 'time' column" + + time_field = table.schema.field("time") + assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( + f"'time' column must be numeric, got {time_field.type}" + ) + + raw_meta = time_field.metadata + attrs = json.loads(raw_meta[b"attrs"]) + + values = table.column("time").to_pylist() + v = xr.Variable(("time",), values, attrs) + decoded = xr.coders.CFDatetimeCoder(time_unit="s").decode(v) + + # check first value (and hence rest of array) is what we expect + assert isinstance(decoded.values[0], (cftime.datetime, np.datetime64)), ( + "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" 
+    )
+    return


From 57ccf6f9b49d20417dfe4972589b1ba2d2df6995 Mon Sep 17 00:00:00 2001
From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com>
Date: Mon, 20 Apr 2026 14:31:47 +0200
Subject: [PATCH 16/69] MAINT: Cleanup create_particle_data

This function is now independent of the time_interval, as time is now
stored as a float.
---
 src/parcels/_core/particle.py    | 19 +++++--------------
 src/parcels/_core/particleset.py |  1 -
 tests/test_particle.py           |  5 +----
 3 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/src/parcels/_core/particle.py b/src/parcels/_core/particle.py
index dc39a063c..0e1be1792 100644
--- a/src/parcels/_core/particle.py
+++ b/src/parcels/_core/particle.py
@@ -8,7 +8,6 @@ from parcels._compat import _attrgetter_helper
 from parcels._core.statuscodes import StatusCode
 from parcels._core.utils.string import _assert_str_and_python_varname
-from parcels._core.utils.time import TimeInterval
 from parcels._reprs import particleclass_repr, variable_repr

 __all__ = ["Particle", "ParticleClass", "Variable"]

@@ -176,7 +175,6 @@ def create_particle_data(
     pclass: ParticleClass,
     nparticles: int,
     ngrids: int,
-    time_interval: TimeInterval,
     initial: dict[str, np.ndarray] | None = None,
 ):
     if initial is None:
@@ -207,16 +205,9 @@
             name_to_copy = var.initial(_attrgetter_helper)
             data[var.name] = data[name_to_copy].copy()
         else:
-            data[var.name] = _create_array_for_variable(var, nparticles, time_interval)
+            data[var.name] = np.full(
+                shape=(nparticles,),
+                fill_value=var.initial,
+                dtype=var.dtype,
+            )
     return data
-
-
-def _create_array_for_variable(variable: Variable, nparticles: int, time_interval: TimeInterval):
-    assert not isinstance(variable.initial, operator.attrgetter), (
-        "This function cannot handle attrgetter initial values."
- ) - return np.full( - shape=(nparticles,), - fill_value=variable.initial, - dtype=variable.dtype, - ) diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index f2e74112f..e59e2860b 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -111,7 +111,6 @@ def __init__( pclass=pclass, nparticles=lon.size, ngrids=len(fieldset.gridset), - time_interval=fieldset.time_interval, initial=dict( lon=lon, lat=lat, diff --git a/tests/test_particle.py b/tests/test_particle.py index dabe6944c..62eb65cff 100644 --- a/tests/test_particle.py +++ b/tests/test_particle.py @@ -7,8 +7,6 @@ Variable, create_particle_data, ) -from parcels._core.utils.time import TimeInterval -from parcels._datasets.structured.generic import TIME def test_variable_init(): @@ -140,9 +138,8 @@ def test_particleclass_add_variable_collision(): ) @pytest.mark.parametrize("nparticles", [5, 10]) def test_create_particle_data(particle, nparticles): - time_interval = TimeInterval(TIME[0], TIME[-1]) ngrids = 4 - data = create_particle_data(pclass=particle, nparticles=nparticles, ngrids=ngrids, time_interval=time_interval) + data = create_particle_data(pclass=particle, nparticles=nparticles, ngrids=ngrids) assert isinstance(data, dict) assert len(data) == len(particle.variables) + 1 # ei variable is separate From b2bde50c568a985d1c954e7da9a533d5a6f6e48a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:05:18 +0200 Subject: [PATCH 17/69] Add cftime metadata serialization --- src/parcels/_core/particle.py | 6 +++++- src/parcels/_core/particlefile.py | 26 ++++++++++++++++--------- src/parcels/_core/utils/time.py | 32 ++++++++++++++++++++++++++++++- tests/utils/test_time.py | 12 +++++++++++- 4 files changed, 64 insertions(+), 12 deletions(-) diff --git a/src/parcels/_core/particle.py b/src/parcels/_core/particle.py index 0e1be1792..5152b1130 100644 --- a/src/parcels/_core/particle.py +++ b/src/parcels/_core/particle.py @@ -148,7 +148,11 @@ def get_default_particle(spatial_dtype: type[np.float32] | type[np.float64]) -> Variable( "time", dtype=np.float64, - attrs={"standard_name": "time", "units": "seconds", "axis": "T"}, + attrs={ + "standard_name": "time", + "units": "seconds", + "axis": "T", + }, # "units" and "calendar" gets updated/set if working with cftime time domain ), Variable( "trajectory", diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index c9cdfa62a..439db2668 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -25,18 +25,24 @@ __all__ = ["ParticleFile"] -def _get_schema(particle: parcels.ParticleClass, file_metadata: dict[Any, Any]) -> pa.Schema: - return pa.schema( - [ +def _get_schema( + particle: parcels.ParticleClass, file_metadata: dict[Any, Any], fset_time_interval: TimeInterval | None +) -> pa.Schema: + + fields = [] + for v in _get_vars_to_write(particle): + attrs = v.attrs.copy() + if v.name == "time": + if fset_time_interval is not None: + attrs.update(fset_time_interval.get_cf_attrs()) + fields.append( pa.field( v.name, pa.from_numpy_dtype(v.dtype), - metadata=v.attrs, + metadata=attrs, ) - for v in _get_vars_to_write(particle) - ], - metadata=file_metadata.copy(), - ) + ) + return pa.schema(fields, metadata=file_metadata.copy()) class ParticleFile: @@ -131,7 +137,9 @@ def write(self, pset: ParticleSet, time, indices=None): if self._writer is None: assert not self.path.exists(), "If the file exists, the writer should already 
be set" - self._writer = pq.ParquetWriter(self.path, _get_schema(pclass, self.extra_metadata)) + self._writer = pq.ParquetWriter( + self.path, _get_schema(pclass, self.extra_metadata, pset.fieldset.time_interval) + ) if isinstance(time, (np.timedelta64, np.datetime64)): time = timedelta_to_float(time - time_interval.left) diff --git a/src/parcels/_core/utils/time.py b/src/parcels/_core/utils/time.py index b76473a3f..7b9299f3a 100644 --- a/src/parcels/_core/utils/time.py +++ b/src/parcels/_core/utils/time.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import datetime, timedelta -from typing import TYPE_CHECKING, TypeVar +from typing import TYPE_CHECKING, Literal, TypeVar, cast import cftime import numpy as np @@ -85,6 +85,36 @@ def intersection(self, other: TimeInterval) -> TimeInterval | None: return TimeInterval(start, end) if start <= end else None + def get_cf_attrs(self) -> dict[Literal["units", "calendar"], str]: + """Return the cf-attrs that would correspond to x seconds from the left edge.""" + return _get_cf_attrs(self.left) + + +def _get_cf_attrs(dt: TimeLike) -> dict[Literal["units", "calendar"], str]: + if isinstance(dt, cftime.datetime): + dt = cast(cftime.datetime, dt) + return {"units": f"seconds since {dt.strftime(dt.format)}", "calendar": dt.calendar} + + from pandas import Timestamp + + if isinstance(dt, np.datetime64): + dt = Timestamp(dt) + + if isinstance(dt, (Timestamp, datetime)): + dt_cf = cftime.datetime( + year=dt.year, + month=dt.month, + day=dt.day, + hour=dt.hour, + minute=dt.minute, + second=dt.second, + microsecond=dt.microsecond, + calendar="gregorian", # What is the cftime proleptic_gregorian calendar? is that relevant here? + ) + return _get_cf_attrs(dt_cf) + + raise NotImplementedError(f"Not implemented for time object {type(dt)=!r}") + def is_compatible( t1: datetime | cftime.datetime | np.timedelta64, t2: datetime | cftime.datetime | np.timedelta64 diff --git a/tests/utils/test_time.py b/tests/utils/test_time.py index ef1f39346..2734b3e91 100644 --- a/tests/utils/test_time.py +++ b/tests/utils/test_time.py @@ -8,7 +8,12 @@ from hypothesis import given from hypothesis import strategies as st -from parcels._core.utils.time import TimeInterval, maybe_convert_python_timedelta_to_numpy, timedelta_to_float +from parcels._core.utils.time import ( + TimeInterval, + _get_cf_attrs, + maybe_convert_python_timedelta_to_numpy, + timedelta_to_float, +) calendar_strategy = st.sampled_from( [ @@ -215,3 +220,8 @@ def test_timedelta_to_float(input, expected): def test_timedelta_to_float_exceptions(): with pytest.raises((ValueError, TypeError)): timedelta_to_float("invalid_type") + + +@given(datetime_strategy()) +def test_datetime_get_cf_attrs(dt): + _get_cf_attrs(dt) From 55493a9d8b5d4353ac90a3fc068a0b843bb76b3d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 15:58:26 +0200 Subject: [PATCH 18/69] Add np.timedelta64 support --- src/parcels/_core/utils/time.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/parcels/_core/utils/time.py b/src/parcels/_core/utils/time.py index 7b9299f3a..bdce7fc13 100644 --- a/src/parcels/_core/utils/time.py +++ b/src/parcels/_core/utils/time.py @@ -95,6 +95,9 @@ def _get_cf_attrs(dt: TimeLike) -> dict[Literal["units", "calendar"], str]: dt = cast(cftime.datetime, dt) return {"units": f"seconds since {dt.strftime(dt.format)}", "calendar": dt.calendar} + if isinstance(dt, np.timedelta64): + return {"units": "seconds"} + from pandas import Timestamp if 
isinstance(dt, np.datetime64): From bab4d5d7fc6f56d569b9a64513eb962f4ed243c6 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 16:04:20 +0200 Subject: [PATCH 19/69] Fix assert_cftime_like_particlefile Remove nested key - save on root instead --- tests/test_advection.py | 3 ++- tests/test_utils.py | 6 ++---- tests/utils.py | 4 +--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tests/test_advection.py b/tests/test_advection.py index 95eca30f3..e5c000132 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -37,7 +37,7 @@ AdvectionRK4_3D, AdvectionRK45, ) -from tests.utils import DEFAULT_PARTICLES +from tests.utils import DEFAULT_PARTICLES, assert_cftime_like_particlefile @pytest.mark.parametrize("mesh", ["spherical", "flat"]) @@ -76,6 +76,7 @@ def test_advection_zonal_with_particlefile(tmp_parquet): df = pd.read_parquet(tmp_parquet) final_time = df["time"].max() np.testing.assert_allclose(df[df["time"] == final_time]["lon"].values, pset.lon, atol=1e-5) + assert_cftime_like_particlefile(tmp_parquet) def periodicBC(particles, fieldset): diff --git a/tests/test_utils.py b/tests/test_utils.py index 65ce4ea7a..d8d695c18 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,5 +1,3 @@ -import json - import numpy as np import pyarrow as pa import pyarrow.parquet as pq @@ -28,7 +26,7 @@ def test_round_and_hash_float_array(): def test_assert_cftime_like_particlefile(tmp_path, cal): path = tmp_path / "test.parquet" attrs = {"units": "seconds since 2000-01-01 17:00:00", "calendar": cal} - field = pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + field = pa.field("time", pa.float64(), metadata=attrs) schema = pa.schema([field]) table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) pq.write_table(table, path) @@ -39,7 +37,7 @@ def test_assert_cftime_like_particlefile(tmp_path, cal): def test_assert_cftime_like_particlefile_broken_parquet(tmp_path): path = tmp_path / "test.parquet" attrs = {"units": "broken-units", "calendar": "365_day"} - field = pa.field("time", pa.float64(), metadata={"attrs": json.dumps(attrs)}) + field = pa.field("time", pa.float64(), metadata=attrs) schema = pa.schema([field]) table = pa.table({"time": pa.array([-20.0, 1.0])}, schema=schema) pq.write_table(table, path) diff --git a/tests/utils.py b/tests/utils.py index d20eaf845..f474aca2f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -154,7 +154,6 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: - import json import cftime import pyarrow as pa @@ -170,8 +169,7 @@ def assert_cftime_like_particlefile(parquet_path: Path) -> None: f"'time' column must be numeric, got {time_field.type}" ) - raw_meta = time_field.metadata - attrs = json.loads(raw_meta[b"attrs"]) + attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} values = table.column("time").to_pylist() v = xr.Variable(("time",), values, attrs) From b28665c4d8375c9390c42330e42d7f4c2284115a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 17:27:04 +0200 Subject: [PATCH 20/69] Move imports --- tests/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index f474aca2f..374c8ecc3 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -6,7 +6,10 @@ from collections import defaultdict from pathlib import Path +import cftime import numpy as np 
+import pyarrow as pa +import pyarrow.parquet as pq import xarray as xr import parcels @@ -154,11 +157,6 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: - - import cftime - import pyarrow as pa - import pyarrow.parquet as pq - assert parquet_path.suffix == ".parquet", "Path must be a parquet file" table = pq.read_table(parquet_path) From 7184e1f3c7b541c68e598fda71895b8e152a8c5a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 17:29:23 +0200 Subject: [PATCH 21/69] Fixing tests --- tests/test_particlefile.py | 41 +++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index f2a3c4553..0a9534e05 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -68,9 +68,10 @@ def test_write_fieldset_without_time(tmp_parquet): ofile = ParticleFile(tmp_parquet, outputdt=np.timedelta64(1, "s")) pset.execute(DoNothing, runtime=np.timedelta64(1, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - df = pd.read_parquet(tmp_parquet) - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert df["time"][1] == np.timedelta64(1, "s") + table = pq.read_table(tmp_parquet) + assert table.schema.field("time").metadata[b"units"] == b"seconds" + assert b"calendar" not in table.schema.field("time").metadata + assert table["time"].to_numpy()[1] == 1.0 @pytest.mark.skip("Keep or remove? Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") @@ -114,7 +115,6 @@ def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): pfile.close() df = pd.read_parquet(tmp_parquet) - # np.testing.assert_allclose(ds["time"][:, 0] - fieldset.time_interval.left, np.timedelta64(0, "s")) # TODO: Need to figure out how times work with parquet output (#2386) assert df["trajectory"].nunique() == npart @@ -344,8 +344,7 @@ def Update_lon(particles, fieldset): # pragma: no cover df = pd.read_parquet(tmp_parquet) assert np.allclose(df["lon"].values, [0, 3, 6, 9]) - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert np.allclose(timedelta_to_float(df.time.values - df.time.values[0, 0]), [0, 3, 6, 9]) + assert np.allclose(df.time - df.time.min(), [0, 3, 6, 9]) def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwargs, particle_class=Particle): @@ -374,9 +373,10 @@ def test_pset_execute_outputdt_forwards(fieldset): runtime = timedelta(hours=5) dt = timedelta(minutes=5) - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - assert np.all(ds.isel(trajectory=0).time.diff(dim="obs").values == np.timedelta64(outputdt)) # noqa: F821 + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + particle_0_times = df[df.trajectory == 0].time.values + + np.testing.assert_equal(np.diff(particle_0_times), outputdt.seconds) def test_pset_execute_output_time_forwards(fieldset): @@ -385,12 +385,9 @@ def test_pset_execute_output_time_forwards(fieldset): runtime = np.timedelta64(5, "h") dt = np.timedelta64(5, "m") - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output 
(#2386)") - assert ( - ds.time[0, 0].values == fieldset.time_interval.left # noqa: F821 - and ds.time[0, -1].values == fieldset.time_interval.left + runtime # noqa: F821 - ) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + assert df.time.min() == 0.0 + assert df.time.max() == runtime / np.timedelta64(1, "s") def test_pset_execute_outputdt_backwards(fieldset): @@ -399,10 +396,9 @@ def test_pset_execute_outputdt_backwards(fieldset): runtime = timedelta(days=2) dt = -timedelta(minutes=5) - df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 - assert np.all(file_outputdt == np.timedelta64(-outputdt)) + df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) + particle_0_times = df[df.trajectory == 0].time.values + np.testing.assert_equal(np.diff(particle_0_times), -outputdt.seconds) def test_pset_execute_outputdt_backwards_fieldset_timevarying(): @@ -419,10 +415,9 @@ def test_pset_execute_outputdt_backwards_fieldset_timevarying(): ds_fset = copernicusmarine_to_sgrid(fields=fields) fieldset = FieldSet.from_sgrid_conventions(ds_fset) - df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) # noqa: F841 - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - file_outputdt = ds.isel(trajectory=0).time.diff(dim="obs").values # noqa: F821 - assert np.all(file_outputdt == np.timedelta64(-outputdt)), (file_outputdt, np.timedelta64(-outputdt)) + df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset) + particle_0_times = df[df.trajectory == 0].time.values + np.testing.assert_equal(np.diff(particle_0_times), -outputdt.seconds) def test_particlefile_init(tmp_parquet): From e7e37ef3a81228e6280787c39d98b9389df0189f Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Wed, 22 Apr 2026 18:31:39 +0200 Subject: [PATCH 22/69] Fix test_time_is_age test --- tests/test_particlefile.py | 15 ++++++++------- tests/utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 0a9534e05..0afc81f44 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -30,6 +30,7 @@ from parcels.convert import copernicusmarine_to_sgrid from parcels.interpolators import XLinear, XLinear_Velocity from parcels.kernels import AdvectionRK4 +from tests import utils from tests.common_kernels import DoNothing @@ -306,13 +307,13 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - pytest.skip("# TODO: Need to figure out how times work with parquet output (#2386)") - ds = xr.open_zarr(tmp_parquet) - age = ds["age"][:].values.astype("timedelta64[s]") - ds_timediff = np.zeros_like(age) - for i in range(npart): - ds_timediff[i, :] = ds.time.values[i, :] - time[i] - np.testing.assert_equal(age, ds_timediff) + # df = pd.read_parquet(tmp_parquet) + df = utils.read_particlefile(tmp_parquet) + + # Map sorted trajectory IDs to release times (0, 1, ..., npart-1 seconds) + for index, df_traj 
in df.groupby("trajectory"): + release_time = time[index] + np.testing.assert_equal(df_traj["age"].astype("timedelta64[s]").values, (df_traj["time"] - release_time).values) def test_reset_dt(fieldset, tmp_parquet): diff --git a/tests/utils.py b/tests/utils.py index 374c8ecc3..f2d88919f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,6 +8,7 @@ import cftime import numpy as np +import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import xarray as xr @@ -178,3 +179,35 @@ def assert_cftime_like_particlefile(parquet_path: Path) -> None: "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" ) return + + +def read_particlefile(path: Path, decode_times: bool = True) -> pd.DataFrame: + assert path.suffix == ".parquet", "Only Parquet files are supported" + + table = pq.read_table(path) + + try: + time_field = table.field("time") + except KeyError as e: + raise ValueError( + f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" + ) from e + + try: + assert b"units" in time_field.metadata + except AssertionError as e: + raise ValueError(f"Could not find 'units' in the 'time' column metadata for parquet {path=!r}.") from e + + attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} + + df = pd.read_parquet(path) + if not decode_times: + return df + + values = table.column("time").to_numpy() + var = xr.Variable(("time",), values, attrs) + values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values + + df["time"] = values + + return df From 54c829a7dd04584827b4395da33b61741c4d9dae Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:03:09 +0200 Subject: [PATCH 23/69] Refactor assert_cftime_like_particlefile --- tests/utils.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index f2d88919f..166710bec 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -160,22 +160,10 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: assert parquet_path.suffix == ".parquet", "Path must be a parquet file" - table = pq.read_table(parquet_path) - assert "time" in table.schema.names, "Parquet file must have a 'time' column" - - time_field = table.schema.field("time") - assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( - f"'time' column must be numeric, got {time_field.type}" - ) - - attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} - - values = table.column("time").to_pylist() - v = xr.Variable(("time",), values, attrs) - decoded = xr.coders.CFDatetimeCoder(time_unit="s").decode(v) + df = read_particlefile(parquet_path, decode_times=True) # check first value (and hence rest of array) is what we expect - assert isinstance(decoded.values[0], (cftime.datetime, np.datetime64)), ( + assert isinstance(df["time"].values[0], (cftime.datetime, np.datetime64)), ( "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" ) return @@ -193,6 +181,10 @@ def read_particlefile(path: Path, decode_times: bool = True) -> pd.DataFrame: f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" 
) from e + assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( + f"'time' column must be numeric, got {time_field.type}" + ) + try: assert b"units" in time_field.metadata except AssertionError as e: From 8626d486cbceaf2f79cc5c51b2941cba133268fc Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 13:37:45 +0200 Subject: [PATCH 24/69] Self-review feedback --- tests/test_particlefile.py | 6 +++--- tests/utils.py | 5 ++++- tests/utils/test_time.py | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 0afc81f44..d8cd8fd1f 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -363,7 +363,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) - df = pd.read_parquet(name) + df = utils.read_particlefile(name) return df @@ -387,8 +387,8 @@ def test_pset_execute_output_time_forwards(fieldset): dt = np.timedelta64(5, "m") df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt)) - assert df.time.min() == 0.0 - assert df.time.max() == runtime / np.timedelta64(1, "s") + assert df.time.min() == pd.Timestamp(fieldset.time_interval.left) + assert df.time.max() - df.time.min() == runtime def test_pset_execute_outputdt_backwards(fieldset): diff --git a/tests/utils.py b/tests/utils.py index 166710bec..93214848d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,6 +17,7 @@ from parcels import FieldSet, Particle, Variable from parcels._core.xgrid import _FIELD_DATA_ORDERING, XGrid, get_axis_from_dim_name from parcels._datasets.structured.generated import simple_UV_dataset +from parcels._typing import PathLike PROJECT_ROOT = Path(__file__).resolve().parents[1] TEST_ROOT = PROJECT_ROOT / "tests" @@ -169,7 +170,9 @@ def assert_cftime_like_particlefile(parquet_path: Path) -> None: return -def read_particlefile(path: Path, decode_times: bool = True) -> pd.DataFrame: +def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: + path = Path(path) + assert path.suffix == ".parquet", "Only Parquet files are supported" table = pq.read_table(path) diff --git a/tests/utils/test_time.py b/tests/utils/test_time.py index 2734b3e91..26cc39c1c 100644 --- a/tests/utils/test_time.py +++ b/tests/utils/test_time.py @@ -224,4 +224,5 @@ def test_timedelta_to_float_exceptions(): @given(datetime_strategy()) def test_datetime_get_cf_attrs(dt): - _get_cf_attrs(dt) + attrs = _get_cf_attrs(dt) + assert "seconds" in attrs["units"] From 3693329fcf6fcb0bc44a499d19cddbfbf1b15b8d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 14:34:53 +0200 Subject: [PATCH 25/69] Fix test_particle_schema --- tests/test_particlefile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index d8cd8fd1f..fb5b34bf7 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -24,7 +24,7 @@ ) from parcels._core.particle import Particle, get_default_particle from parcels._core.particlefile import _get_schema -from parcels._core.utils.time import timedelta_to_float +from parcels._core.utils.time import TimeInterval, timedelta_to_float from parcels._datasets.structured.generated import 
peninsula_dataset from parcels._datasets.structured.generic import datasets from parcels.convert import copernicusmarine_to_sgrid @@ -496,7 +496,7 @@ def Update_lon(particles, fieldset): # pragma: no cover ], ) def test_particle_schema(particle): - s = _get_schema(particle, {}) + s = _get_schema(particle, {}, TimeInterval(datetime(2023, 1, 1, 12, 0), datetime(2023, 1, 2, 12, 0))) written_variables = [v for v in particle.variables if v.to_write] @@ -510,5 +510,9 @@ def test_particle_schema(particle): strict=False, ): assert variable.name == pyarrow_field.name - assert variable.attrs == {k.decode(): v.decode() for k, v in pyarrow_field.metadata.items()} + if variable.name != "time": + assert variable.attrs == {k.decode(): v.decode() for k, v in pyarrow_field.metadata.items()} + else: + assert b"units" in pyarrow_field.metadata + assert b"calendar" in pyarrow_field.metadata assert pa.from_numpy_dtype(variable.dtype) == pyarrow_field.type From 81f127bdb4d6d696b49d6b4e39752d77f46b6e20 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Thu, 23 Apr 2026 14:49:08 +0200 Subject: [PATCH 26/69] Make read_particlefile public --- src/parcels/__init__.py | 3 ++- src/parcels/_core/particlefile.py | 40 ++++++++++++++++++++++++++++ tests/test_particlefile.py | 5 ++-- tests/utils.py | 44 +------------------------------ 4 files changed, 45 insertions(+), 47 deletions(-) diff --git a/src/parcels/__init__.py b/src/parcels/__init__.py index 2a7854cde..7ae1f6928 100644 --- a/src/parcels/__init__.py +++ b/src/parcels/__init__.py @@ -11,7 +11,7 @@ from parcels._core.fieldset import FieldSet from parcels._core.particleset import ParticleSet -from parcels._core.particlefile import ParticleFile +from parcels._core.particlefile import ParticleFile, read_particlefile from parcels._core.particle import ( Variable, Particle, @@ -67,6 +67,7 @@ "ParticleSetWarning", # Utilities "logger", + "read_particlefile", ] _stdlib_warnings.warn( diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 439db2668..b43d4fad2 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -8,8 +8,10 @@ import cftime import numpy as np +import pandas as pd import pyarrow as pa import pyarrow.parquet as pq +import xarray as xr import parcels from parcels._core.particle import ParticleClass @@ -207,3 +209,41 @@ def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # T attrs["calendar"] = calendar return attrs + + +def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: + path = Path(path) + + assert path.suffix == ".parquet", "Only Parquet files are supported" + + table = pq.read_table(path) + + try: + time_field = table.field("time") + except KeyError as e: + raise ValueError( + f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" 
+ ) from e + + assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), ( + f"'time' column must be numeric, got {time_field.type}" + ) + + try: + assert b"units" in time_field.metadata + except AssertionError as e: + raise ValueError(f"Could not find 'units' in the 'time' column metadata for parquet {path=!r}.") from e + + attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} + + df = pd.read_parquet(path) + if not decode_times: + return df + + values = table.column("time").to_numpy() + var = xr.Variable(("time",), values, attrs) + values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values + + df["time"] = values + + return df diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index fb5b34bf7..4fac1a809 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -30,7 +30,6 @@ from parcels.convert import copernicusmarine_to_sgrid from parcels.interpolators import XLinear, XLinear_Velocity from parcels.kernels import AdvectionRK4 -from tests import utils from tests.common_kernels import DoNothing @@ -308,7 +307,7 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) # df = pd.read_parquet(tmp_parquet) - df = utils.read_particlefile(tmp_parquet) + df = parcels.read_particlefile(tmp_parquet) # Map sorted trajectory IDs to release times (0, 1, ..., npart-1 seconds) for index, df_traj in df.groupby("trajectory"): @@ -363,7 +362,7 @@ def setup_pset_execute(*, fieldset: FieldSet, outputdt: timedelta, execute_kwarg output_file = ParticleFile(name, outputdt=outputdt) pset.execute(DoNothing, output_file=output_file, **execute_kwargs) - df = utils.read_particlefile(name) + df = parcels.read_particlefile(name) return df diff --git a/tests/utils.py b/tests/utils.py index 93214848d..33d6e0012 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,16 +8,12 @@ import cftime import numpy as np -import pandas as pd -import pyarrow as pa -import pyarrow.parquet as pq import xarray as xr import parcels from parcels import FieldSet, Particle, Variable from parcels._core.xgrid import _FIELD_DATA_ORDERING, XGrid, get_axis_from_dim_name from parcels._datasets.structured.generated import simple_UV_dataset -from parcels._typing import PathLike PROJECT_ROOT = Path(__file__).resolve().parents[1] TEST_ROOT = PROJECT_ROOT / "tests" @@ -161,48 +157,10 @@ def round_and_hash_float_array(arr, decimals=6): def assert_cftime_like_particlefile(parquet_path: Path) -> None: assert parquet_path.suffix == ".parquet", "Path must be a parquet file" - df = read_particlefile(parquet_path, decode_times=True) + df = parcels.read_particlefile(parquet_path, decode_times=True) # check first value (and hence rest of array) is what we expect assert isinstance(df["time"].values[0], (cftime.datetime, np.datetime64)), ( "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" ) return - - -def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: - path = Path(path) - - assert path.suffix == ".parquet", "Only Parquet files are supported" - - table = pq.read_table(path) - - try: - time_field = table.field("time") - except KeyError as e: - raise ValueError( - f"Could not find 'time' column in parquet file. Are you sure {path=!r} is a particlefile?" 
-    ) from e
-
-    assert pa.types.is_floating(time_field.type) or pa.types.is_integer(time_field.type), (
-        f"'time' column must be numeric, got {time_field.type}"
-    )
-
-    try:
-        assert b"units" in time_field.metadata
-    except AssertionError as e:
-        raise ValueError(f"Could not find 'units' in the 'time' column metadata for parquet {path=!r}.") from e
-
-    attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()}
-
-    df = pd.read_parquet(path)
-    if not decode_times:
-        return df
-
-    values = table.column("time").to_numpy()
-    var = xr.Variable(("time",), values, attrs)
-    values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values
-
-    df["time"] = values
-
-    return df

From 9fcb5bfbc8f89534d4191d17b332929a14034942 Mon Sep 17 00:00:00 2001
From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com>
Date: Thu, 23 Apr 2026 14:52:55 +0200
Subject: [PATCH 27/69] Add docstring to read_particlefile

---
 src/parcels/_core/particlefile.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py
index b43d4fad2..e5b7ee5af 100644
--- a/src/parcels/_core/particlefile.py
+++ b/src/parcels/_core/particlefile.py
@@ -212,6 +212,31 @@ def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # T

 def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame:
+    """Read a Parcels particlefile (Parquet format) into a pandas DataFrame.
+
+    Parameters
+    ----------
+    path : PathLike
+        Path to the ``.parquet`` particlefile.
+    decode_times : bool, optional
+        If ``True`` (default), use Xarray to decode the numeric ``time`` column from CF
+        conventions into ``datetime`` or ``cftime.datetime`` values using the units stored in
+        the column metadata. If ``False``, the raw numeric values are
+        returned unchanged.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame containing the particle data. When *decode_times* is
+        ``True``, the ``time`` column contains datetime-like values;
+        otherwise it contains the original numeric representation.
+
+    Notes
+    -----
+    For larger datasets, consider using `Polars <https://pola.rs>`_ directly,
+    e.g. ``polars.read_parquet(path)``, which offers better performance and lower
+    memory usage than pandas for large Parquet files.
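+
+    Examples
+    --------
+    A minimal sketch (``"output.parquet"`` is a placeholder for an existing
+    particlefile path):
+
+    >>> df = read_particlefile("output.parquet")  # 'time' decoded to datetime-like values
+    >>> raw = read_particlefile("output.parquet", decode_times=False)  # 'time' left numeric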
+ """ path = Path(path) assert path.suffix == ".parquet", "Only Parquet files are supported" From 41ed3d8189653d8373f00c68471365c8e73c236e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 23 Apr 2026 15:32:25 +0200 Subject: [PATCH 28/69] Updarting Argo tutorial to use parquet --- .../examples/tutorial_Argofloats.ipynb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/user_guide/examples/tutorial_Argofloats.ipynb b/docs/user_guide/examples/tutorial_Argofloats.ipynb index a96e7bcb6..1e6abb9a9 100644 --- a/docs/user_guide/examples/tutorial_Argofloats.ipynb +++ b/docs/user_guide/examples/tutorial_Argofloats.ipynb @@ -159,9 +159,8 @@ "\n", "# Create a ParticleFile object to store the output\n", "output_file = parcels.ParticleFile(\n", - " store=\"argo_float.zarr\",\n", + " \"argo_float.parquet\",\n", " outputdt=timedelta(minutes=15),\n", - " chunks=(1, 500), # setting to write in chunks of 500 observations\n", ")\n", "\n", "# Now execute the Kernels for 30 days, saving data every 30 minutes\n", @@ -189,12 +188,13 @@ "metadata": {}, "outputs": [], "source": [ - "ds_particles = xr.open_zarr(output_file.store)\n", - "x = ds_particles[\"lon\"][:].squeeze()\n", - "y = ds_particles[\"lat\"][:].squeeze()\n", - "z = ds_particles[\"z\"][:].squeeze()\n", - "time = ds_particles[\"time\"][:].squeeze()\n", - "temp = ds_particles[\"temp\"][:].squeeze()" + "df = parcels.read_particlefile(\"argo_float.parquet\")\n", + "\n", + "x = df[\"lon\"].values\n", + "y = df[\"lat\"].values\n", + "z = df[\"z\"].values\n", + "time = df[\"time\"].values\n", + "temp = df[\"temp\"].values" ] }, { From b64a00e7b317d5445e93f752635acdcd92efecec Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 23 Apr 2026 16:12:25 +0200 Subject: [PATCH 29/69] Updating tutorial_nemo to use parquet output --- docs/user_guide/examples/tutorial_nemo.ipynb | 38 +++++++++++--------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/docs/user_guide/examples/tutorial_nemo.ipynb b/docs/user_guide/examples/tutorial_nemo.ipynb index 61fdbac56..77de0c5e5 100644 --- a/docs/user_guide/examples/tutorial_nemo.ipynb +++ b/docs/user_guide/examples/tutorial_nemo.ipynb @@ -151,7 +151,7 @@ "\n", "pset = parcels.ParticleSet(fieldset, lon=lonp, lat=latp)\n", "pfile = parcels.ParticleFile(\n", - " store=\"output_curvilinear.zarr\", outputdt=np.timedelta64(1, \"D\")\n", + " \"output_curvilinear.parquet\", outputdt=np.timedelta64(1, \"D\")\n", ")\n", "\n", "pset.execute(\n", @@ -176,9 +176,11 @@ "metadata": {}, "outputs": [], "source": [ - "ds = xr.open_zarr(\"output_curvilinear.zarr\")\n", + "df = parcels.read_particlefile(\"output_curvilinear.parquet\")\n", + "\n", + "for _, g in df.groupby(\"trajectory\"):\n", + " plt.plot(g.lon, g.lat, \".-\")\n", "\n", - "plt.plot(ds.lon.T, ds.lat.T, \".-\")\n", "plt.vlines(np.arange(-180, 901, 360), -90, 90, color=\"r\", label=\"antimeridian\")\n", "plt.ylabel(\"Latitude [deg N]\")\n", "plt.xlabel(\"Longitude [deg E]\")\n", @@ -202,8 +204,8 @@ "outputs": [], "source": [ "# post processing\n", - "ds[\"lon\"] = ds[\"lon\"] % 360\n", - "ds[\"lon\"] = ds[\"lon\"].where(ds[\"lon\"] <= 180, ds[\"lon\"] - 360)" + "df[\"lon\"] = df[\"lon\"] % 360\n", + "df[\"lon\"] = df[\"lon\"].where(df[\"lon\"] <= 180, df[\"lon\"] - 360)" ] }, { @@ -225,7 +227,7 @@ "\n", "pset = parcels.ParticleSet(fieldset, lon=lonp, lat=latp)\n", "pfile = parcels.ParticleFile(\n", - " store=\"output_curvilinear_periodic.zarr\", outputdt=np.timedelta64(1, \"D\")\n", + " 
\"output_curvilinear_periodic.parquet\", outputdt=np.timedelta64(1, \"D\")\n", ")\n", "\n", "pset.execute(\n", @@ -242,10 +244,9 @@ "metadata": {}, "outputs": [], "source": [ - "ds_periodic = xr.open_zarr(\"output_curvilinear_periodic.zarr\")\n", - "\n", "fig, ax = plt.subplots(1, 2, figsize=(10, 5))\n", - "ax[0].plot(ds.lon.T, ds.lat.T, \".-\")\n", + "for _, g in df.groupby(\"trajectory\"):\n", + " ax[0].plot(g.lon, g.lat, \".-\")\n", "ax[0].vlines(np.arange(-180, 360, 360), -90, 90, color=\"r\", label=\"antimeridian\")\n", "ax[0].set_ylabel(\"Latitude [deg N]\")\n", "ax[0].set_xlabel(\"Longitude [deg E]\")\n", @@ -253,7 +254,11 @@ "ax[0].set_title(\"in post processing\")\n", "ax[0].legend(loc=\"lower center\")\n", "\n", - "ax[1].plot(ds_periodic.lon.T, ds_periodic.lat.T, \".-\")\n", + "\n", + "df_periodic = parcels.read_particlefile(\"output_curvilinear_periodic.parquet\")\n", + "for _, g in df_periodic.groupby(\"trajectory\"):\n", + " ax[1].plot(g.lon, g.lat, \".-\")\n", + "\n", "ax[1].vlines(np.arange(-180, 360, 360), -90, 90, color=\"r\", label=\"antimeridian\")\n", "ax[1].set_ylabel(\"Latitude [deg N]\")\n", "ax[1].set_xlabel(\"Longitude [deg E]\")\n", @@ -325,9 +330,7 @@ " z=100 * np.ones(npart),\n", ")\n", "\n", - "pfile = parcels.ParticleFile(\n", - " store=\"output_nemo3D.zarr\", outputdt=np.timedelta64(1, \"D\")\n", - ")\n", + "pfile = parcels.ParticleFile(\"output_nemo3D.parquet\", outputdt=np.timedelta64(1, \"D\"))\n", "\n", "pset.execute(\n", " parcels.kernels.AdvectionRK2_3D,\n", @@ -354,8 +357,9 @@ "field = field.where(field != 0, np.nan) # Mask land values for better plotting\n", "plt.pcolormesh(fieldset.U.grid.lon, fieldset.U.grid.lat, field, cmap=\"RdBu\")\n", "\n", - "ds_out = xr.open_zarr(\"output_nemo3D.zarr\")\n", - "plt.scatter(ds_out.lon.T, ds_out.lat.T, c=-ds_out.z.T, marker=\".\")\n", + "df_out = parcels.read_particlefile(\"output_nemo3D.parquet\")\n", + "for _, g in df_out.groupby(\"trajectory\"):\n", + " plt.scatter(g.lon, g.lat, c=-g.z, marker=\".\")\n", "plt.colorbar(label=\"Depth (m)\")\n", "plt.show()" ] @@ -363,7 +367,7 @@ ], "metadata": { "kernelspec": { - "display_name": "default", + "display_name": "docs", "language": "python", "name": "python3" }, @@ -377,7 +381,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.3" + "version": "3.14.4" } }, "nbformat": 4, From 5e0fc7f2fd7184d039efd4d64086e790114ff477 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 23 Apr 2026 16:31:34 +0200 Subject: [PATCH 30/69] Update tutorial_diffusion to use parquet --- .../examples/tutorial_diffusion.ipynb | 77 ++++++++++--------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/docs/user_guide/examples/tutorial_diffusion.ipynb b/docs/user_guide/examples/tutorial_diffusion.ipynb index 5010f8406..b05dd7166 100644 --- a/docs/user_guide/examples/tutorial_diffusion.ipynb +++ b/docs/user_guide/examples/tutorial_diffusion.ipynb @@ -116,7 +116,6 @@ "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import trajan as ta\n", "import xarray as xr\n", "\n", "import parcels\n", @@ -253,11 +252,7 @@ "outputs": [], "source": [ "testParticles = get_test_particles()\n", - "output_file = parcels.ParticleFile(\n", - " store=\"M1_out.zarr\",\n", - " chunks=(len(testParticles), 50),\n", - " outputdt=np.timedelta64(1, \"ms\"),\n", - ")\n", + "output_file = parcels.ParticleFile(\"M1_out.parquet\", outputdt=np.timedelta64(1, \"ms\"))\n", "\n", "testParticles.execute(\n", " 
parcels.kernels.AdvectionDiffusionM1,\n", @@ -273,7 +268,7 @@ "metadata": {}, "outputs": [], "source": [ - "M1_out = xr.open_zarr(\"M1_out.zarr\")" + "M1_out = parcels.read_particlefile(\"M1_out.parquet\")" ] }, { @@ -294,19 +289,24 @@ "fig.set_figwidth(12)\n", "\n", "x = np.arange(0, 0.3, 0.001)\n", - "for data, ai, dim, ystart, ylim in zip(\n", - " [M1_out.lat, M1_out.lon], ax, (\"y\", \"x\"), (0.75, 0), [(0, 1), (-1, 1)]\n", - "):\n", - " ai.plot(x, data.T[: len(x), :], alpha=0.3)\n", - " ai.scatter(0, ystart, s=20, c=\"r\", zorder=3)\n", + "for _, g in M1_out.groupby(\"trajectory\"):\n", + " ax[0].plot(x, g.lat[: len(x)], alpha=0.3)\n", + "ax[0].scatter(0, 0.75, s=20, c=\"r\", zorder=3)\n", + "ax[0].set_ylabel(\"y\")\n", + "ax[0].set_ylim(0, 1)\n", + "\n", + "for _, g in M1_out.groupby(\"trajectory\"):\n", + " ax[1].plot(x, g.lon[: len(x)], alpha=0.3)\n", + "ax[1].scatter(0, 0, s=20, c=\"r\", zorder=3)\n", + "ax[1].set_ylabel(\"x\")\n", + "ax[1].set_ylim(-1, 1)\n", + "\n", + "for ai in ax:\n", " ai.set_xlabel(\"t\")\n", - " ai.set_ylabel(dim)\n", " ai.set_xlim(0, 0.3)\n", - " ai.set_ylim(ylim)\n", "\n", "fig.suptitle(\n", - " \"`AdvectionDiffusionM1` Simulation: \"\n", - " f\"Particle trajectories in the x- and y-directions against time\"\n", + " \"`AdvectionDiffusionM1` Simulation: Particle trajectories in the x- and y-directions against time\"\n", ")\n", "plt.show()" ] @@ -332,11 +332,7 @@ "outputs": [], "source": [ "testParticles = get_test_particles()\n", - "output_file = parcels.ParticleFile(\n", - " store=\"EM_out.zarr\",\n", - " chunks=(len(testParticles), 50),\n", - " outputdt=np.timedelta64(1, \"ms\"),\n", - ")\n", + "output_file = parcels.ParticleFile(\"EM_out.parquet\", outputdt=np.timedelta64(1, \"ms\"))\n", "np.random.seed(1636) # Random seed for reproducibility\n", "testParticles.execute(\n", " parcels.kernels.AdvectionDiffusionEM,\n", @@ -352,7 +348,7 @@ "metadata": {}, "outputs": [], "source": [ - "EM_out = xr.open_zarr(\"EM_out.zarr\")" + "EM_out = parcels.read_particlefile(\"EM_out.parquet\")" ] }, { @@ -365,19 +361,25 @@ "fig.set_figwidth(12)\n", "\n", "x = np.arange(0, 0.3, 0.001)\n", - "for data, ai, dim, ystart, ylim in zip(\n", - " [EM_out.lat, EM_out.lon], ax, (\"y\", \"x\"), (0.75, 0), [(0, 1), (-1, 1)]\n", - "):\n", - " ai.plot(x, data.T[: len(x), :], alpha=0.3)\n", - " ai.scatter(0, ystart, s=20, c=\"r\", zorder=3)\n", + "x = np.arange(0, 0.3, 0.001)\n", + "for _, g in EM_out.groupby(\"trajectory\"):\n", + " ax[0].plot(x, g.lat[: len(x)], alpha=0.3)\n", + "ax[0].scatter(0, 0.75, s=20, c=\"r\", zorder=3)\n", + "ax[0].set_ylabel(\"y\")\n", + "ax[0].set_ylim(0, 1)\n", + "\n", + "for _, g in EM_out.groupby(\"trajectory\"):\n", + " ax[1].plot(x, g.lon[: len(x)], alpha=0.3)\n", + "ax[1].scatter(0, 0, s=20, c=\"r\", zorder=3)\n", + "ax[1].set_ylabel(\"x\")\n", + "ax[1].set_ylim(-1, 1)\n", + "\n", + "for ai in ax:\n", " ai.set_xlabel(\"t\")\n", - " ai.set_ylabel(dim)\n", " ai.set_xlim(0, 0.3)\n", - " ai.set_ylim(ylim)\n", "\n", "fig.suptitle(\n", - " \"`AdvectionDiffusionEM` Simulation: \"\n", - " f\"Particle trajectories in the x- and y-directions against time\"\n", + " \"`AdvectionDiffusionEM` Simulation: Particle trajectories in the x- and y-directions against time\"\n", ")\n", "plt.show()" ] @@ -568,9 +570,7 @@ " lon=lon,\n", ")\n", "\n", - "output_file = parcels.ParticleFile(\n", - " store=\"smagdiff.zarr\", outputdt=np.timedelta64(1, \"h\"), chunks=(1, 57)\n", - ")\n", + "output_file = parcels.ParticleFile(\"smagdiff.parquet\", outputdt=np.timedelta64(1, 
\"h\"))\n", "\n", "np.random.seed(1636) # Random seed for reproducibility\n", "\n", @@ -596,13 +596,14 @@ "metadata": {}, "outputs": [], "source": [ - "ds_particles = xr.open_zarr(\"smagdiff.zarr\")\n", + "df_particles = parcels.read_particlefile(\"smagdiff.parquet\")\n", "\n", "temperature = ds_fields.isel(time=0, depth=0).thetao.plot(cmap=\"magma\")\n", "velocity = ds_fields.isel(time=0, depth=0).plot.quiver(\n", " x=\"longitude\", y=\"latitude\", u=\"uo\", v=\"vo\"\n", ")\n", - "particles = ds_particles.traj.plot(color=\"blue\")\n", + "for _, g in df_particles.groupby(\"trajectory\"):\n", + " plt.plot(g.lon, g.lat, color=\"blue\")\n", "plt.ylim(-31, -30)\n", "plt.xlim(31, 32.1)\n", "plt.show()" @@ -634,7 +635,7 @@ ], "metadata": { "kernelspec": { - "display_name": "test-notebooks", + "display_name": "docs", "language": "python", "name": "python3" }, @@ -648,7 +649,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.14.4" } }, "nbformat": 4, From 511ce102b3ab83801a56c470aa5caca5f9a323e3 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 24 Apr 2026 10:12:03 +0200 Subject: [PATCH 31/69] Update tutorial_output to use parquet --- docs/getting_started/tutorial_output.ipynb | 386 +++++++++------------ 1 file changed, 164 insertions(+), 222 deletions(-) diff --git a/docs/getting_started/tutorial_output.ipynb b/docs/getting_started/tutorial_output.ipynb index c3dbba852..18eb44435 100644 --- a/docs/getting_started/tutorial_output.ipynb +++ b/docs/getting_started/tutorial_output.ipynb @@ -21,7 +21,7 @@ "- [**Plotting**](#plotting)\n", "- [**Animations**](#animations)\n", "\n", - "For more advanced reading and tutorials on the analysis of Lagrangian trajectories, we recommend checking out the [Lagrangian Diagnostics Analysis Cookbook](https://lagrangian-diags.readthedocs.io/en/latest/tutorials.html) and the project in general. The [TrajAn package](https://opendrift.github.io/trajan/index.html) can be used to read and plot datasets of Lagrangian trajectories." + "For more advanced reading and tutorials on the analysis of Lagrangian trajectories, we recommend checking out the [Lagrangian Diagnostics Analysis Cookbook](https://lagrangian-diags.readthedocs.io/en/latest/tutorials.html) and the project in general." ] }, { @@ -30,10 +30,12 @@ "metadata": {}, "outputs": [], "source": [ - "from datetime import datetime, timedelta\n", + "from datetime import datetime\n", "\n", + "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import xarray as xr\n", + "import pandas as pd\n", + "import pyarrow.parquet as pq\n", "\n", "import parcels\n", "import parcels.tutorial" @@ -81,14 +83,14 @@ " fieldset=fieldset, pclass=parcels.Particle, lon=lon, lat=lat, time=time, z=z\n", ")\n", "\n", - "output_file = parcels.ParticleFile(\"output.zarr\", outputdt=np.timedelta64(2, \"h\"))" + "output_file = parcels.ParticleFile(\"output.parquet\", outputdt=np.timedelta64(2, \"h\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Parcels saves some metadata in the output file with every simulation (Parcels version, CF convention information, etc.). This metadata is just a dictionary which is propogated to `xr.Dataset(attrs=...)` and is stored in the `.metadata` attribute. We are free to manipulate this dictionary to add any custom, xarray-compatible metadata relevant to their simulation. Here we add a custom metadata field `date_created` to the output file." 
+ "Parcels saves some metadata in the output file with every simulation (Parcels version, CF convention information, etc.). This metadata is just a dictionary which is propogated to the parquet metadata. We are free to manipulate this dictionary to add any custom metadata relevant to our simulation. Here we add a custom metadata field `date_created` to the output file." ] }, { @@ -97,8 +99,8 @@ "metadata": {}, "outputs": [], "source": [ - "output_file.metadata[\"date_created\"] = datetime.now().isoformat()\n", - "output_file.metadata" + "output_file.extra_metadata[\"date_created\"] = datetime.now().isoformat()\n", + "output_file.extra_metadata" ] }, { @@ -126,23 +128,37 @@ ")" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading the output file\n", + "\n" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "```{note}\n", - "TODO: add section on chunking\n", + "As of Parcels v4, the default output format is [`parquet`](https://parquet.apache.org/) (instead of `zarr`). The `parquet` output format is a tabular format, in which every row corresponds to an observation of a particle trajectory. The `zarr` output format is a multidimensional array format, in which the data is stored in a 2D array with dimensions `traj` and `obs`. The `parquet` format is more compact and faster to read.\n", + "\n", + "However, the `parquet` format does not support the [CF-convention for trajectories data](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_multidimensional_array_representation_of_trajectories) implemented with the [NCEI trajectory template](https://www.ncei.noaa.gov/data/oceans/ncei/formats/netcdf/v2.0/trajectoryIncomplete.cdl). We are working on efficient tooling to convert the parcels `parquet` output into a CF-compliant format.\n", + "\n", + "TODO: Add link to tracking issue on github for this tooling.\n", "```" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Reading the output file\n", + "Parcels exports output trajectories in `parquet` [format](https://parquet.apache.org/). Files in `parquet` are stored in tabular data, so each row corresponds to a particle at a given time step, and columns correspond to particle attributes (lon, lat, time, etc.). \n", "\n", - "Parcels exports output trajectories in `zarr` [format](https://zarr.readthedocs.io/en/stable/). Files in `zarr` are typically _much_ smaller in size than netcdf, although may be slightly more challenging to handle (but `xarray` has a fairly seamless `open_zarr()` method). Note when we display the dataset we can see our custom metadata field `date_created`.\n" + "The files can be analysed with a wide range of tools, including `pandas` or the [`lt toolbox`](https://github.com/oj-tooth/lt_toolbox). 
The latter is specifically designed for the analysis of Lagrangian trajectories, and can be used to compute a wide range of Lagrangian diagnostics, but is still in an alpha stage of development.\n",
+   "\n",
+   "In pandas, these files can be opened with `pandas.read_parquet`:"
  ]
 },
 {
@@ -151,29 +167,34 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "ds_particles = xr.open_zarr(\"output.zarr\")\n",
+   "ds_pandas = pd.read_parquet(\"output.parquet\")\n",
    "\n",
-   "print(ds_particles)"
+   "print(ds_pandas)"
  ]
 },
 {
-  "attachments": {},
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-   "Once you have loaded the data as an `xarray` DataSet using `xr.open_zarr()`, you can always save the file to NetCDF if you prefer with the `.to_netcdf()` method.\n"
+   "To see our custom metadata field `date_created`, we need to use `pyarrow.parquet`:"
  ]
 },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "schema = pq.read_schema(\"output.parquet\")\n",
+   "for k, v in schema.metadata.items():\n",
+   "    print(k.decode(), \"->\", v.decode())"
+  ]
+ },
 {
-  "attachments": {},
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-   "As you may have noticed above, the `time` is shown as a `float` (in seconds) in `ds_pandas`. That is because `pandas.read_parquet` does not automatically convert the cftime. To handle this, we also provide a helper function `parcels.read_particlefile` to read ParticleFiles, which does automatically convert the cftime. "
+   "As you may have noticed above, the `time` column is shown as a `float` (in seconds) in `ds_pandas`. That is because `pandas.read_parquet` does not automatically decode the CF time metadata. To handle this, we also provide a helper function `parcels.read_particlefile` to read ParticleFiles, which does decode the times automatically."
  ]
 },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
   "ds_particles = parcels.read_particlefile(\"output.parquet\")\n",
   "\n",
   "print(ds_particles)"
  ]
 },
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
   "## Trajectory data structure\n",
   "\n",
   "The output dataset used here contains **10 particles** and **25 observations**. Not every particle has 25 observations however; since we released particles at different times, some particle trajectories are shorter than others.\n",
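+   "\n",
+   "As a quick check (a sketch using the `ds_particles` DataFrame loaded above), you can count how many observations were written per trajectory:\n",
+   "\n",
+   "```python\n",
+   "ds_particles.groupby(\"trajectory\").size()\n",
+   "```"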
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# np.set_printoptions(linewidth=160)\n", + "\n", + "for traj, g in ds_particles.groupby(\"trajectory\"):\n", + " time_in_hour = (g[\"time\"] - fieldset.time_interval.left) / np.timedelta64(1, \"h\")\n", + " print(f\"trajectory {traj}: \" + \"\".join(f\"{int(t):2d} \" for t in time_in_hour))" ] }, { @@ -202,29 +236,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Analysis\n", - "\n", - "Sometimes, trajectories are analyzed as they are stored: as individual time series. If we want to study the distance travelled as a function of time, the time we are interested in is the time relative to the start of the each particular trajectory: the array operations are simple since each trajectory is analyzed as a function of `obs`. The time variable is only needed to express the results in the correct units.\n" + "Note how the first observation occurs at a different time for each trajectory.\n" ] }, { - "cell_type": "code", - "execution_count": null, + "attachments": {}, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", + "## Plotting trajectories\n", "\n", - "x = ds_particles[\"lon\"].values\n", - "y = ds_particles[\"lat\"].values\n", - "distance = np.cumsum(\n", - " np.sqrt(np.square(np.diff(x)) + np.square(np.diff(y))), axis=1\n", - ") # d = (dx^2 + dy^2)^(1/2)\n", + "Parcels output consists of particle trajectories through time and space. An important way to explore patterns in this information is to draw the trajectories in space. The [**trajan**](https://opendrift.github.io/trajan/index.html) package can be used to quickly plot parcels results, but users are encouraged to create their own figures, for example by using the comprehensive [**matplotlib**](https://matplotlib.org/) library. Here we show a basic setup on how to process the parcels output into trajectory plots and animations.\n", + "\n", + "```{warning}\n", + "Trajan is not yet compatible with the `parquet` output format, but we are working on a solution to this.\n", + "```\n", "\n", - "real_time = time_from_start / one_hour # convert time to hours\n", - "time_since_release = (\n", - " real_time.transpose() - real_time[:, 0]\n", - ") # substract the initial time from each timeseries" + "Since the `parquet` output format is tabular, a simple plot of the longitude and latitude of all particles will show one continuous line:" ] }, { @@ -233,26 +261,16 @@ "metadata": {}, "outputs": [], "source": [ - "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), constrained_layout=True)\n", - "\n", - "ax1.set_ylabel(\"Distance travelled [m]\")\n", - "ax1.set_xlabel(\"observation\", weight=\"bold\")\n", - "d_plot = ax1.plot(distance.transpose())\n", - "\n", - "ax2.set_ylabel(\"Distance travelled [m]\")\n", - "ax2.set_xlabel(\"time since release [hours]\", weight=\"bold\")\n", - "d_plot_t = ax2.plot(time_since_release[1:], distance.transpose())\n", + "fig, ax = plt.subplots(figsize=(5, 3))\n", + "ax.plot(ds_particles[\"lon\"].values, ds_particles[\"lat\"].values, \".-\")\n", "plt.show()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "The two figures above show the same graph. Time is not needed to create the first figure. 
The time variable minus the first value of each trajectory gives the x-axis the correct units in the second figure.\n", - "\n", - "We can also plot the distance travelled as a function of the absolute time easily, since the `time` variable matches up with the data for each individual trajectory.\n" + "To show the trajectories of individual particles, we must group the data by trajectory and plot each trajectory separately:" ] }, { @@ -261,21 +279,19 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure()\n", - "ax = plt.axes()\n", - "ax.set_ylabel(\"Distance travelled [m]\")\n", - "ax.set_xlabel(\"time [hours]\", weight=\"bold\")\n", - "d_plot_t = ax.plot(real_time.T[1:], distance.transpose())" + "fig, ax = plt.subplots(figsize=(5, 3))\n", + "for traj, g in ds_particles.groupby(\"trajectory\"):\n", + " ax.plot(g[\"lon\"].values, g[\"lat\"].values, \".-\", label=f\"P{traj}\")\n", + "ax.legend(loc=\"center left\", bbox_to_anchor=(1.02, 0.5), borderaxespad=0.0)\n", + "plt.tight_layout()\n", + "plt.show()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "### Conditional selection\n", - "\n", - "In other cases, the processing of the data itself however depends on the absolute time at which the observations are made, e.g. studying seasonal phenomena. In that case the array structure is not as simple: the data must be selected by their `time` value. Here we show how the mean location of the particles evolves through time. This also requires the trajectory data to be aligned in time. The data are selected using `xr.DataArray.where()` which compares the time variable to a specific time. This type of selecting data with a condition (`ds_particles['time']==time`) is a powerful tool to analyze trajectory data.\n" + "However, if you want to plot only the locations at a certain time step, you can simply select the data at that time step:" ] }, { @@ -284,65 +300,47 @@ "metadata": {}, "outputs": [], "source": [ - "# Using xarray\n", - "mean_lon_x = []\n", - "mean_lat_x = []\n", + "time_step = np.timedelta64(18, \"h\")\n", + "time_to_plot = fieldset.time_interval.left + time_step\n", + "g = ds_particles[ds_particles[\"time\"] == time_to_plot]\n", "\n", - "timerange = np.arange(\n", - " np.nanmin(ds_particles[\"time\"].values),\n", - " np.nanmax(ds_particles[\"time\"].values) + np.timedelta64(timedelta(hours=2)),\n", - " timedelta(hours=2),\n", - ") # timerange in nanoseconds\n", - "\n", - "for time in timerange:\n", - " # if all trajectories share an observation at time\n", - " if np.all(np.any(ds_particles[\"time\"] == time, axis=1)):\n", - " # find the data that share the time\n", - " mean_lon_x += [\n", - " np.nanmean(ds_particles[\"lon\"].where(ds_particles[\"time\"] == time).values)\n", - " ]\n", - " mean_lat_x += [\n", - " np.nanmean(ds_particles[\"lat\"].where(ds_particles[\"time\"] == time).values)\n", - " ]" + "fig, ax = plt.subplots(figsize=(5, 3))\n", + "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", + "title_time = pd.to_datetime(time_to_plot).strftime(\"%Y-%m-%d %H:%M:%S\")\n", + "ax.set_title(f\"Particle locations at {title_time}\")\n", + "plt.show()" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "plt.figure()\n", - "ax = plt.axes()\n", - "ax.set_ylabel(\"Meridional distance [m]\")\n", - "ax.set_xlabel(\"Zonal distance [m]\")\n", - "ax.grid()\n", - "ax.scatter(mean_lon_x, mean_lat_x, marker=\"^\", s=80)\n", - "plt.show()" + "Or, if you want to plot the particles a certain 
amount of time after they were released, you can select the data based on the time since release:" ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "## Plotting\n", + "time_step = np.timedelta64(18, \"h\")\n", + "release_time = ds_particles[\"time\"] - ds_particles.groupby(\"trajectory\")[\n", + " \"time\"\n", + "].transform(\"min\")\n", + "g = ds_particles[release_time == time_step]\n", "\n", - "Parcels output consists of particle trajectories through time and space. An important way to explore patterns in this information is to draw the trajectories in space. The [**trajan**](https://opendrift.github.io/trajan/index.html) package can be used to quickly plot parcels results, but users are encouraged to create their own figures, for example by using the comprehensive [**matplotlib**](https://matplotlib.org/) library. Here we show a basic setup on how to process the parcels output into trajectory plots and animations.\n", - "\n", - "Some other packages to help you make beautiful figures are:\n", - "\n", - "- [**cartopy**](https://scitools.org.uk/cartopy/docs/latest/), a map-drawing tool especially compatible with matplotlib\n", - "- [**trajan**](https://opendrift.github.io/trajan/index.html), a package to quickly plot trajectories\n", - "- [**cmocean**](https://matplotlib.org/cmocean/), a set of ocean-relevant colormaps\n" + "fig, ax = plt.subplots(figsize=(5, 3))\n", + "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", + "ax.set_title(f\"Particle locations {time_step} after their release\")\n", + "plt.show()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "To draw the trajectory data in space usually it is informative to draw points at the observed coordinates to see the resolution of the output and draw a line through them to separate the different trajectories. The coordinates to draw are in `lon` and `lat` and can be passed to either `matplotlib.pyplot.plot` or `matplotlib.pyplot.scatter`. Note however, that the default way matplotlib plots 2D arrays is to plot a separate set for each column. 
In the parcels 2D output, the columns correspond to the `obs` dimension, so to separate the different trajectories we need to transpose the 2D array using `.T`.\n" + "We can plot the distance travelled as a function of absolute and relative time:" ] }, { @@ -351,23 +349,23 @@ "metadata": {}, "outputs": [], "source": [ - "fig, (ax1, ax2, ax3, ax4) = plt.subplots(\n", - " 1, 4, figsize=(16, 3.5), constrained_layout=True\n", - ")\n", + "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n", "\n", - "###-Points-###\n", - "ax1.set_title(\"Points\")\n", - "ax1.scatter(ds_particles[\"lon\"].T, ds_particles[\"lat\"].T)\n", - "###-Lines-###\n", - "ax2.set_title(\"Lines\")\n", - "ax2.plot(ds_particles[\"lon\"].T, ds_particles[\"lat\"].T)\n", - "###-Points + Lines-###\n", - "ax3.set_title(\"Points + Lines\")\n", - "ax3.plot(ds_particles[\"lon\"].T, ds_particles[\"lat\"].T, marker=\"o\")\n", - "###-Not Transposed-###\n", - "ax4.set_title(\"Not transposed\")\n", - "ax4.plot(ds_particles[\"lon\"], ds_particles[\"lat\"], marker=\"o\")\n", + "for traj, g in ds_particles.groupby(\"trajectory\"):\n", + " distance = np.sqrt(\n", + " (g[\"lon\"] - g[\"lon\"].values[0]) ** 2 + (g[\"lat\"] - g[\"lat\"].values[0]) ** 2\n", + " )\n", + " ax[0].plot(g[\"time\"], distance, \".-\", label=f\"P{traj}\")\n", + " rel_time = (g[\"time\"] - g[\"time\"].values[0]) / np.timedelta64(1, \"h\")\n", + " ax[1].plot(rel_time, distance, \".-\", label=f\"P{traj}\")\n", + "\n", + "ax[0].set_xlabel(\"Date\")\n", + "ax[0].set_ylabel(\"Distance travelled [degrees]\")\n", + "ax[0].tick_params(axis=\"x\", labelrotation=45)\n", + "ax[1].set_xlabel(\"Time since release [hours]\")\n", + "ax[1].set_ylabel(\"Distance travelled [degrees]\")\n", "\n", + "plt.tight_layout()\n", "plt.show()" ] }, @@ -384,9 +382,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Trajectory plots like the ones above can become very cluttered for large sets of particles. To better see patterns, it's a good idea to create an animation in time and space. To do this, matplotlib offers an [animation package](https://matplotlib.org/stable/api/animation_api.html). Here we show how to use the [**FuncAnimation**](https://matplotlib.org/3.3.2/api/_as_gen/matplotlib.animation.FuncAnimation.html#matplotlib.animation.FuncAnimation) class to animate parcels trajectory data, based on [this visualisation tutorial](https://github.com/Parcels-code/10year-anniversary-session5/blob/eaf7ac35f43c222280fa5577858be81dc346c06b/animations_tutorial.ipynb) from 10-years Parcels. \n", - "\n", - "To correctly reveal the patterns in time we must remember that the `obs` dimension does not necessarily correspond to the `time` variable ([see the section of Trajectory data structure above](#trajectory-data-structure)). In the animation of the particles, we usually want to draw the points at each consecutive moment in time, not necessarily at each moment since the start of the trajectory. To do this we must [select the correct data](#conditional-selection) in each rendering.\n" + "Trajectory plots like the ones above can become very cluttered for large sets of particles. To better see patterns, it's a good idea to create an animation in time and space. To do this, matplotlib offers an [animation package](https://matplotlib.org/stable/api/animation_api.html). 
Here we show how to use the [**FuncAnimation**](https://matplotlib.org/3.3.2/api/_as_gen/matplotlib.animation.FuncAnimation.html#matplotlib.animation.FuncAnimation) class to animate parcels trajectory data.\n" ] }, { @@ -398,15 +394,8 @@ "import cartopy.crs as ccrs\n", "import cartopy.feature as cfeature\n", "import matplotlib\n", - "from matplotlib.animation import FuncAnimation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "from matplotlib.animation import FuncAnimation\n", + "\n", "# for interactive display of animation\n", "plt.rcParams[\"animation.html\"] = \"jshtml\"" ] @@ -417,49 +406,22 @@ "metadata": {}, "outputs": [], "source": [ - "# Number of timesteps to animate\n", - "nframes = 25 # use less frames for testing purposes\n", - "nreducedtrails = 1 # every 10th particle will have a trail (if 1, all particles have trails. Adjust for faster performance)\n", - "\n", + "time_step = np.timedelta64(2, \"h\") # time step for animation frames\n", "\n", - "# Set up the colors and associated trajectories:\n", - "# get release times for each particle (first valide obs for each trajectory)\n", - "release_times = ds_particles[\"time\"].min(dim=\"obs\", skipna=True).values\n", - "\n", - "# get unique release times and assign colors\n", - "unique_release_times = np.unique(release_times[~np.isnat(release_times)])\n", - "n_release_times = len(unique_release_times)\n", - "print(f\"Number of unique release times: {n_release_times}\")\n", + "timerange = np.arange(\n", + " np.nanmin(ds_particles[\"time\"]),\n", + " np.nanmax(ds_particles[\"time\"]) + time_step,\n", + " time_step,\n", + ")\n", "\n", - "# choose a continuous colormap\n", + "# set up a unique color for each trajectory\n", "colormap = matplotlib.colormaps[\"tab20b\"]\n", - "\n", - "# set up a unique color for each release time\n", - "release_time_to_color = {}\n", - "for i, release_time in enumerate(unique_release_times):\n", - " release_time_to_color[release_time] = colormap(i / max(n_release_times - 1, 1))\n", - "\n", - "\n", - "# --> Store data for all timeframes (this is needed for faster performance)\n", - "print(\"Pre-computing all particle positions...\")\n", - "all_particles_data = []\n", - "for i, target_time in enumerate(timerange):\n", - " time_id = np.where(ds_particles[\"time\"] == target_time)\n", - " lons = ds_particles[\"lon\"].values[time_id]\n", - " lats = ds_particles[\"lat\"].values[time_id]\n", - " particle_indices = time_id[0]\n", - " valid = ~np.isnan(lons) & ~np.isnan(lats)\n", - "\n", - " all_particles_data.append(\n", - " {\n", - " \"lons\": lons[valid],\n", - " \"lats\": lats[valid],\n", - " \"particle_indices\": particle_indices[valid],\n", - " \"valid_count\": np.sum(valid),\n", - " }\n", + "trajectory_to_color = {}\n", + "for i, trajectory in enumerate(ds_particles[\"trajectory\"].unique()):\n", + " trajectory_to_color[trajectory] = colormap(\n", + " i / max(len(ds_particles[\"trajectory\"].unique()) - 1, 1)\n", " )\n", "\n", - "\n", "# figure setup\n", "fig, ax = plt.subplots(figsize=(6, 5), subplot_kw={\"projection\": ccrs.PlateCarree()})\n", "ax.set_xlim(30, 33)\n", @@ -469,86 +431,66 @@ "ax.set_yticks(ticks=np.arange(-33, -29.5, 0.5))\n", "ax.set_yticklabels(np.arange(33, 29.5, -0.5).astype(str))\n", "ax.set_ylabel(\"Latitude (deg S)\")\n", - "ax.coastlines(color=\"saddlebrown\")\n", - "ax.add_feature(cfeature.LAND, alpha=0.5, facecolor=\"saddlebrown\")\n", - "\n", - "# --> Use pre-computed data for initial setup\n", - "initial_data = 
all_particles_data[0]\n", - "initial_colors = []\n", - "for particle_idx in initial_data[\"particle_indices\"]:\n", - " rt = release_times[particle_idx]\n", - " if rt in release_time_to_color:\n", - " initial_colors.append(release_time_to_color[rt])\n", - " else:\n", - " initial_colors.append(\"blue\")\n", + "ax.coastlines()\n", + "ax.add_feature(cfeature.LAND)\n", "\n", "# --> plot first timestep\n", - "scatter = ax.scatter(initial_data[\"lons\"], initial_data[\"lats\"], s=10, c=initial_colors)\n", + "g = ds_particles[ds_particles[\"time\"] == timerange[0]]\n", + "scatter = ax.scatter(\n", + " g[\"lon\"], g[\"lat\"], s=10, c=[trajectory_to_color[rt] for rt in g[\"trajectory\"]]\n", + ")\n", "\n", "# --> initialize trails\n", "trail_plot = []\n", "\n", "# Set initial title\n", - "t_str = str(timerange[0])[:19] # Format datetime nicely\n", + "t_str = pd.to_datetime(timerange[0]).strftime(\n", + " \"%Y-%m-%d %H:%M:%S\"\n", + ") # Format datetime nicely\n", "title = ax.set_title(f\"Particles at t = {t_str}\")\n", "\n", "\n", "# loop over for animation\n", "def animate(i):\n", - " print(f\"Animating frame {i + 1}/{len(timerange)} at time {timerange[i]}\")\n", - " t_str = str(timerange[i])[:19]\n", + " t_str = pd.to_datetime(timerange[i]).strftime(\"%Y-%m-%d %H:%M:%S\")\n", " title.set_text(f\"Particles at t = {t_str}\")\n", "\n", " # Find particles at current time\n", - " current_data = all_particles_data[i]\n", - "\n", - " if current_data[\"valid_count\"] > 0:\n", - " current_colors = []\n", - " for particle_idx in current_data[\"particle_indices\"]:\n", - " rt = release_times[particle_idx]\n", - " current_colors.append(release_time_to_color[rt])\n", - "\n", - " scatter.set_offsets(np.c_[current_data[\"lons\"], current_data[\"lats\"]])\n", - " scatter.set_color(current_colors)\n", + " trajs_at_timestep = ds_particles[ds_particles[\"time\"] == timerange[i]]\n", "\n", - " # --> add trails\n", + " if len(trajs_at_timestep) > 0:\n", + " scatter.set_offsets(np.c_[trajs_at_timestep[\"lon\"], trajs_at_timestep[\"lat\"]])\n", + " scatter.set_color(\n", + " [trajectory_to_color[traj] for traj in trajs_at_timestep[\"trajectory\"]]\n", + " )\n", "\n", + " # --> reset trails\n", " for trail in trail_plot:\n", " trail.remove()\n", " trail_plot.clear()\n", - "\n", " trail_length = min(10, i) # trails will have max length of 10 time steps\n", - "\n", " if trail_length > 0:\n", - " sampled_particles = current_data[\"particle_indices\"][\n", - " ::nreducedtrails\n", - " ] # use all or sample if you want faster computation\n", - "\n", - " for particle_idx in sampled_particles:\n", - " trail_lons = []\n", - " trail_lats = []\n", - " for j in range(i - trail_length, i + 1):\n", - " past_data = all_particles_data[j]\n", - " if particle_idx in past_data[\"particle_indices\"]:\n", - " idx = np.where(past_data[\"particle_indices\"] == particle_idx)[\n", - " 0\n", - " ][0]\n", - " trail_lons.append(past_data[\"lons\"][idx])\n", - " trail_lats.append(past_data[\"lats\"][idx])\n", - " if len(trail_lons) > 1:\n", - " rt = release_times[particle_idx]\n", - " color = release_time_to_color[rt]\n", + " for traj in trajs_at_timestep[\"trajectory\"].unique():\n", + " traj_trail = ds_particles[\n", + " (ds_particles[\"trajectory\"] == traj)\n", + " & (ds_particles[\"time\"] >= timerange[max(0, i - trail_length)])\n", + " & (ds_particles[\"time\"] <= timerange[i])\n", + " ]\n", + " if len(traj_trail) > 1:\n", " (trail,) = ax.plot(\n", - " trail_lons, trail_lats, color=color, linewidth=0.6, alpha=0.6\n", + " 
traj_trail[\"lon\"],\n", + " traj_trail[\"lat\"],\n", + " color=trajectory_to_color[traj],\n", + " linewidth=0.6,\n", + " alpha=0.6,\n", " )\n", " trail_plot.append(trail)\n", - "\n", " else:\n", " scatter.set_offsets(np.empty((0, 2)))\n", "\n", "\n", "# Create animation\n", - "anim = FuncAnimation(fig, animate, frames=nframes, interval=100)\n", + "anim = FuncAnimation(fig, animate, frames=len(timerange), interval=100)\n", "plt.close(fig)\n", "anim" ] @@ -557,7 +499,7 @@ "metadata": { "celltoolbar": "Metagegevens bewerken", "kernelspec": { - "display_name": "default", + "display_name": "docs", "language": "python", "name": "python3" }, @@ -571,7 +513,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.3" + "version": "3.14.4" } }, "nbformat": 4, From 04d8676f113bea63dd85a2229b9f2e5574fbc852 Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 10:30:17 +0200 Subject: [PATCH 32/69] Review feedback --- src/parcels/_core/particlefile.py | 35 +++--------------------------- tests-v3/test_advection.py | 7 +++--- tests-v3/test_fieldset_sampling.py | 7 +++--- tests-v3/test_particlesets.py | 6 ++--- tests/test_particlefile.py | 1 - 5 files changed, 12 insertions(+), 44 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index e5b7ee5af..58483572d 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -2,11 +2,10 @@ from __future__ import annotations -from datetime import datetime, timedelta +from datetime import timedelta from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -import cftime import numpy as np import pandas as pd import pyarrow as pa @@ -52,10 +51,8 @@ class ParticleFile: Parameters ---------- - name : str - Basename of the output file. This can also be a Zarr store object. - particleset : - ParticleSet to output + path : PathLike + Path of the output Parquet file. outputdt : Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() @@ -94,12 +91,8 @@ def __init__(self, path: PathLike, outputdt): if not path.parent.exists(): raise ValueError(f"Folder location for {path=!r} does not exist. Create the folder location first.") - self._maxids = 0 - self._pids_written = {} self.extra_metadata = {} - # TODO v4: Add check that if create_new_zarrfile is False, the store already exists - def __repr__(self) -> str: return particlefile_repr(self) @@ -155,9 +148,6 @@ def write(self, pset: ParticleSet, time, indices=None): pa.table({v.name: pa.array(particle_data[v.name][indices_to_write]) for v in vars_to_write}), ) - # if len(indices_to_write) == 0: # TODO: Remove this? - # return - def close(self): if self._writer is not None: self._writer.close() @@ -192,25 +182,6 @@ def _to_write_particles(particle_data, time): )[0] -def _get_calendar_and_units(time_interval: TimeInterval) -> dict[str, str]: # TODO: Remove? 
- calendar = None - units = "seconds" - if time_interval: - if isinstance(time_interval.left, (np.datetime64, datetime)): - calendar = "standard" - elif isinstance(time_interval.left, cftime.datetime): - calendar = time_interval.left.calendar - - if calendar is not None: - units += f" since {time_interval.left}" - - attrs = {"units": units} - if calendar is not None: - attrs["calendar"] = calendar - - return attrs - - def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame: """Read a Parcels particlefile (Parquet format) into a pandas DataFrame. diff --git a/tests-v3/test_advection.py b/tests-v3/test_advection.py index bdd7a4221..3d8f06bac 100644 --- a/tests-v3/test_advection.py +++ b/tests-v3/test_advection.py @@ -1,6 +1,5 @@ import numpy as np import pytest -import pandas as pd import xarray as xr from parcels import ( @@ -80,7 +79,7 @@ def test_analyticalAgrid(): @pytest.mark.parametrize("v", [1, -0.3, 0, -1]) @pytest.mark.parametrize("w", [None, 1, -0.3, 0, -1]) @pytest.mark.parametrize("direction", [1, -1]) -def test_uniform_analytical(u, v, w, direction, tmp_parquet): +def test_uniform_analytical(u, v, w, direction, tmp_zarrfile): lon = np.arange(0, 15, dtype=np.float32) lat = np.arange(0, 15, dtype=np.float32) if w is not None: @@ -100,14 +99,14 @@ def test_uniform_analytical(u, v, w, direction, tmp_parquet): x0, y0, z0 = 6.1, 6.2, 20 pset = ParticleSet(fieldset, pclass=Particle, lon=x0, lat=y0, depth=z0) - outfile = pset.ParticleFile(name=tmp_parquet, outputdt=1, chunks=(1, 1)) + outfile = pset.ParticleFile(name=tmp_zarrfile, outputdt=1, chunks=(1, 1)) pset.execute(AdvectionAnalytical, runtime=4, dt=direction, output_file=outfile) assert np.abs(pset.lon - x0 - pset.time * u) < 1e-6 assert np.abs(pset.lat - y0 - pset.time * v) < 1e-6 if w is not None: assert np.abs(pset.depth - z0 - pset.time * w) < 1e-4 - ds = xr.open_zarr(tmp_parquet) + ds = xr.open_zarr(tmp_zarrfile) times = (direction * ds["time"][:]).values.astype("timedelta64[s]")[0] timeref = np.arange(1, 5).astype("timedelta64[s]") assert np.allclose(times, timeref, atol=np.timedelta64(1, "ms")) diff --git a/tests-v3/test_fieldset_sampling.py b/tests-v3/test_fieldset_sampling.py index 176eedab1..291c27b88 100644 --- a/tests-v3/test_fieldset_sampling.py +++ b/tests-v3/test_fieldset_sampling.py @@ -3,7 +3,6 @@ from math import cos, pi import numpy as np -import pandas as pd import pytest import xarray as xr @@ -774,7 +773,7 @@ def test_multiple_grid_addlater_error(): assert fail -def test_fieldset_sampling_updating_order(tmp_parquet): +def test_fieldset_sampling_updating_order(tmp_zarrfile): def calc_p(t, y, x): return 10 * t + x + 0.2 * y @@ -806,10 +805,10 @@ def SampleP(particle, fieldset, time): # pragma: no cover kernels = [AdvectionRK4, SampleP] - pfile = pset.ParticleFile(tmp_parquet, outputdt=1) + pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) pset.execute(kernels, endtime=1, dt=1, output_file=pfile) - ds = xr.open_zarr(tmp_parquet) + ds = xr.open_zarr(tmp_zarrfile) for t in range(len(ds["obs"])): for i in range(len(ds["trajectory"])): assert np.isclose( diff --git a/tests-v3/test_particlesets.py b/tests-v3/test_particlesets.py index 5c0f2495f..ed884f595 100644 --- a/tests-v3/test_particlesets.py +++ b/tests-v3/test_particlesets.py @@ -39,7 +39,7 @@ def test_pset_create_list_with_customvariable(fieldset): @pytest.mark.parametrize("restart", [True, False]) -def test_pset_create_fromparticlefile(fieldset, restart, tmp_parquet): +def test_pset_create_fromparticlefile(fieldset, restart, 
tmp_zarrfile): lon = np.linspace(0, 1, 10, dtype=np.float32) lat = np.linspace(1, 0, 10, dtype=np.float32) @@ -48,7 +48,7 @@ def test_pset_create_fromparticlefile(fieldset, restart, tmp_parquet): TestParticle = TestParticle.add_variable("p3", np.float64, to_write="once") pset = ParticleSet(fieldset, lon=lon, lat=lat, depth=[4] * len(lon), pclass=TestParticle, p3=np.arange(len(lon))) - pfile = pset.ParticleFile(tmp_parquet, outputdt=1) + pfile = pset.ParticleFile(tmp_zarrfile, outputdt=1) def Kernel(particle, fieldset, time): # pragma: no cover particle.p = 2.0 @@ -58,7 +58,7 @@ def Kernel(particle, fieldset, time): # pragma: no cover pset.execute(Kernel, runtime=2, dt=1, output_file=pfile) pset_new = ParticleSet.from_particlefile( - fieldset, pclass=TestParticle, filename=tmp_parquet, restart=restart, repeatdt=1 + fieldset, pclass=TestParticle, filename=tmp_zarrfile, restart=restart, repeatdt=1 ) for var in ["lon", "lat", "depth", "time", "p", "p2", "p3"]: diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index 4fac1a809..d53ff6aca 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -306,7 +306,6 @@ def IncreaseAge(particles, fieldset): # pragma: no cover pset.execute(IncreaseAge, runtime=np.timedelta64(npart * 2, "s"), dt=np.timedelta64(1, "s"), output_file=ofile) - # df = pd.read_parquet(tmp_parquet) df = parcels.read_particlefile(tmp_parquet) # Map sorted trajectory IDs to release times (0, 1, ..., npart-1 seconds) From 66951876e02f7cf3411e3a7db804babb83c1d01a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 10:56:16 +0200 Subject: [PATCH 33/69] Update migration guide --- docs/user_guide/v4-migration.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/v4-migration.md b/docs/user_guide/v4-migration.md index 7e33be83c..a472b9844 100644 --- a/docs/user_guide/v4-migration.md +++ b/docs/user_guide/v4-migration.md @@ -36,7 +36,9 @@ Version 4 of Parcels is unreleased at the moment. The information in this migrat ## ParticleFile - Particlefiles should be created by `ParticleFile(...)` instead of `pset.ParticleFile(...)` -- The `name` argument in `ParticleFile` has been replaced by `store` and can now be a string, a Path or a zarr store. 
+- `ParticleFile` output is now in Parquet format
+- `ParticleFile` writing behaviour now errors out if there's existing output (this is being further discussed in xxxx)
+
 ## Field
 
From b4e221412749cf0d3532bbd6383a51c02fec63ff Mon Sep 17 00:00:00 2001
From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com>
Date: Fri, 24 Apr 2026 10:54:48 +0200
Subject: [PATCH 34/69] Remove obs_written

---
 src/parcels/_core/particle.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/parcels/_core/particle.py b/src/parcels/_core/particle.py
index 5152b1130..15c55a519 100644
--- a/src/parcels/_core/particle.py
+++ b/src/parcels/_core/particle.py
@@ -163,7 +163,6 @@ def get_default_particle(spatial_dtype: type[np.float32] | type[np.float64]) ->
                 "cf_role": "trajectory_id",
             },
         ),
-        Variable("obs_written", dtype=np.int32, initial=0, to_write=False),
         Variable("dt", dtype=np.float64, initial=1.0, to_write=False),
         Variable("state", dtype=np.int32, initial=StatusCode.Evaluate, to_write=False),
     ]

From a5bdd310c4f09e4b0848908c1a6896b425b78bbb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Apr 2026 09:15:56 +0000
Subject: [PATCH 35/69] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/user_guide/v4-migration.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/user_guide/v4-migration.md b/docs/user_guide/v4-migration.md
index a472b9844..9e240d7a3 100644
--- a/docs/user_guide/v4-migration.md
+++ b/docs/user_guide/v4-migration.md
@@ -39,7 +39,6 @@ Version 4 of Parcels is unreleased at the moment. The information in this migrat
 - `ParticleFile` output is now in Parquet format
 - `ParticleFile` writing behaviour now errors out if there's existing output (this is being further discussed in xxxx)
-
 ## Field
 
 - `Field.eval()` returns an array of floats instead of a single float (related to the vectorization)

From 663f80ca14d8297443f7289eb2456aec316c38d8 Mon Sep 17 00:00:00 2001
From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com>
Date: Fri, 24 Apr 2026 10:56:16 +0200
Subject: [PATCH 36/69] Update migration guide

---
 docs/user_guide/v4-migration.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/user_guide/v4-migration.md b/docs/user_guide/v4-migration.md
index 7e33be83c..655f1e9c1 100644
--- a/docs/user_guide/v4-migration.md
+++ b/docs/user_guide/v4-migration.md
@@ -36,7 +36,10 @@ Version 4 of Parcels is unreleased at the moment. The information in this migrat
 ## ParticleFile
 
 - Particlefiles should be created by `ParticleFile(...)` instead of `pset.ParticleFile(...)`
-- The `name` argument in `ParticleFile` has been replaced by `store` and can now be a string, a Path or a zarr store.
+- `ParticleFile` output is now in Parquet format
+- `ParticleFile` writing behaviour now errors out if there's existing output (this is being further discussed in https://github.com/Parcels-code/Parcels/issues/2593)
+- A utility to read in ParticleFile output is now available: 
`parcels.read_particlefile()`
+
 ## Field
 
From 5be22aa1adf4fb7386dd101bb5708fcddd1fbe94 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 24 Apr 2026 09:15:56 +0000
Subject: [PATCH 37/69] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/user_guide/v4-migration.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/user_guide/v4-migration.md b/docs/user_guide/v4-migration.md
index 655f1e9c1..0e06bbc7b 100644
--- a/docs/user_guide/v4-migration.md
+++ b/docs/user_guide/v4-migration.md
@@ -40,7 +40,6 @@ Version 4 of Parcels is unreleased at the moment. The information in this migrat
 - `ParticleFile` writing behaviour now errors out if there's existing output (this is being further discussed in https://github.com/Parcels-code/Parcels/issues/2593)
 - A utility to read in ParticleFile output is now available: `parcels.read_particlefile()`
-
 ## Field
 
 - `Field.eval()` returns an array of floats instead of a single float (related to the vectorization)

From 002b8f2a97e73a7baa059b01cabdc6dd719e8a9a Mon Sep 17 00:00:00 2001
From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com>
Date: Fri, 24 Apr 2026 11:18:52 +0200
Subject: [PATCH 38/69] Revert from extra_metadata to metadata

---
 src/parcels/_core/particlefile.py | 8 +++-----
 src/parcels/_core/particleset.py  | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py
index 58483572d..cb0bf4050 100644
--- a/src/parcels/_core/particlefile.py
+++ b/src/parcels/_core/particlefile.py
@@ -91,13 +91,13 @@ def __init__(self, path: PathLike, outputdt):
         if not path.parent.exists():
             raise ValueError(f"Folder location for {path=!r} does not exist. 
Create the folder location first.") - self.extra_metadata = {} + self.metadata = {} def __repr__(self) -> str: return particlefile_repr(self) def set_metadata(self, parcels_grid_mesh: Literal["spherical", "flat"]): - self.extra_metadata.update( + self.metadata.update( { "feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", @@ -132,9 +132,7 @@ def write(self, pset: ParticleSet, time, indices=None): if self._writer is None: assert not self.path.exists(), "If the file exists, the writer should already be set" - self._writer = pq.ParquetWriter( - self.path, _get_schema(pclass, self.extra_metadata, pset.fieldset.time_interval) - ) + self._writer = pq.ParquetWriter(self.path, _get_schema(pclass, self.metadata, pset.fieldset.time_interval)) if isinstance(time, (np.timedelta64, np.datetime64)): time = timedelta_to_float(time - time_interval.left) diff --git a/src/parcels/_core/particleset.py b/src/parcels/_core/particleset.py index e59e2860b..b25c269a6 100644 --- a/src/parcels/_core/particleset.py +++ b/src/parcels/_core/particleset.py @@ -394,7 +394,7 @@ def execute( if output_file is not None: output_file.set_metadata(self.fieldset.gridset[0]._mesh) - output_file.extra_metadata["parcels_kernels"] = self._kernel.funcname + output_file.metadata["parcels_kernels"] = self._kernel.funcname dt, sign_dt = _convert_dt_to_float(dt) self._data["dt"][:] = dt From 3c5264798044135b7f603a0a1b8a1ff529fc7f7d Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 11:28:20 +0200 Subject: [PATCH 39/69] Fix test_pfile_array_remove_particles --- tests/test_particlefile.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py index d53ff6aca..d782fb171 100755 --- a/tests/test_particlefile.py +++ b/tests/test_particlefile.py @@ -1,4 +1,3 @@ -import os import tempfile from contextlib import nullcontext as does_not_raise from datetime import datetime, timedelta @@ -74,7 +73,6 @@ def test_write_fieldset_without_time(tmp_parquet): assert table["time"].to_numpy()[1] == 1.0 -@pytest.mark.skip("Keep or remove? 
Introduced in 5d7dd6bba800baa0fe4bd38edfc17ca3e310062b ") def test_pfile_array_remove_particles(fieldset, tmp_parquet): """If a particle from the middle of a particleset is removed, that writing doesn't crash""" npart = 10 @@ -92,9 +90,7 @@ def test_pfile_array_remove_particles(fieldset, tmp_parquet): new_time = 86400 # s in a day pset._data["time"][:] = new_time pfile.write(pset, new_time) - ds = xr.open_zarr(tmp_parquet) - timearr = ds["time"][:] - assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) + pfile.close() def test_pfile_array_remove_all_particles(fieldset, tmp_parquet): @@ -189,8 +185,6 @@ def IncrLon(particles, fieldset): # pragma: no cover # test whether samplevar[:, k] = k for k in range(samplevar.shape[1]): assert np.allclose([p for p in samplevar[:, k] if np.isfinite(p)], k + 1) - filesize = os.path.getsize(str(tmp_parquet)) - assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB def test_file_warnings(fieldset, tmp_parquet): From ae375b372a7d332a67dbbfe132174c022a4a973a Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 24 Apr 2026 13:44:24 +0200 Subject: [PATCH 40/69] Fix numpy warning https://github.com/Parcels-code/Parcels/pull/2583#discussion_r3131150923 --- src/parcels/_core/particlefile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index cb0bf4050..90384f356 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -164,15 +164,17 @@ def _to_write_particles(particle_data, time): time - np.abs(particle_data["dt"] / 2), particle_data["time"], where=np.isfinite(particle_data["time"]), + out=None, ) & np.greater_equal( time + np.abs(particle_data["dt"] / 2), particle_data["time"], where=np.isfinite(particle_data["time"]), + out=None, ) # check time - dt/2 <= particle_data["time"] <= time + dt/2 | ( (np.isnan(particle_data["dt"])) - & np.equal(time, particle_data["time"], where=np.isfinite(particle_data["time"])) + & np.equal(time, particle_data["time"], where=np.isfinite(particle_data["time"]), out=None) ) # or dt is NaN and time matches particle_data["time"] ) & (np.isfinite(particle_data["trajectory"])) From c10e7413e573a9b283c9029e7181c0f480c1000c Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 24 Apr 2026 13:55:59 +0200 Subject: [PATCH 41/69] Using polars in tutorial_output --- docs/getting_started/tutorial_output.ipynb | 53 +++++++++++----------- pixi.toml | 1 + 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/docs/getting_started/tutorial_output.ipynb b/docs/getting_started/tutorial_output.ipynb index 18eb44435..307ba9dd5 100644 --- a/docs/getting_started/tutorial_output.ipynb +++ b/docs/getting_started/tutorial_output.ipynb @@ -35,6 +35,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", + "import polars as pl\n", "import pyarrow.parquet as pq\n", "\n", "import parcels\n", @@ -99,8 +100,8 @@ "metadata": {}, "outputs": [], "source": [ - "output_file.extra_metadata[\"date_created\"] = datetime.now().isoformat()\n", - "output_file.extra_metadata" + "output_file.metadata[\"date_created\"] = datetime.now().isoformat()\n", + "output_file.metadata" ] }, { @@ -156,9 +157,9 @@ "source": [ "Parcels exports output trajectories in `parquet` [format](https://parquet.apache.org/). 
Files in `parquet` are stored in tabular data, so each row corresponds to a particle at a given time step, and columns correspond to particle attributes (lon, lat, time, etc.). \n", "\n", - "The files can be analysed with a wide range of tools, including `pandas` or the [`lt toolbox`](https://github.com/oj-tooth/lt_toolbox). The latter is specifically designed for the analysis of Lagrangian trajectories, and can be used to compute a wide range of Lagrangian diagnostics- but is still in alpha stage of development.\n", + "The files can be analysed with a wide range of tools, including `pandas`, `polars` or the [`lt toolbox`](https://github.com/oj-tooth/lt_toolbox). The latter is specifically designed for the analysis of Lagrangian trajectories, and can be used to compute a wide range of Lagrangian diagnostics- but is still in alpha stage of development.\n", "\n", - "In Pandas, these files can be opened with `pandas.read_parquet`:" + "In Polars, these files can be opened with `polars.read_parquet`:" ] }, { @@ -167,9 +168,9 @@ "metadata": {}, "outputs": [], "source": [ - "ds_pandas = pd.read_parquet(\"output.parquet\")\n", + "df_polars = pl.read_parquet(\"output.parquet\")\n", "\n", - "print(ds_pandas)" + "print(df_polars)" ] }, { @@ -203,9 +204,9 @@ "metadata": {}, "outputs": [], "source": [ - "ds_particles = parcels.read_particlefile(\"output.parquet\")\n", + "df_particles = parcels.read_particlefile(\"output.parquet\")\n", "\n", - "print(ds_particles)" + "print(df_particles)" ] }, { @@ -224,9 +225,7 @@ "metadata": {}, "outputs": [], "source": [ - "# np.set_printoptions(linewidth=160)\n", - "\n", - "for traj, g in ds_particles.groupby(\"trajectory\"):\n", + "for traj, g in df_particles.groupby(\"trajectory\"):\n", " time_in_hour = (g[\"time\"] - fieldset.time_interval.left) / np.timedelta64(1, \"h\")\n", " print(f\"trajectory {traj}: \" + \"\".join(f\"{int(t):2d} \" for t in time_in_hour))" ] @@ -262,7 +261,7 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(5, 3))\n", - "ax.plot(ds_particles[\"lon\"].values, ds_particles[\"lat\"].values, \".-\")\n", + "ax.plot(df_particles[\"lon\"].values, df_particles[\"lat\"].values, \".-\")\n", "plt.show()" ] }, @@ -280,7 +279,7 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(5, 3))\n", - "for traj, g in ds_particles.groupby(\"trajectory\"):\n", + "for traj, g in df_particles.groupby(\"trajectory\"):\n", " ax.plot(g[\"lon\"].values, g[\"lat\"].values, \".-\", label=f\"P{traj}\")\n", "ax.legend(loc=\"center left\", bbox_to_anchor=(1.02, 0.5), borderaxespad=0.0)\n", "plt.tight_layout()\n", @@ -302,7 +301,7 @@ "source": [ "time_step = np.timedelta64(18, \"h\")\n", "time_to_plot = fieldset.time_interval.left + time_step\n", - "g = ds_particles[ds_particles[\"time\"] == time_to_plot]\n", + "g = df_particles[df_particles[\"time\"] == time_to_plot]\n", "\n", "fig, ax = plt.subplots(figsize=(5, 3))\n", "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", @@ -325,10 +324,10 @@ "outputs": [], "source": [ "time_step = np.timedelta64(18, \"h\")\n", - "release_time = ds_particles[\"time\"] - ds_particles.groupby(\"trajectory\")[\n", + "release_time = df_particles[\"time\"] - df_particles.groupby(\"trajectory\")[\n", " \"time\"\n", "].transform(\"min\")\n", - "g = ds_particles[release_time == time_step]\n", + "g = df_particles[release_time == time_step]\n", "\n", "fig, ax = plt.subplots(figsize=(5, 3))\n", "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", @@ -351,7 +350,7 @@ "source": [ "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n", "\n", - "for traj, 
g in ds_particles.groupby(\"trajectory\"):\n", + "for traj, g in df_particles.groupby(\"trajectory\"):\n", " distance = np.sqrt(\n", " (g[\"lon\"] - g[\"lon\"].values[0]) ** 2 + (g[\"lat\"] - g[\"lat\"].values[0]) ** 2\n", " )\n", @@ -409,17 +408,17 @@ "time_step = np.timedelta64(2, \"h\") # time step for animation frames\n", "\n", "timerange = np.arange(\n", - " np.nanmin(ds_particles[\"time\"]),\n", - " np.nanmax(ds_particles[\"time\"]) + time_step,\n", + " np.nanmin(df_particles[\"time\"]),\n", + " np.nanmax(df_particles[\"time\"]) + time_step,\n", " time_step,\n", ")\n", "\n", "# set up a unique color for each trajectory\n", "colormap = matplotlib.colormaps[\"tab20b\"]\n", "trajectory_to_color = {}\n", - "for i, trajectory in enumerate(ds_particles[\"trajectory\"].unique()):\n", + "for i, trajectory in enumerate(df_particles[\"trajectory\"].unique()):\n", " trajectory_to_color[trajectory] = colormap(\n", - " i / max(len(ds_particles[\"trajectory\"].unique()) - 1, 1)\n", + " i / max(len(df_particles[\"trajectory\"].unique()) - 1, 1)\n", " )\n", "\n", "# figure setup\n", @@ -435,7 +434,7 @@ "ax.add_feature(cfeature.LAND)\n", "\n", "# --> plot first timestep\n", - "g = ds_particles[ds_particles[\"time\"] == timerange[0]]\n", + "g = df_particles[df_particles[\"time\"] == timerange[0]]\n", "scatter = ax.scatter(\n", " g[\"lon\"], g[\"lat\"], s=10, c=[trajectory_to_color[rt] for rt in g[\"trajectory\"]]\n", ")\n", @@ -456,7 +455,7 @@ " title.set_text(f\"Particles at t = {t_str}\")\n", "\n", " # Find particles at current time\n", - " trajs_at_timestep = ds_particles[ds_particles[\"time\"] == timerange[i]]\n", + " trajs_at_timestep = df_particles[df_particles[\"time\"] == timerange[i]]\n", "\n", " if len(trajs_at_timestep) > 0:\n", " scatter.set_offsets(np.c_[trajs_at_timestep[\"lon\"], trajs_at_timestep[\"lat\"]])\n", @@ -471,10 +470,10 @@ " trail_length = min(10, i) # trails will have max length of 10 time steps\n", " if trail_length > 0:\n", " for traj in trajs_at_timestep[\"trajectory\"].unique():\n", - " traj_trail = ds_particles[\n", - " (ds_particles[\"trajectory\"] == traj)\n", - " & (ds_particles[\"time\"] >= timerange[max(0, i - trail_length)])\n", - " & (ds_particles[\"time\"] <= timerange[i])\n", + " traj_trail = df_particles[\n", + " (df_particles[\"trajectory\"] == traj)\n", + " & (df_particles[\"time\"] >= timerange[max(0, i - trail_length)])\n", + " & (df_particles[\"time\"] <= timerange[i])\n", " ]\n", " if len(traj_trail) > 1:\n", " (trail,) = ax.plot(\n", diff --git a/pixi.toml b/pixi.toml index 286d7f28c..8f49eada1 100644 --- a/pixi.toml +++ b/pixi.toml @@ -87,6 +87,7 @@ trajan = "*" matplotlib-base = ">=2.0.2" gsw = "*" py-triangle = "*" +polars = "*" [feature.devtools.dependencies] pdbpp = "*" From b986c8182c9b6ac6e2c5c91be3e74b0ebf9dee97 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 24 Apr 2026 13:56:14 +0200 Subject: [PATCH 42/69] Update explanation_concepts.md --- docs/getting_started/explanation_concepts.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started/explanation_concepts.md b/docs/getting_started/explanation_concepts.md index d8b285039..0283299ff 100644 --- a/docs/getting_started/explanation_concepts.md +++ b/docs/getting_started/explanation_concepts.md @@ -186,7 +186,7 @@ pset.execute(kernels=kernels, dt=dt, runtime=runtime) ### Output -To analyse the particle data generated in the simulation, we need to define a `parcels.ParticleFile` and add it as an argument to `parcels.ParticleSet.execute()`. 
The output will be written in a [zarr format](https://zarr.readthedocs.io/en/stable/), which can be opened as an `xarray.Dataset`. The dataset will contain the particle data with at least `time`, `z`, `lat` and `lon`, for each particle at timesteps defined by the `outputdt` argument. +To analyse the particle data generated in the simulation, we need to define a `parcels.ParticleFile` and add it as an argument to `parcels.ParticleSet.execute()`. The output will be written in a [parquet format](https://parquet.apache.org/), which can be opened as a `polars.DataFrame`. The dataset will contain the particle data with at least `time`, `z`, `lat` and `lon`, for each particle at timesteps defined by the `outputdt` argument. There are many ways to analyze particle output, and although we provide [a short tutorial to get started](./tutorial_output.ipynb), we recommend writing your own analysis code and checking out [related Lagrangian analysis projects in our community page](../community/index.md#analysis-code). From 4d5d6707ea038bbe44dd6a8e0cdbedf3a021ee03 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 24 Apr 2026 14:45:46 +0200 Subject: [PATCH 43/69] update tutorial_croco to use parquet --- .../examples/tutorial_croco_3D.ipynb | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/docs/user_guide/examples/tutorial_croco_3D.ipynb b/docs/user_guide/examples/tutorial_croco_3D.ipynb index 7ea142439..c54508a58 100644 --- a/docs/user_guide/examples/tutorial_croco_3D.ipynb +++ b/docs/user_guide/examples/tutorial_croco_3D.ipynb @@ -34,7 +34,17 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\nimport numpy as np\nimport xarray as xr\n\nimport parcels\nimport parcels.tutorial\n\nds_fields = parcels.tutorial.open_dataset(\"CROCOidealized_data/data\")\n\nds_fields.load(); # Preload data to speed up access" + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import polars as pl\n", + "import xarray as xr\n", + "\n", + "import parcels\n", + "import parcels.tutorial\n", + "\n", + "ds_fields = parcels.tutorial.open_dataset(\"CROCOidealized_data/data\")\n", + "\n", + "ds_fields.load(); # Preload data to speed up access" ] }, { @@ -119,7 +129,7 @@ ")\n", "\n", "outputfile = parcels.ParticleFile(\n", - " store=\"croco_particles3D.zarr\",\n", + " path=\"croco_particles3D.parquet\",\n", " outputdt=np.timedelta64(5000, \"s\"),\n", ")\n", "\n", @@ -149,10 +159,11 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=(6, 4))\n", - "ds = xr.open_zarr(\"croco_particles3D.zarr\")\n", + "df = pl.read_parquet(\"croco_particles3D.parquet\")\n", "\n", "ax.plot(X / 1e3, Z, \"k.\", label=\"Initial positions\")\n", - "ax.plot(ds.lon.T / 1e3, ds.z.T, \".-\")\n", + "for g in df.partition_by(\"trajectory\", maintain_order=True):\n", + " ax.plot(g[\"lon\"] / 1e3, g[\"z\"], \".-\")\n", "\n", "for z in ds_fields.s_w.values:\n", " ax.plot(\n", @@ -208,7 +219,7 @@ ")\n", "\n", "outputfile = parcels.ParticleFile(\n", - " store=\"croco_particles_noW.zarr\", outputdt=np.timedelta64(5000, \"s\")\n", + " path=\"croco_particles_noW.parquet\", outputdt=np.timedelta64(5000, \"s\")\n", ")\n", "\n", "pset_noW.execute(\n", @@ -219,10 +230,11 @@ ")\n", "\n", "fig, ax = plt.subplots(1, 1, figsize=(6, 4))\n", - "ds = xr.open_zarr(\"croco_particles_noW.zarr\")\n", + "df = pl.read_parquet(\"croco_particles_noW.parquet\")\n", "\n", "ax.plot(X / 1e3, Z, \"k.\", label=\"Initial positions\")\n", - "ax.plot(ds.lon.T / 1e3, ds.z.T, \".-\")\n", + "for g in 
df.partition_by(\"trajectory\", maintain_order=True):\n", + " ax.plot(g[\"lon\"] / 1e3, g[\"z\"], \".-\")\n", "\n", "for z in ds_fields.s_w.values:\n", " ax.plot(\n", @@ -306,7 +318,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.4" } }, "nbformat": 4, From e00559500fe427471934127fe017df279b7ed8d6 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 28 Apr 2026 11:38:48 +0200 Subject: [PATCH 44/69] using polars for read_particlefile --- pixi.toml | 2 +- src/parcels/_core/particlefile.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pixi.toml b/pixi.toml index 8f49eada1..632d24100 100644 --- a/pixi.toml +++ b/pixi.toml @@ -62,6 +62,7 @@ xgcm = { version = "0.9.*", channel = "conda-forge" } cf_xarray = "0.10.*" cftime = "1.6.*" pooch = "1.8.*" +polars = "*" [feature.py311.dependencies] python = "3.11.*" @@ -87,7 +88,6 @@ trajan = "*" matplotlib-base = ">=2.0.2" gsw = "*" py-triangle = "*" -polars = "*" [feature.devtools.dependencies] pdbpp = "*" diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 90384f356..5a110f0ca 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +import polars as pl import pyarrow as pa import pyarrow.parquet as pq import xarray as xr @@ -232,14 +233,14 @@ def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame attrs = {k.decode(): v.decode() for k, v in time_field.metadata.items()} - df = pd.read_parquet(path) + df = pl.read_parquet(path) if not decode_times: return df values = table.column("time").to_numpy() var = xr.Variable(("time",), values, attrs) values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values - - df["time"] = values + values = values.astype("np.datetime64[s]") + df = df.with_columns(pl.Series("time", values)) return df From f738ca7d49386308edfae1f47b86d3c1d1d686b6 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 28 Apr 2026 11:39:22 +0200 Subject: [PATCH 45/69] Update tutorial_quickstart to use parquet --- docs/getting_started/tutorial_quickstart.md | 37 ++++++++++----------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/docs/getting_started/tutorial_quickstart.md b/docs/getting_started/tutorial_quickstart.md index cf17e4bb4..318625271 100644 --- a/docs/getting_started/tutorial_quickstart.md +++ b/docs/getting_started/tutorial_quickstart.md @@ -13,12 +13,12 @@ read more, we have a [concepts overview](./explanation_concepts.md) discussing t ## Imports -Parcels depends on `xarray`, expecting inputs in the form of [`xarray.Dataset`](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html) -and writing output files that can be read with xarray. +Parcels depends on `xarray`, expecting inputs in the form of [`xarray.Dataset`](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.html). Output files can be read with `pandas`. 
```{code-cell} import numpy as np import xarray as xr +import polars as pl import parcels import parcels.tutorial ``` @@ -123,11 +123,11 @@ Before starting the simulation, we must define where and how frequent we want to We can define this in a {py:obj}`parcels.ParticleFile` object: ```{code-cell} -output_file = parcels.ParticleFile("output-quickstart.zarr", outputdt=np.timedelta64(1, "h")) +output_file = parcels.ParticleFile("output-quickstart.parquet", outputdt=np.timedelta64(1, "h")) ``` -The output files are in `.zarr` [format](https://zarr.readthedocs.io/en/stable/), which can be read by `xarray`. -See the [Parcels output tutorial](./tutorial_output.ipynb) for more information on the zarr format. We want to choose +The output files are in `.parquet` [format](https://parquet.apache.org/), which can be read by `polars`. +See the [Parcels output tutorial](./tutorial_output.ipynb) for more information on the parquet format. We want to choose the `outputdt` argument so that it captures the smallest timescales of our interest. ## Run Simulation: `ParticleSet.execute()` @@ -155,13 +155,12 @@ pset.execute( To start analyzing the trajectories computed by **Parcels**, we can open the `ParticleFile` using `xarray`: ```{code-cell} -ds_particles = xr.open_zarr("output-quickstart.zarr") -ds_particles +df_particles = pl.read_parquet("output-quickstart.parquet") +df_particles ``` -The 10 particle trajectories are stored along the `trajectory` dimension, and each trajectory contains 25 observations -(initial values + 24 hourly timesteps) along the `obs` dimension. The [working with Parcels output tutorial](./tutorial_output.ipynb) -provides more detail about the dataset and how to analyse it. +The file contains 250 rows: 25 observations for the 10 particle trajectories. +The [working with Parcels output tutorial](./tutorial_output.ipynb) provides more detail about the dataset and how to analyse it. Let's verify that Parcels has computed the advection of the virtual particles! @@ -169,9 +168,9 @@ Let's verify that Parcels has computed the advection of the virtual particles! import matplotlib.pyplot as plt # plot positions and color particles by number of observation -scatter = plt.scatter(ds_particles.lon.T, ds_particles.lat.T, c=np.repeat(ds_particles.obs.values,npart)) -plt.scatter(ds_particles.lon[:,0],ds_particles.lat[:,0],facecolors="none",edgecolors='r') # starting positions -plt.scatter(lon,lat,facecolors="none",edgecolors='r') # starting positions +scatter = plt.scatter(df_particles['lon'], df_particles['lat'], c=np.repeat(df_particles['obs'].values, npart)) +plt.scatter(df_particles['lon'][:npart], df_particles['lat'][:npart], facecolors="none", edgecolors='r') # starting positions +plt.scatter(lon, lat, facecolors="none", edgecolors='r') # starting positions plt.xlim(31,33) plt.ylabel("Latitude [deg N]") plt.ylim(-33,-30) @@ -196,7 +195,7 @@ location! ```{code-cell} :tags: [hide-output] # set up output file -output_file = parcels.ParticleFile("output-backwards.zarr", outputdt=np.timedelta64(1, "h")) +output_file = parcels.ParticleFile("output-backwards.parquet", outputdt=np.timedelta64(1, "h")) # execute simulation in backwards time pset.execute( @@ -210,10 +209,10 @@ pset.execute( When we check the output, we can see that the particles have returned to their original position! 
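A polars-idiomatic way to verify this numerically is sketched below (an illustration rather than part of the original tutorial; it assumes rows are written in time order within each trajectory, and that the ID column is named `trajectory` — later commits rename it to `particle_id`):

```python
import numpy as np
import polars as pl

df_back = pl.read_parquet("output-backwards.parquet")
# release position per particle (forward run) vs final position per particle (backward run)
start = df_particles.group_by("trajectory", maintain_order=True).first()
final = df_back.group_by("trajectory", maintain_order=True).last()
np.testing.assert_allclose(final["lat"].to_numpy(), start["lat"].to_numpy(), atol=1e-2)
np.testing.assert_allclose(final["lon"].to_numpy(), start["lon"].to_numpy(), atol=1e-2)
```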
```{code-cell} -ds_particles_back = xr.open_zarr("output-backwards.zarr") +df_particles_back = pl.read_parquet("output-backwards.parquet") -scatter = plt.scatter(ds_particles_back.lon.T, ds_particles_back.lat.T, c=np.repeat(ds_particles_back.obs.values,npart)) -plt.scatter(ds_particles_back.lon[:,0],ds_particles_back.lat[:,0],facecolors="none",edgecolors='r') # starting positions +scatter = plt.scatter(df_particles_back['lon'], df_particles_back['lat'], c=np.repeat(df_particles_back['obs'].values, npart)) +plt.scatter(df_particles_back['lon'][:npart], df_particles_back['lat'][:npart], facecolors="none", edgecolors='r') # starting positions plt.xlabel("Longitude [deg E]") plt.xlim(31,33) plt.ylabel("Latitude [deg N]") @@ -226,6 +225,6 @@ Using Euler forward advection, the final positions are equal to the original pos ```{code-cell} # testing that final location == original location -np.testing.assert_almost_equal(ds_particles_back['lat'].values[:,-1],ds_particles['lat'].values[:,0], 2) -np.testing.assert_almost_equal(ds_particles_back['lon'].values[:,-1],ds_particles['lon'].values[:,0], 2) +np.testing.assert_almost_equal(df_particles_back['lat'].values[:,-1],df_particles['lat'].values[:,0], 2) +np.testing.assert_almost_equal(df_particles_back['lon'].values[:,-1],df_particles['lon'].values[:,0], 2) ``` From ea9fe5101514240aa2194a72cdcd20ca34734807 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 28 Apr 2026 11:40:06 +0200 Subject: [PATCH 46/69] Update tutorial_output to use polars --- docs/getting_started/tutorial_output.ipynb | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/docs/getting_started/tutorial_output.ipynb b/docs/getting_started/tutorial_output.ipynb index 307ba9dd5..a6ee59e17 100644 --- a/docs/getting_started/tutorial_output.ipynb +++ b/docs/getting_started/tutorial_output.ipynb @@ -195,7 +195,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you may have noticed above, the `time` is shown as a `float` (in seconds) in `ds_pandas`. That is because `pandas.read_parquet` does not automatically convert the cftime. To handle this, we also provide a helper function `parcels.read_particlefile` to read ParticleFiles, which does automatically convert the cftime. " + "As you may have noticed above, the `time` is shown as a `float` (in seconds) in `ds_pandas`. That is because `polars.read_parquet` does not automatically convert the cftime. To handle this, we also provide a helper function `parcels.read_particlefile` to read ParticleFiles, which does automatically convert the cftime. 
" ] }, { @@ -225,8 +225,10 @@ "metadata": {}, "outputs": [], "source": [ - "for traj, g in df_particles.groupby(\"trajectory\"):\n", - " time_in_hour = (g[\"time\"] - fieldset.time_interval.left) / np.timedelta64(1, \"h\")\n", + "for g in df_particles.partition_by(\"trajectory\", maintain_order=True):\n", + " time_origin = pd.Timestamp(fieldset.time_interval.left).to_pydatetime()\n", + " time_in_hour = (g[\"time\"] - time_origin).dt.total_hours()\n", + " traj = g[\"trajectory\"][0]\n", " print(f\"trajectory {traj}: \" + \"\".join(f\"{int(t):2d} \" for t in time_in_hour))" ] }, @@ -261,7 +263,7 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(5, 3))\n", - "ax.plot(df_particles[\"lon\"].values, df_particles[\"lat\"].values, \".-\")\n", + "ax.plot(df_particles[\"lon\"], df_particles[\"lat\"], \".-\")\n", "plt.show()" ] }, @@ -279,8 +281,9 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(5, 3))\n", - "for traj, g in df_particles.groupby(\"trajectory\"):\n", - " ax.plot(g[\"lon\"].values, g[\"lat\"].values, \".-\", label=f\"P{traj}\")\n", + "for g in df_particles.partition_by(\"trajectory\", maintain_order=True):\n", + " traj = g[\"trajectory\"][0]\n", + " ax.plot(g[\"lon\"], g[\"lat\"], \".-\", label=f\"P{traj}\")\n", "ax.legend(loc=\"center left\", bbox_to_anchor=(1.02, 0.5), borderaxespad=0.0)\n", "plt.tight_layout()\n", "plt.show()" From 8b380a0c48c3b4958acf839d57d04291278b2209 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 28 Apr 2026 11:45:28 +0200 Subject: [PATCH 47/69] Update v4-migration.md --- docs/user_guide/v4-migration.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/v4-migration.md b/docs/user_guide/v4-migration.md index a30da6301..0e4110834 100644 --- a/docs/user_guide/v4-migration.md +++ b/docs/user_guide/v4-migration.md @@ -35,11 +35,13 @@ Version 4 of Parcels is unreleased at the moment. The information in this migrat ## ParticleFile +- ParticleFiles output is now written in parquet format by default, instead of zarr. This means that ParticleFiles can now be read with `polars.read_parquet`. We also provide a helper function `parcels.read_particlefile` to read ParticleFiles, which automatically converts the cftime. - Particlefiles should be created by `ParticleFile(...)` instead of `pset.ParticleFile(...)` -- `ParticleFile` output is now in Parquet format - `ParticleFile` writing behaviour now errors out if there's existing output (this be being further discussed in https://github.com/Parcels-code/Parcels/issues/2593 ) - A utility to read in ParticleFile output is now available. `parcels.read_particlefile()` - "trajectory" is now called "particle_id" in the particle file output +- The `name` argument in `ParticleFile` has been replaced by `path` and can now be a string or a Path. +- The `chunks` argument in `ParticleFile` has been removed. 
## Field From b049a73412c9d95b0348efb1dcc51e4d6d246652 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 28 Apr 2026 12:34:57 +0200 Subject: [PATCH 48/69] Fix using polars in tutorial_output --- docs/getting_started/tutorial_output.ipynb | 61 +++++++++++----------- src/parcels/_core/particlefile.py | 2 +- 2 files changed, 32 insertions(+), 31 deletions(-) diff --git a/docs/getting_started/tutorial_output.ipynb b/docs/getting_started/tutorial_output.ipynb index a6ee59e17..ed3a3480f 100644 --- a/docs/getting_started/tutorial_output.ipynb +++ b/docs/getting_started/tutorial_output.ipynb @@ -195,7 +195,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As you may have noticed above, the `time` is shown as a `float` (in seconds) in `ds_pandas`. That is because `polars.read_parquet` does not automatically convert the cftime. To handle this, we also provide a helper function `parcels.read_particlefile` to read ParticleFiles, which does automatically convert the cftime. " + "As you may have noticed above, the `time` is shown as a `float64` (in seconds) in `df_polars`. That is because `polars.read_parquet` does not automatically convert the cftime. To handle this, we also provide a helper function `parcels.read_particlefile` to read ParticleFiles, which does automatically convert the cftime. " ] }, { @@ -225,11 +225,11 @@ "metadata": {}, "outputs": [], "source": [ - "for g in df_particles.partition_by(\"trajectory\", maintain_order=True):\n", + "for g in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", " time_origin = pd.Timestamp(fieldset.time_interval.left).to_pydatetime()\n", " time_in_hour = (g[\"time\"] - time_origin).dt.total_hours()\n", - " traj = g[\"trajectory\"][0]\n", - " print(f\"trajectory {traj}: \" + \"\".join(f\"{int(t):2d} \" for t in time_in_hour))" + " traj = g[\"particle_id\"][0]\n", + " print(f\"Particle {traj}: \" + \"\".join(f\"{int(t):2d} \" for t in time_in_hour))" ] }, { @@ -281,8 +281,8 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(5, 3))\n", - "for g in df_particles.partition_by(\"trajectory\", maintain_order=True):\n", - " traj = g[\"trajectory\"][0]\n", + "for g in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", + " traj = g[\"particle_id\"][0]\n", " ax.plot(g[\"lon\"], g[\"lat\"], \".-\", label=f\"P{traj}\")\n", "ax.legend(loc=\"center left\", bbox_to_anchor=(1.02, 0.5), borderaxespad=0.0)\n", "plt.tight_layout()\n", @@ -304,7 +304,7 @@ "source": [ "time_step = np.timedelta64(18, \"h\")\n", "time_to_plot = fieldset.time_interval.left + time_step\n", - "g = df_particles[df_particles[\"time\"] == time_to_plot]\n", + "g = df_particles.filter(pl.col(\"time\") == pl.lit(time_to_plot))\n", "\n", "fig, ax = plt.subplots(figsize=(5, 3))\n", "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", @@ -327,10 +327,9 @@ "outputs": [], "source": [ "time_step = np.timedelta64(18, \"h\")\n", - "release_time = df_particles[\"time\"] - df_particles.groupby(\"trajectory\")[\n", - " \"time\"\n", - "].transform(\"min\")\n", - "g = df_particles[release_time == time_step]\n", + "g = df_particles.filter(\n", + " (pl.col(\"time\") - pl.col(\"time\").min().over(\"particle_id\")) == pl.lit(time_step)\n", + ")\n", "\n", "fig, ax = plt.subplots(figsize=(5, 3))\n", "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", @@ -353,13 +352,15 @@ "source": [ "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n", "\n", - "for traj, g in df_particles.groupby(\"trajectory\"):\n", - " distance = np.sqrt(\n", - " (g[\"lon\"] - g[\"lon\"].values[0]) ** 2 + 
(g[\"lat\"] - g[\"lat\"].values[0]) ** 2\n", - " )\n", - " ax[0].plot(g[\"time\"], distance, \".-\", label=f\"P{traj}\")\n", - " rel_time = (g[\"time\"] - g[\"time\"].values[0]) / np.timedelta64(1, \"h\")\n", - " ax[1].plot(rel_time, distance, \".-\", label=f\"P{traj}\")\n", + "for g in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", + " distance = np.sqrt((g[\"lon\"] - g[\"lon\"][0]) ** 2 + (g[\"lat\"] - g[\"lat\"][0]) ** 2)\n", + " ax[0].plot(g[\"time\"], distance, \".-\", label=f\"P{g['particle_id'][0]}\")\n", + " time_step_py = time_step.astype(\n", + " \"timedelta64[us]\"\n", + " ).item() # gives a Python timedelta\n", + "\n", + " rel_time = (g[\"time\"] - pd.Timestamp(g[\"time\"][0]).to_pydatetime()).dt.total_hours()\n", + " ax[1].plot(rel_time, distance, \".-\", label=f\"P{g['particle_id'][0]}\")\n", "\n", "ax[0].set_xlabel(\"Date\")\n", "ax[0].set_ylabel(\"Distance travelled [degrees]\")\n", @@ -419,9 +420,9 @@ "# set up a unique color for each trajectory\n", "colormap = matplotlib.colormaps[\"tab20b\"]\n", "trajectory_to_color = {}\n", - "for i, trajectory in enumerate(df_particles[\"trajectory\"].unique()):\n", + "for i, trajectory in enumerate(df_particles[\"particle_id\"].unique()):\n", " trajectory_to_color[trajectory] = colormap(\n", - " i / max(len(df_particles[\"trajectory\"].unique()) - 1, 1)\n", + " i / max(len(df_particles[\"particle_id\"].unique()) - 1, 1)\n", " )\n", "\n", "# figure setup\n", @@ -437,9 +438,9 @@ "ax.add_feature(cfeature.LAND)\n", "\n", "# --> plot first timestep\n", - "g = df_particles[df_particles[\"time\"] == timerange[0]]\n", + "g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", "scatter = ax.scatter(\n", - " g[\"lon\"], g[\"lat\"], s=10, c=[trajectory_to_color[rt] for rt in g[\"trajectory\"]]\n", + " g[\"lon\"], g[\"lat\"], s=10, c=[trajectory_to_color[rt] for rt in g[\"particle_id\"]]\n", ")\n", "\n", "# --> initialize trails\n", @@ -458,12 +459,12 @@ " title.set_text(f\"Particles at t = {t_str}\")\n", "\n", " # Find particles at current time\n", - " trajs_at_timestep = df_particles[df_particles[\"time\"] == timerange[i]]\n", + " trajs_at_timestep = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", "\n", " if len(trajs_at_timestep) > 0:\n", " scatter.set_offsets(np.c_[trajs_at_timestep[\"lon\"], trajs_at_timestep[\"lat\"]])\n", " scatter.set_color(\n", - " [trajectory_to_color[traj] for traj in trajs_at_timestep[\"trajectory\"]]\n", + " [trajectory_to_color[traj] for traj in trajs_at_timestep[\"particle_id\"]]\n", " )\n", "\n", " # --> reset trails\n", @@ -472,12 +473,12 @@ " trail_plot.clear()\n", " trail_length = min(10, i) # trails will have max length of 10 time steps\n", " if trail_length > 0:\n", - " for traj in trajs_at_timestep[\"trajectory\"].unique():\n", - " traj_trail = df_particles[\n", - " (df_particles[\"trajectory\"] == traj)\n", - " & (df_particles[\"time\"] >= timerange[max(0, i - trail_length)])\n", - " & (df_particles[\"time\"] <= timerange[i])\n", - " ]\n", + " for traj in trajs_at_timestep[\"particle_id\"].unique():\n", + " traj_trail = df_particles.filter(\n", + " (pl.col(\"particle_id\") == traj)\n", + " & (pl.col(\"time\") >= pl.lit(timerange[max(0, i - trail_length)]))\n", + " & (pl.col(\"time\") <= pl.lit(timerange[i]))\n", + " )\n", " if len(traj_trail) > 1:\n", " (trail,) = ax.plot(\n", " traj_trail[\"lon\"],\n", diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 4645a3cf8..0785b0ab4 100644 --- 
a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -240,7 +240,7 @@ def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame values = table.column("time").to_numpy() var = xr.Variable(("time",), values, attrs) values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values - values = values.astype("np.datetime64[s]") + values = values.astype("datetime64[ns]") df = df.with_columns(pl.Series("time", values)) return df From 59ed170985340ec649fe538f9b97a8ae69e499dc Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 28 Apr 2026 15:03:24 +0200 Subject: [PATCH 49/69] Fixing read_parquet to use polars --- docs/getting_started/tutorial_output.ipynb | 5 +---- src/parcels/_core/particlefile.py | 8 +++++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/getting_started/tutorial_output.ipynb b/docs/getting_started/tutorial_output.ipynb index ed3a3480f..7a7eb4401 100644 --- a/docs/getting_started/tutorial_output.ipynb +++ b/docs/getting_started/tutorial_output.ipynb @@ -355,11 +355,8 @@ "for g in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", " distance = np.sqrt((g[\"lon\"] - g[\"lon\"][0]) ** 2 + (g[\"lat\"] - g[\"lat\"][0]) ** 2)\n", " ax[0].plot(g[\"time\"], distance, \".-\", label=f\"P{g['particle_id'][0]}\")\n", - " time_step_py = time_step.astype(\n", - " \"timedelta64[us]\"\n", - " ).item() # gives a Python timedelta\n", "\n", - " rel_time = (g[\"time\"] - pd.Timestamp(g[\"time\"][0]).to_pydatetime()).dt.total_hours()\n", + " rel_time = (g[\"time\"] - g[\"time\"][0]).dt.total_hours()\n", " ax[1].plot(rel_time, distance, \".-\", label=f\"P{g['particle_id'][0]}\")\n", "\n", "ax[0].set_xlabel(\"Date\")\n", diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index 0785b0ab4..a147622cc 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -239,8 +239,10 @@ def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame values = table.column("time").to_numpy() var = xr.Variable(("time",), values, attrs) - values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values - values = values.astype("datetime64[ns]") - df = df.with_columns(pl.Series("time", values)) + values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values.astype("datetime64[ns]") + if np.issubdtype(values.dtype, np.datetime64): + df = df.with_columns(pl.Series("time", values, dtype=pl.Datetime("ns"))) + elif np.issubdtype(values.dtype, np.timedelta64): + df = df.with_columns(pl.Series("time", values, dtype=pl.Duration("ns"))) return df From 3f326ceb04f34868a568eed05d8eb1174d843868 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 29 Apr 2026 10:27:35 +0200 Subject: [PATCH 50/69] Update tutorial_delaystart to use parquet --- .../examples/tutorial_delaystart.ipynb | 93 ++++++------------- 1 file changed, 29 insertions(+), 64 deletions(-) diff --git a/docs/user_guide/examples/tutorial_delaystart.ipynb b/docs/user_guide/examples/tutorial_delaystart.ipynb index 8bb3ffd95..565187415 100644 --- a/docs/user_guide/examples/tutorial_delaystart.ipynb +++ b/docs/user_guide/examples/tutorial_delaystart.ipynb @@ -26,6 +26,8 @@ "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", + "import pandas as pd\n", + "import polars as pl\n", "import xarray as xr\n", "from matplotlib.animation import FuncAnimation\n", "\n", @@ -115,7 +117,7 @@ "outputs": [], "source": [ "output_file = parcels.ParticleFile(\n", - " 
\"delayparticle_time.zarr\", outputdt=np.timedelta64(1, \"h\")\n", + " \"delayparticle_time.parquet\", outputdt=np.timedelta64(1, \"h\")\n", ")\n", "\n", "pset.execute(\n", @@ -140,7 +142,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds_particles = xr.open_zarr(\"delayparticle_time.zarr\")\n", + "df_particles = parcels.read_particlefile(\"delayparticle_time.parquet\")\n", "\n", "fig = plt.figure(figsize=(7, 5), constrained_layout=True)\n", "ax = fig.add_subplot()\n", @@ -150,27 +152,18 @@ "ax.set_xlim(31, 33)\n", "ax.set_ylim(-32, -30)\n", "\n", - "timerange = np.unique(ds_particles[\"time\"].values[np.isfinite(ds_particles[\"time\"])])\n", + "timerange = df_particles[\"time\"].unique()\n", "\n", - "# Indices of the data where time = 0\n", - "time_id = np.where(ds_particles[\"time\"] == timerange[0])\n", - "\n", - "sc = ax.scatter(\n", - " ds_particles[\"lon\"].values[time_id], ds_particles[\"lat\"].values[time_id]\n", - ")\n", - "\n", - "t = timerange[0].astype(\"datetime64[h]\")\n", - "title = ax.set_title(f\"Particles at t = {t}\")\n", + "g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "sc = ax.scatter(g[\"lon\"], g[\"lat\"])\n", + "title = ax.set_title(f\"Particles at t = {timerange[0]}\")\n", "\n", "\n", "def animate(i):\n", - " t = timerange[i].astype(\"datetime64[h]\")\n", - " title.set_text(f\"Particles at t = {t}\")\n", + " title.set_text(f\"Particles at t = {timerange[i]}\")\n", "\n", - " time_id = np.where(ds_particles[\"time\"] == timerange[i])\n", - " sc.set_offsets(\n", - " np.c_[ds_particles[\"lon\"].values[time_id], ds_particles[\"lat\"].values[time_id]]\n", - " )\n", + " g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " sc.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", "\n", "\n", "anim = FuncAnimation(fig, animate, frames=len(timerange), interval=100)\n", @@ -254,7 +247,7 @@ "outputs": [], "source": [ "output_file = parcels.ParticleFile(\n", - " \"delayparticle_releasedt.zarr\", outputdt=np.timedelta64(1, \"h\")\n", + " \"delayparticle_releasedt.parquet\", outputdt=np.timedelta64(1, \"h\")\n", ")\n", "\n", "pset.execute(\n", @@ -279,7 +272,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds_particles = xr.open_zarr(\"delayparticle_releasedt.zarr\")\n", + "df_particles = parcels.read_particlefile(\"delayparticle_releasedt.parquet\")\n", "\n", "fig = plt.figure(figsize=(7, 5), constrained_layout=True)\n", "ax = fig.add_subplot()\n", @@ -289,27 +282,18 @@ "ax.set_xlim(31, 33)\n", "ax.set_ylim(-32, -30)\n", "\n", - "timerange = np.unique(ds_particles[\"time\"].values[np.isfinite(ds_particles[\"time\"])])\n", + "timerange = df_particles[\"time\"].unique()\n", "\n", - "# Indices of the data where time = 0\n", - "time_id = np.where(ds_particles[\"time\"] == timerange[0])\n", - "\n", - "sc = ax.scatter(\n", - " ds_particles[\"lon\"].values[time_id], ds_particles[\"lat\"].values[time_id]\n", - ")\n", - "\n", - "t = timerange[0].astype(\"datetime64[h]\")\n", - "title = ax.set_title(f\"Particles at t = {t}\")\n", + "g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "sc = ax.scatter(g[\"lon\"], g[\"lat\"])\n", + "title = ax.set_title(f\"Particles at t = {timerange[0]}\")\n", "\n", "\n", "def animate(i):\n", - " t = timerange[i].astype(\"datetime64[h]\")\n", - " title.set_text(f\"Particles at t = {t}\")\n", + " title.set_text(f\"Particles at t = {timerange[i]}\")\n", "\n", - " time_id = np.where(ds_particles[\"time\"] == timerange[i])\n", - " sc.set_offsets(\n", - " 
np.c_[ds_particles[\"lon\"].values[time_id], ds_particles[\"lat\"].values[time_id]]\n", - " )\n", + " g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " sc.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", "\n", "\n", "anim = FuncAnimation(fig, animate, frames=len(timerange), interval=100)\n", @@ -326,20 +310,7 @@ "\n", "Note that, because the `outputdt` variable controls the Kernel-loop, all particles are written _at the same time_, even when they start at a non-multiple of `outputdt`.\n", "\n", - "For example, if your particles start at `time=[0, 1, 2]` and `outputdt=2`, then the times written (for `dt=1` and `endtime=4`) will be\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "outtime_expected = np.array(\n", - " [[0, 2, 4], [2, 4, np.datetime64(\"NaT\")], [2, 4, np.datetime64(\"NaT\")]],\n", - " dtype=\"timedelta64[h]\",\n", - ")\n", - "print(outtime_expected)" + "For example, if your particles start at `time=[0, 1, 2]` and `outputdt=2`, then the times written (for `dt=1` and `endtime=4`) will be `[0, 2, 2, 2, 4, 4, 4]`" ] }, { @@ -352,7 +323,7 @@ }, "outputs": [], "source": [ - "outfilepath = \"delayparticle_nonmatchingtime.zarr\"\n", + "outfilepath = \"delayparticle_nonmatchingtime.parquet\"\n", "\n", "pset = parcels.ParticleSet(\n", " fieldset=fieldset,\n", @@ -377,7 +348,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And indeed, the `time` values in the NetCDF output file are as expected\n" + "And indeed, the `time` values in the output file are as expected\n" ] }, { @@ -386,15 +357,8 @@ "metadata": {}, "outputs": [], "source": [ - "outtime_infile = (\n", - " xr.open_zarr(outfilepath).time.values[:] - ds_fields.time.values[0]\n", - ") # subtract initial time to convert from datetime64 to timedelta64\n", - "print(outtime_infile.astype(\"timedelta64[h]\"))\n", - "\n", - "assert (\n", - " outtime_expected[np.isfinite(outtime_expected)]\n", - " == outtime_infile[np.isfinite(outtime_infile)]\n", - ").all()" + "outtime_infile = parcels.read_particlefile(outfilepath)\n", + "print(outtime_infile[\"time\"])" ] }, { @@ -420,6 +384,7 @@ " time=ds_fields.time.values[0] + times,\n", " z=[0.5] * len(times),\n", " )\n", + " outfilepath = f\"delayparticle_nonmatchingtime_{times[0]}_{times[1]}.parquet\"\n", " output_file = parcels.ParticleFile(outfilepath, outputdt=np.timedelta64(2, \"h\"))\n", " pset.execute(\n", " parcels.kernels.AdvectionRK2,\n", @@ -428,8 +393,8 @@ " output_file=output_file,\n", " verbose_progress=False,\n", " )\n", - " outtime_infile = xr.open_zarr(outfilepath).time.values[:] - ds_fields.time.values[0]\n", - " print(outtime_infile.astype(\"timedelta64[h]\"))" + " outtime_infile = parcels.read_particlefile(outfilepath)\n", + " print(outtime_infile[\"time\"])" ] } ], @@ -449,7 +414,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.4" } }, "nbformat": 4, From 1a48b4483821b8e10a0b188efd5a5be62c8fa6f6 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 07:41:03 +0200 Subject: [PATCH 51/69] Update tutorial_dt_integrators to use parquet --- .../examples/tutorial_dt_integrators.ipynb | 174 ++++++++---------- 1 file changed, 76 insertions(+), 98 deletions(-) diff --git a/docs/user_guide/examples/tutorial_dt_integrators.ipynb b/docs/user_guide/examples/tutorial_dt_integrators.ipynb index bd4d93de5..ef86954a9 100644 --- a/docs/user_guide/examples/tutorial_dt_integrators.ipynb +++ 
b/docs/user_guide/examples/tutorial_dt_integrators.ipynb @@ -56,7 +56,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "import xarray as xr\n", + "import polars as pl\n", "\n", "import parcels\n", "import parcels.tutorial\n", @@ -237,14 +237,10 @@ " lon=initial_release_lons,\n", " )\n", " outputdt = dt\n", - " chunks = int(\n", - " runtime / outputdt / 2\n", - " ) # Because we will store a lot of positions, to speed up our simulation we need to chunk the output datafile\n", "\n", " pfile = parcels.ParticleFile(\n", - " store=f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.zarr\",\n", + " path=f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\",\n", " outputdt=outputdt,\n", - " chunks=(len(pset), chunks),\n", " )\n", "\n", " print(f\"Begin simulation for dt = {int(dt / np.timedelta64(1, 's'))} s\")\n", @@ -294,18 +290,21 @@ "ax = plt.axes()\n", "temperature = ds_fields.isel(time=0, depth=0).thetao.plot(cmap=\"Greys\")\n", "for j, dt in enumerate(dt_choices):\n", - " ds = xr.open_zarr(\n", - " f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.zarr\"\n", + " df = parcels.read_particlefile(\n", + " f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", - " labels = [f\"dt = {str(dt)}\"] + [None] * (ds.lon.shape[0] - 1)\n", - " ax.plot(\n", - " ds.lon.T,\n", - " ds.lat.T,\n", - " alpha=0.75,\n", - " color=plt.cm.viridis(dt_colours[j]),\n", - " label=labels,\n", - " )\n", - "ax.scatter(ds.lon[:, 0], ds.lat[:, 0], c=\"r\", marker=\"s\", label=\"starting locations\")\n", + " for i, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " ax.plot(\n", + " g[\"lon\"],\n", + " g[\"lat\"],\n", + " alpha=0.75,\n", + " color=plt.cm.viridis(dt_colours[j]),\n", + " label=f\"dt = {dt}\" if i == 0 else None,\n", + " )\n", + "df_start = df.filter(pl.col(\"time\") == df[\"time\"].min())\n", + "ax.scatter(\n", + " df_start[\"lon\"], df_start[\"lat\"], c=\"r\", marker=\"s\", label=\"starting locations\"\n", + ")\n", "ax.legend()\n", "ax.set_ylim(-32.7, -31.3)\n", "ax.set_xlim(31, 32.4)\n", @@ -335,7 +334,6 @@ " Haversine formula used, which assumes the Earth is a sphere.\n", " source: https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude\n", " \"\"\"\n", - "\n", " R = 6371.0 # approximate radius of earth in km\n", "\n", " lat1 = np.radians(lata)\n", @@ -378,45 +376,37 @@ "axs[1].set_ylim(0, 50)\n", "\n", "# set 5 minute dt as benchmark\n", - "ds_5min = xr.open_zarr(f\"output/AdvectionRK2_dt_300s.zarr\")\n", + "df_5min = parcels.read_particlefile(\"output/AdvectionRK2_dt_300s.parquet\")\n", "for i, dt in enumerate(dt_choices[:-1]):\n", - " ds = xr.open_zarr(\n", - " f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.zarr\"\n", + " df = parcels.read_particlefile(\n", + " f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", - " labels = [f\"dt = {str(dt)}\"] + [None] * (ds.lon.shape[0] - 1)\n", "\n", " # subset 5 minute data to match dt\n", - " lon_5min_sub = ds_5min.lon.where(\n", - " ds_5min.time.isin(ds.time.values).compute(), drop=True\n", - " ).values\n", - " lat_5min_sub = ds_5min.lat.where(\n", - " ds_5min.time.isin(ds.time.values).compute(), drop=True\n", - " ).values\n", - "\n", - " # remove nans\n", - " lon_valid = ds.lon.where(~np.isnan(ds.lon).compute(), drop=True).values\n", - " lat_valid = ds.lat.where(~np.isnan(ds.lat).compute(), drop=True).values\n", + " 
lon_5min_sub = df_5min.filter(pl.col(\"time\").is_in(df[\"time\"].implode()))[\"lon\"]\n", + " lat_5min_sub = df_5min.filter(pl.col(\"time\").is_in(df[\"time\"].implode()))[\"lat\"]\n", "\n", " # compute separation distance between each particle in km\n", - " dist = dist_km(lon_valid, lon_5min_sub, lat_valid, lat_5min_sub)\n", + " dist = dist_km(df[\"lon\"], lon_5min_sub, df[\"lat\"], lat_5min_sub)\n", + " df = df.with_columns(pl.Series(\"dist\", dist))\n", "\n", " # plot\n", - " time_valid = ds.time.where(~np.isnan(ds.time).compute(), drop=True)\n", - " axs[0].plot(\n", - " time_valid.T,\n", - " dist.T,\n", - " alpha=0.75,\n", - " color=plt.cm.viridis(dt_colours[i]),\n", - " label=labels,\n", - " )\n", - " axs[1].plot(\n", - " time_valid.T,\n", - " dist.T,\n", - " alpha=0.75,\n", - " color=plt.cm.viridis(dt_colours[i]),\n", - " label=labels,\n", - " )\n", - " dist_end[i] = dist[:, -1]\n", + " for j, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " axs[0].plot(\n", + " g[\"time\"],\n", + " g[\"dist\"],\n", + " alpha=0.75,\n", + " color=plt.cm.viridis(dt_colours[i]),\n", + " label=f\"dt = {dt}\" if j == 0 else None,\n", + " )\n", + " axs[1].plot(\n", + " g[\"time\"],\n", + " g[\"dist\"],\n", + " alpha=0.75,\n", + " color=plt.cm.viridis(dt_colours[i]),\n", + " label=f\"dt = {dt}\" if j == 0 else None,\n", + " )\n", + " dist_end[i] = df.filter(pl.col(\"time\") == df[\"time\"].max())[\"dist\"]\n", "axs[0].legend()\n", "axs[0].set_ylabel(\"Distance (km)\")\n", "plt.show()" @@ -445,7 +435,7 @@ " (dt / np.timedelta64(1, \"m\")).astype(int),\n", " np.mean(dist_end[i]),\n", " color=plt.cm.viridis(dt_colours[i]),\n", - " label=f\"dt = {str(dt)}\",\n", + " label=f\"dt = {dt}\",\n", " )\n", "ax[0].plot(\n", " (dt_choices[:-1] / np.timedelta64(1, \"m\")).astype(int), np.mean(dist_end, axis=1)\n", @@ -460,7 +450,7 @@ " (dt / np.timedelta64(1, \"m\")).astype(int),\n", " sim_duration[i],\n", " color=plt.cm.viridis(dt_colours[i]),\n", - " label=f\"dt = {str(dt)}\",\n", + " label=f\"dt = {dt}\",\n", " )\n", "ax[1].set_ylabel(\"Simulation Duration (s)\")\n", "ax[1].set_xlabel(\"dt (minutes)\")\n", @@ -589,14 +579,10 @@ " lon=initial_release_lons,\n", " )\n", " outputdt = dt\n", - " chunks = int(\n", - " runtime / outputdt / 2\n", - " ) # Because we will store a lot of positions, to speed up our simulation we need to chunk the output datafile\n", "\n", " pfile = parcels.ParticleFile(\n", - " store=f\"output/{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.zarr\",\n", + " path=f\"output/KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\",\n", " outputdt=outputdt,\n", - " chunks=(len(pset), chunks),\n", " )\n", "\n", " print(\n", @@ -630,22 +616,23 @@ "for i, dt in enumerate(dt_choices):\n", " m = i // 3\n", " n = i % 3\n", - " axs[m, n].set_title(f\"dt = {str(dt)}\")\n", + " axs[m, n].set_title(f\"dt = {dt}\")\n", " axs[m, n].set_xlabel(\"Longitude\")\n", " for j, advection_scheme in enumerate(advection_schemes):\n", - " ds = xr.open_zarr(\n", - " f\"output/{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.zarr\"\n", - " )\n", - " labels = [f\"{advection_scheme.__name__}\"] + [None] * (ds.lon.shape[0] - 1)\n", - " axs[m, n].plot(\n", - " ds.lon.T,\n", - " ds.lat.T,\n", - " alpha=0.75,\n", - " color=plt.cm.viridis(scheme_colours[j]),\n", - " label=labels,\n", + " df = parcels.read_particlefile(\n", + " f\"output/KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " 
)\n", + " for i, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " axs[m, n].plot(\n", + " g[\"lon\"],\n", + " g[\"lat\"],\n", + " alpha=0.75,\n", + " color=plt.cm.viridis(scheme_colours[j]),\n", + " label=f\"{advection_scheme.__name__}\" if i == 0 else None,\n", + " )\n", + " df_start = df.filter(pl.col(\"time\") == df[\"time\"].min())\n", " axs[m, n].scatter(\n", - " ds.lon[:, 0], ds.lat[:, 0], c=\"r\", marker=\"s\", label=\"starting locations\"\n", + " df_start[\"lon\"], df_start[\"lat\"], c=\"r\", marker=\"s\", label=\"starting locations\"\n", " )\n", " axs[m, n].grid()\n", "axs[-1, -1].axis(\"off\")\n", @@ -677,46 +664,37 @@ "for i, dt in enumerate(dt_choices):\n", " m = i // 3\n", " n = i % 3\n", - " axs[m, n].set_title(f\"dt = {str(dt)}\")\n", + " axs[m, n].set_title(f\"dt = {dt}\")\n", " axs[m, n].set_xlabel(\"Time\")\n", " axs[m, n].tick_params(\"x\", rotation=45)\n", " axs[m, n].set_yscale(\"log\")\n", " axs[m, n].set_ylim(1e-4, 1e1)\n", - " ds_RK4 = xr.open_zarr(\n", - " f\"output/AdvectionRK4_dt_{int(dt / np.timedelta64(1, 's'))}s.zarr\"\n", + " df_RK4 = parcels.read_particlefile(\n", + " f\"output/KernelCompare_AdvectionRK4_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", " for j, advection_scheme in enumerate(advection_schemes[:-1]):\n", - " ds = xr.open_zarr(\n", - " f\"output/{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.zarr\"\n", - " )\n", - " labels = [f\"|{advection_scheme.__name__} - AdvectionRK4|\"] + [None] * (\n", - " ds.lon.shape[0] - 1\n", + " df = parcels.read_particlefile(\n", + " f\"output/KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", "\n", - " # remove nans\n", - " lon_valid_RK4 = ds_RK4.lon.where(\n", - " ~np.isnan(ds_RK4.lon).compute(), drop=True\n", - " ).values\n", - " lat_valid_RK4 = ds_RK4.lat.where(\n", - " ~np.isnan(ds_RK4.lat).compute(), drop=True\n", - " ).values\n", - " lon_valid = ds.lon.where(~np.isnan(ds.lon).compute(), drop=True).values\n", - " lat_valid = ds.lat.where(~np.isnan(ds.lat).compute(), drop=True).values\n", - " dist = dist_km(lon_valid, lon_valid_RK4, lat_valid, lat_valid_RK4)\n", - " time_valid = ds.time.where(~np.isnan(ds.time).compute(), drop=True).values\n", - " axs[m, n].plot(\n", - " time_valid.T,\n", - " dist.T,\n", - " alpha=0.75,\n", - " color=plt.cm.viridis(scheme_colours[j]),\n", - " label=labels,\n", - " )\n", - " dist_end[j, i] = dist[:, -1]\n", + " dist = dist_km(df[\"lon\"], df_RK4[\"lon\"], df[\"lat\"], df_RK4[\"lat\"])\n", + " df = df.with_columns(pl.Series(\"dist\", dist))\n", + " for k, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " axs[m, n].plot(\n", + " g[\"time\"],\n", + " g[\"dist\"],\n", + " alpha=0.75,\n", + " color=plt.cm.viridis(scheme_colours[j]),\n", + " label=f\"|{advection_scheme.__name__} - AdvectionRK4|\"\n", + " if k == 0\n", + " else None,\n", + " )\n", + " dist_end[j, i] = df.filter(pl.col(\"time\") == df[\"time\"].max())[\"dist\"]\n", " axs[m, n].grid()\n", "axs[-1, -1].axis(\"off\")\n", "axs[0, 0].legend()\n", - "axs[0, 0].set_ylabel(\"Latitude\")\n", - "axs[1, 0].set_ylabel(\"Latitude\")\n", + "axs[0, 0].set_ylabel(\"Distance (km)\")\n", + "axs[1, 0].set_ylabel(\"Distance (km)\")\n", "plt.tight_layout()\n", "plt.show()" ] @@ -838,7 +816,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.4" } }, "nbformat": 4, From 9e3b88a601532a5e3096d9c10ee32c89a232d12d Mon Sep 17 
00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 08:45:50 +0200 Subject: [PATCH 52/69] Fixing parcels.read_particlefile for timedelta time --- src/parcels/_core/particlefile.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/parcels/_core/particlefile.py b/src/parcels/_core/particlefile.py index a147622cc..0f1a2d2c5 100644 --- a/src/parcels/_core/particlefile.py +++ b/src/parcels/_core/particlefile.py @@ -239,10 +239,12 @@ def read_particlefile(path: PathLike, decode_times: bool = True) -> pd.DataFrame values = table.column("time").to_numpy() var = xr.Variable(("time",), values, attrs) - values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values.astype("datetime64[ns]") - if np.issubdtype(values.dtype, np.datetime64): + values = xr.coders.CFDatetimeCoder(time_unit="s").decode(var).values + if "since" in attrs["units"]: + values = values.astype("datetime64[ns]") df = df.with_columns(pl.Series("time", values, dtype=pl.Datetime("ns"))) - elif np.issubdtype(values.dtype, np.timedelta64): + else: + values = values.astype("timedelta64[ns]") * 1e9 df = df.with_columns(pl.Series("time", values, dtype=pl.Duration("ns"))) return df From 90849105e76fd0f05cf0310ecd5bd886a6ffe133 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 08:47:48 +0200 Subject: [PATCH 53/69] Update tutorial_interaction to use parquet --- .../examples/tutorial_interaction.ipynb | 122 ++++++------------ 1 file changed, 37 insertions(+), 85 deletions(-) diff --git a/docs/user_guide/examples/tutorial_interaction.ipynb b/docs/user_guide/examples/tutorial_interaction.ipynb index defac279d..a7ac853ba 100644 --- a/docs/user_guide/examples/tutorial_interaction.ipynb +++ b/docs/user_guide/examples/tutorial_interaction.ipynb @@ -38,7 +38,7 @@ "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import xarray as xr\n", + "import polars as pl\n", "from matplotlib.animation import FuncAnimation\n", "\n", "import parcels\n", @@ -56,7 +56,8 @@ "source": [ "def Pull(particles, fieldset):\n", " \"\"\"Kernel that \"pulls\" all neighbour particles\n", - " toward the attracting particle with a constant velocity\"\"\"\n", + " toward the attracting particle with a constant velocity\n", + " \"\"\"\n", " interaction_distance = 0.5\n", " velocity = -0.04 # predefined attracting velocity\n", "\n", @@ -129,7 +130,7 @@ ")\n", "\n", "output_file = parcels.ParticleFile(\n", - " store=\"InteractingParticles.zarr\",\n", + " path=\"InteractingParticles.parquet\",\n", " outputdt=np.timedelta64(1, \"s\"),\n", ")\n", "\n", @@ -154,15 +155,11 @@ "metadata": {}, "outputs": [], "source": [ - "data_xarray = xr.open_zarr(\"InteractingParticles.zarr\")\n", - "data_attr = data_xarray.where(data_xarray[\"attractor\"].compute() == 1, drop=True)\n", - "data_other = data_xarray.where(data_xarray[\"attractor\"].compute() == 0, drop=True)\n", - "\n", - "timerange = np.arange(\n", - " np.nanmin(data_xarray[\"time\"].values),\n", - " np.nanmax(data_xarray[\"time\"].values),\n", - " np.timedelta64(1, \"s\"),\n", - ")\n", + "df = parcels.read_particlefile(\"InteractingParticles.parquet\")\n", + "df_attr = df.filter(pl.col(\"attractor\") == 1)\n", + "df_other = df.filter(pl.col(\"attractor\") == 0)\n", + "\n", + "timerange = df[\"time\"].unique()\n", "\n", "fig = plt.figure(figsize=(4, 4), constrained_layout=True)\n", "ax = fig.add_subplot()\n", @@ -172,28 +169,12 @@ "ax.set_xlim(-1.1, 1.1)\n", "ax.set_ylim(-1.1, 1.1)\n", "\n", - "time_id = np.where(data_other[\"time\"] == 
timerange[0])\n", - "time_id_attr = np.where(data_attr[\"time\"] == timerange[0])\n", - "\n", - "scatter = ax.scatter(\n", - " data_other[\"lon\"].values[time_id],\n", - " data_other[\"lat\"].values[time_id],\n", - " c=\"b\",\n", - " s=5,\n", - " zorder=1,\n", - ")\n", - "scatter_attr = ax.scatter(\n", - " data_attr[\"lon\"].values[time_id_attr],\n", - " data_attr[\"lat\"].values[time_id_attr],\n", - " c=\"r\",\n", - " s=40,\n", - " zorder=2,\n", - ")\n", - "\n", + "g = df_other.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "scatter = ax.scatter(g[\"lon\"], g[\"lat\"], c=\"b\", s=5, zorder=1)\n", + "g_attr = df_attr.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "scatter_attr = ax.scatter(g_attr[\"lon\"], g_attr[\"lat\"], c=\"r\", s=40, zorder=2)\n", "circs = []\n", - "for lon_a, lat_a in zip(\n", - " data_attr[\"lon\"].values[time_id_attr], data_attr[\"lat\"].values[time_id_attr]\n", - "):\n", + "for lon_a, lat_a in zip(g_attr[\"lon\"], g_attr[\"lat\"], strict=True):\n", " circs.append(\n", " ax.add_patch(\n", " plt.Circle(\n", @@ -202,30 +183,21 @@ " )\n", " )\n", "\n", - "t = str(timerange[0].astype(\"timedelta64[s]\"))\n", - "title = ax.set_title(\"Particles at t = \" + t + \" (Red particles are attractors)\")\n", + "title = ax.set_title(\n", + " f\"Particles at t = {timerange[0].total_seconds()}s\\n(Red particles are attractors)\"\n", + ")\n", "\n", "\n", "def animate(i):\n", - " t = str(timerange[i].astype(\"timedelta64[s]\"))\n", - " title.set_text(\"Particles at t = \" + t + \"\\n (Red particles are attractors)\")\n", - "\n", - " time_id = np.where(data_other[\"time\"] == timerange[i])\n", - " time_id_attr = np.where(data_attr[\"time\"] == timerange[i])\n", - " scatter.set_offsets(\n", - " np.c_[data_other[\"lon\"].values[time_id], data_other[\"lat\"].values[time_id]]\n", - " )\n", - " scatter_attr.set_offsets(\n", - " np.c_[\n", - " data_attr[\"lon\"].values[time_id_attr], data_attr[\"lat\"].values[time_id_attr]\n", - " ]\n", - " )\n", - " for c, lon_a, lat_a in zip(\n", - " circs,\n", - " data_attr[\"lon\"].values[time_id_attr],\n", - " data_attr[\"lat\"].values[time_id_attr],\n", - " ):\n", + " g = df_other.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " scatter.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", + " g_attr = df_attr.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " scatter_attr.set_offsets(np.c_[g_attr[\"lon\"], g_attr[\"lat\"]])\n", + " for c, lon_a, lat_a in zip(circs, g_attr[\"lon\"], g_attr[\"lat\"], strict=True):\n", " c.center = (lon_a, lat_a)\n", + " title.set_text(\n", + " f\"Particles at t = {timerange[i].total_seconds()}s\\n(Red particles are attractors)\"\n", + " )\n", "\n", "\n", "# Create animation\n", @@ -321,7 +293,7 @@ ")\n", "\n", "output_file = parcels.ParticleFile(\n", - " store=\"MergingParticles.zarr\",\n", + " path=\"MergingParticles.parquet\",\n", " outputdt=np.timedelta64(1, \"s\"),\n", ")\n", "\n", @@ -346,13 +318,8 @@ "metadata": {}, "outputs": [], "source": [ - "data_xarray = xr.open_zarr(\"MergingParticles.zarr\")\n", - "\n", - "timerange = np.arange(\n", - " np.nanmin(data_xarray[\"time\"].values),\n", - " np.nanmax(data_xarray[\"time\"].values),\n", - " np.timedelta64(1, \"s\"),\n", - ")\n", + "df = parcels.read_particlefile(\"MergingParticles.parquet\")\n", + "timerange = df[\"time\"].unique()\n", "\n", "fig = plt.figure(figsize=(4, 4), constrained_layout=True)\n", "ax = fig.add_subplot()\n", @@ -362,31 +329,16 @@ "ax.set_xlim(-1.1, 1.1)\n", "ax.set_ylim(-1.1, 1.1)\n", "\n", - "time_id = 
np.where(data_xarray[\"time\"] == timerange[0])\n", - "\n", - "scatter = ax.scatter(\n", - " data_xarray[\"lon\"].values[time_id],\n", - " data_xarray[\"lat\"].values[time_id],\n", - " s=data_xarray[\"mass\"].values[time_id],\n", - " c=\"b\",\n", - " zorder=1,\n", - ")\n", - "\n", - "t = str(timerange[0].astype(\"timedelta64[s]\"))\n", - "title = ax.set_title(\"Particles at t = \" + t)\n", + "g = df.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "scatter = ax.scatter(g[\"lon\"], g[\"lat\"], c=\"b\", s=g[\"mass\"], zorder=1)\n", + "title = ax.set_title(f\"Particles at t = {timerange[0].total_seconds()}s\")\n", "\n", "\n", "def animate(i):\n", - " t = str(timerange[i].astype(\"timedelta64[s]\"))\n", - " title.set_text(\"Particles at t = \" + t)\n", - "\n", - " time_id = np.where(data_xarray[\"time\"] == timerange[i])\n", - " scatter.set_offsets(\n", - " np.c_[data_xarray[\"lon\"].values[time_id], data_xarray[\"lat\"].values[time_id]]\n", - " )\n", - " scatter.set_sizes(data_xarray[\"mass\"].values[time_id])\n", - "\n", - " return (scatter,)\n", + " g = df.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " scatter.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", + " scatter.set_sizes(g[\"mass\"])\n", + " title.set_text(f\"Particles at t = {timerange[i].total_seconds()}s\")\n", "\n", "\n", "anim = FuncAnimation(fig, animate, frames=len(timerange), interval=100)\n", @@ -397,7 +349,7 @@ ], "metadata": { "kernelspec": { - "display_name": "test-latest", + "display_name": "docs", "language": "python", "name": "python3" }, @@ -411,7 +363,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.0" + "version": "3.14.4" } }, "nbformat": 4, From 3ad3f1051884f72602a7fa44ee4ce30aa1ff94b8 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 08:54:43 +0200 Subject: [PATCH 54/69] Update tutorial_manipulating_field_data to use parquet --- .../examples/tutorial_manipulating_field_data.ipynb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb b/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb index 4196ba4a5..aabc59ee7 100644 --- a/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb +++ b/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb @@ -131,7 +131,7 @@ "\n", "pset = parcels.ParticleSet(fieldset, pclass=parcels.Particle, z=z, lat=lats, lon=lons)\n", "output_file = parcels.ParticleFile(\n", - " store=\"summed_advection_wind.zarr\", outputdt=np.timedelta64(6, \"h\")\n", + " path=\"summed_advection_wind.parquet\", outputdt=np.timedelta64(6, \"h\")\n", ")\n", "pset.execute(\n", " [parcels.kernels.AdvectionRK2],\n", @@ -157,8 +157,9 @@ "outputs": [], "source": [ "# Plot the resulting particle trajectories overlapped for both cases\n", - "summed_advection_wind = xr.open_zarr(\"summed_advection_wind.zarr\")\n", - "plt.plot(summed_advection_wind.lon.T, summed_advection_wind.lat.T, \"-\")\n", + "summed_advection_wind = parcels.read_particlefile(\"summed_advection_wind.parquet\")\n", + "for g in summed_advection_wind.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.plot(g[\"lon\"], g[\"lat\"], \"-\")\n", "plt.show()" ] } @@ -179,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.4" } }, "nbformat": 4, From daad8c9ec91925a84c6989b5248a78741680ed94 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 
08:56:21 +0200 Subject: [PATCH 55/69] Update tutorial_mitgcm to use parquet --- docs/user_guide/examples/tutorial_mitgcm.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/user_guide/examples/tutorial_mitgcm.ipynb b/docs/user_guide/examples/tutorial_mitgcm.ipynb index ae46c7fce..7f48af15c 100644 --- a/docs/user_guide/examples/tutorial_mitgcm.ipynb +++ b/docs/user_guide/examples/tutorial_mitgcm.ipynb @@ -25,7 +25,6 @@ "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import xarray as xr\n", "\n", "import parcels\n", "import parcels.tutorial\n", @@ -94,7 +93,7 @@ "pset = parcels.ParticleSet(fieldset=fieldset, lon=X, lat=Y, z=Z)\n", "\n", "outputfile = parcels.ParticleFile(\n", - " store=\"mitgcm_particles.zarr\",\n", + " path=\"mitgcm_particles.parquet\",\n", " outputdt=np.timedelta64(5000, \"s\"),\n", ")\n", "\n", @@ -121,9 +120,10 @@ "metadata": {}, "outputs": [], "source": [ - "ds = xr.open_zarr(\"mitgcm_particles.zarr\")\n", + "df = parcels.read_particlefile(\"mitgcm_particles.parquet\")\n", "\n", - "plt.plot(ds.lon.T, ds.lat.T, \".-\")\n", + "for g in df.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.plot(g[\"lon\"], g[\"lat\"], \".-\")\n", "plt.show()" ] } @@ -144,7 +144,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.4" } }, "nbformat": 4, From f811b8dc6deecc46c4edcd0135c0bfbc331919d2 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 09:01:35 +0200 Subject: [PATCH 56/69] Update tutorial_nestedgrids to use parquet --- .../examples/tutorial_nestedgrids.ipynb | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/user_guide/examples/tutorial_nestedgrids.ipynb b/docs/user_guide/examples/tutorial_nestedgrids.ipynb index 86095e6b8..f06ec84c6 100644 --- a/docs/user_guide/examples/tutorial_nestedgrids.ipynb +++ b/docs/user_guide/examples/tutorial_nestedgrids.ipynb @@ -483,7 +483,7 @@ "\n", "pset = parcels.ParticleSet(fieldset, pclass=NestedGridParticle, lon=lon, lat=lat)\n", "ofile = parcels.ParticleFile(\n", - " \"nestedgrid_particles.zarr\", outputdt=np.timedelta64(1, \"D\")\n", + " \"nestedgrid_particles.parquet\", outputdt=np.timedelta64(1, \"D\")\n", ")\n", "pset.execute(\n", " AdvectEE_NestedGrids,\n", @@ -503,12 +503,15 @@ "source": [ "fig, ax = plt.subplots(1, 1, figsize=(10, 5))\n", "\n", - "ds_out = xr.open_zarr(\"nestedgrid_particles.zarr\")\n", + "df = parcels.read_particlefile(\"nestedgrid_particles.parquet\")\n", "\n", - "plt.plot(ds_out.lon.T, ds_out.lat.T, \"k\", linewidth=0.5)\n", - "sc = ax.scatter(ds_out.lon, ds_out.lat, c=ds_out.gridID, s=4, cmap=cmap, vmin=0, vmax=2)\n", - "xl, yl = ax.get_xlim(), ax.get_ylim()\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.plot(traj[\"lon\"], traj[\"lat\"], \".-\")\n", + " sc = ax.scatter(\n", + " traj[\"lon\"], traj[\"lat\"], c=traj[\"gridID\"], s=4, cmap=cmap, vmin=0, vmax=2\n", + " )\n", "\n", + "xl, yl = ax.get_xlim(), ax.get_ylim()\n", "for i in range(n_grids - 1):\n", " poly = grid_polygons[i]\n", " ax.plot(\n", @@ -524,7 +527,6 @@ "cbar = plt.colorbar(sc, ticks=[0, 1, 2], ax=ax)\n", "cbar.set_label(\"Grid ID\")\n", "ax.set_title(\"Particle advection through nested Grids\")\n", - "plt.tight_layout\n", "plt.show()" ] }, @@ -541,7 +543,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "docs", "language": "python", "name": "python3" },
@@ -555,7 +557,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.11" + "version": "3.14.4" } }, "nbformat": 4, From 4dd08b9c11695f971fefedf847754d9e017f863b Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 09:14:39 +0200 Subject: [PATCH 57/69] Update tutorial_sampling to use parquet (and remove to_write="once" section) --- .../examples/tutorial_sampling.ipynb | 115 +++--------------- 1 file changed, 14 insertions(+), 101 deletions(-) diff --git a/docs/user_guide/examples/tutorial_sampling.ipynb b/docs/user_guide/examples/tutorial_sampling.ipynb index 87855debe..590c0f5f8 100644 --- a/docs/user_guide/examples/tutorial_sampling.ipynb +++ b/docs/user_guide/examples/tutorial_sampling.ipynb @@ -38,9 +38,6 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", - "# To open and look at the temperature data\n", - "import xarray as xr\n", - "\n", "import parcels\n", "import parcels.tutorial" ] @@ -156,7 +153,7 @@ " fieldset=fieldset, pclass=SampleParticle, lon=lon, lat=lat, time=time, z=z\n", ")\n", "\n", - "output_file = parcels.ParticleFile(\"sampletemp.zarr\", outputdt=timedelta(hours=1))\n", + "output_file = parcels.ParticleFile(\"sampletemp.parquet\", outputdt=timedelta(hours=1))\n", "\n", "pset.execute(\n", " [parcels.kernels.AdvectionRK2, SampleT],\n", @@ -180,22 +177,23 @@ "metadata": {}, "outputs": [], "source": [ - "ds_particles = xr.open_zarr(\"sampletemp.zarr\")\n", + "df = parcels.read_particlefile(\"sampletemp.parquet\")\n", "\n", "plt.figure()\n", "ax = plt.axes()\n", "ax.set_ylabel(\"Latitude\")\n", "ax.set_xlabel(\"Longitude\")\n", - "ax.plot(ds_particles.lon.transpose(), ds_particles.lat.transpose(), c=\"k\", zorder=1)\n", - "T_scatter = ax.scatter(\n", - " ds_particles.lon,\n", - " ds_particles.lat,\n", - " c=ds_particles.temperature,\n", - " cmap=plt.cm.inferno,\n", - " norm=mpl.colors.Normalize(vmin=22.0, vmax=24.0),\n", - " edgecolor=\"k\",\n", - " zorder=2,\n", - ")\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", + " ax.plot(traj[\"lon\"], traj[\"lat\"], c=\"k\", zorder=1)\n", + " T_scatter = ax.scatter(\n", + " traj[\"lon\"],\n", + " traj[\"lat\"],\n", + " c=traj[\"temperature\"],\n", + " cmap=plt.cm.inferno,\n", + " norm=mpl.colors.Normalize(vmin=22.0, vmax=24.0),\n", + " edgecolor=\"k\",\n", + " zorder=2,\n", + " )\n", "plt.colorbar(T_scatter, label=r\"T [$^{\\circ} C$]\")\n", "plt.show()" ] @@ -303,91 +301,6 @@ "Note that the Kernels above return the value of `U` and `V` in the units of the grid. That means that for a spherical grid, the velocities are in **degrees/s**. To convert these to **m/s**, see the [UnitConversion tutorial](https://docs.oceanparcels.org/en/latest/examples/tutorial_unitconverters.html).\n", "" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sampling initial values\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In some simulations only the particles initial value within the field is of interest: the variable does not need to be known along the entire trajectory. To reduce computing we can specify the `to_write` argument to the temperature `Variable`. This argument can have three values: `True`, `False` or `'once'`. It determines whether to write the `Variable` to the output file. 
If we want to know only the initial value, we can enter `'once'` and only the first value will be written to the output file.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SampleParticleOnce = parcels.Particle.add_variable(\n", - " parcels.Variable(\"temperature\", initial=np.nan, to_write=\"once\")\n", - ")\n", - "\n", - "pset = parcels.ParticleSet(\n", - " fieldset=fieldset,\n", - " pclass=SampleParticleOnce,\n", - " lon=lon,\n", - " lat=lat,\n", - " time=time,\n", - " z=z,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "hide-output" - ] - }, - "outputs": [], - "source": [ - "output_file = parcels.ParticleFile(\"writeonce.zarr\", outputdt=timedelta(hours=1))\n", - "\n", - "pset.execute(\n", - " [parcels.kernels.AdvectionRK2, SampleT],\n", - " runtime=timedelta(hours=24),\n", - " dt=timedelta(minutes=5),\n", - " output_file=output_file,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can compare the output where only the initial value is written to output, with the original simulation, where the temperature at each outputdt is written:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ds_particles_once = xr.open_zarr(\"writeonce.zarr\")\n", - "\n", - "plt.figure()\n", - "ax = plt.axes()\n", - "ax.set_ylabel(\"Temperature [$^{\\\\circ}$C]\")\n", - "ax.set_xlabel(\"Observation Number (-)\")\n", - "ax.set_ylim(22.2, 24.5)\n", - "l1 = ax.plot(ds_particles.obs, ds_particles.temperature.T, color=\"red\")\n", - "l2 = ax.plot(\n", - " ds_particles_once.obs,\n", - " np.tile(ds_particles_once.temperature, (ds_particles_once.lon.shape[1], 1)),\n", - " color=\"tab:blue\",\n", - ")\n", - "ax.legend([l1[0], l2[0]], [\"Write every outputdt\", \"Write once\"])\n", - "plt.show()" - ] } ], "metadata": { @@ -407,7 +320,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.4" }, "pycharm": { "stem_cell": { From b06a051ea70bcc90b97e178508ac7a19ce07233f Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 09:21:26 +0200 Subject: [PATCH 58/69] Removing old attributes from particlefile.repr --- src/parcels/_reprs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/parcels/_reprs.py b/src/parcels/_reprs.py index d27eee379..bd8379936 100644 --- a/src/parcels/_reprs.py +++ b/src/parcels/_reprs.py @@ -129,8 +129,6 @@ def particlefile_repr(pfile: Any) -> str: out = f"""<{type(pfile).__name__}> path : {pfile.path} outputdt : {pfile.outputdt!r} - chunks : {pfile.chunks!r} - create_new_zarrfile : {pfile.create_new_zarrfile!r} metadata : {_format_list_items_multiline(pfile.metadata, level=2, with_brackets=False)} """ From aa9fbd19da4399a5ea36284ff4f65ab6d6926c18 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 09:44:46 +0200 Subject: [PATCH 59/69] Using more intuitive variable names for polars subsetting --- docs/getting_started/tutorial_output.ipynb | 53 ++++++++++--------- .../examples/tutorial_croco_3D.ipynb | 8 +-- .../examples/tutorial_delaystart.ipynb | 18 +++---- .../examples/tutorial_dt_integrators.ipynb | 28 +++++----- .../examples/tutorial_interaction.ipynb | 36 +++++++------ .../tutorial_manipulating_field_data.ipynb | 4 +- .../user_guide/examples/tutorial_mitgcm.ipynb | 4 +- 7 files changed, 79 insertions(+), 72 deletions(-) diff --git 
a/docs/getting_started/tutorial_output.ipynb b/docs/getting_started/tutorial_output.ipynb index 7a7eb4401..a5e70a42e 100644 --- a/docs/getting_started/tutorial_output.ipynb +++ b/docs/getting_started/tutorial_output.ipynb @@ -225,11 +225,11 @@ "metadata": {}, "outputs": [], "source": [ - "for g in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", + "for traj in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", " time_origin = pd.Timestamp(fieldset.time_interval.left).to_pydatetime()\n", - " time_in_hour = (g[\"time\"] - time_origin).dt.total_hours()\n", - " traj = g[\"particle_id\"][0]\n", - " print(f\"Particle {traj}: \" + \"\".join(f\"{int(t):2d} \" for t in time_in_hour))" + " time_in_hour = (traj[\"time\"] - time_origin).dt.total_hours()\n", + " traj_id = traj[\"particle_id\"][0]\n", + " print(f\"Particle {traj_id}: \" + \"\".join(f\"{int(t):2d} \" for t in time_in_hour))" ] }, { @@ -281,9 +281,9 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(5, 3))\n", - "for g in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", - " traj = g[\"particle_id\"][0]\n", - " ax.plot(g[\"lon\"], g[\"lat\"], \".-\", label=f\"P{traj}\")\n", + "for traj in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", + " traj_id = traj[\"particle_id\"][0]\n", + " ax.plot(traj[\"lon\"], traj[\"lat\"], \".-\", label=f\"P{traj_id}\")\n", "ax.legend(loc=\"center left\", bbox_to_anchor=(1.02, 0.5), borderaxespad=0.0)\n", "plt.tight_layout()\n", "plt.show()" @@ -304,10 +304,10 @@ "source": [ "time_step = np.timedelta64(18, \"h\")\n", "time_to_plot = fieldset.time_interval.left + time_step\n", - "g = df_particles.filter(pl.col(\"time\") == pl.lit(time_to_plot))\n", + "particles = df_particles.filter(pl.col(\"time\") == pl.lit(time_to_plot))\n", "\n", "fig, ax = plt.subplots(figsize=(5, 3))\n", - "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", + "ax.plot(particles[\"lon\"], particles[\"lat\"], \"o\")\n", "title_time = pd.to_datetime(time_to_plot).strftime(\"%Y-%m-%d %H:%M:%S\")\n", "ax.set_title(f\"Particle locations at {title_time}\")\n", "plt.show()" @@ -327,12 +327,12 @@ "outputs": [], "source": [ "time_step = np.timedelta64(18, \"h\")\n", - "g = df_particles.filter(\n", + "particles = df_particles.filter(\n", " (pl.col(\"time\") - pl.col(\"time\").min().over(\"particle_id\")) == pl.lit(time_step)\n", ")\n", "\n", "fig, ax = plt.subplots(figsize=(5, 3))\n", - "ax.plot(g[\"lon\"], g[\"lat\"], \"o\")\n", + "ax.plot(particles[\"lon\"], particles[\"lat\"], \"o\")\n", "ax.set_title(f\"Particle locations {time_step} after their release\")\n", "plt.show()" ] @@ -352,12 +352,14 @@ "source": [ "fig, ax = plt.subplots(1, 2, figsize=(10, 4))\n", "\n", - "for g in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", - " distance = np.sqrt((g[\"lon\"] - g[\"lon\"][0]) ** 2 + (g[\"lat\"] - g[\"lat\"][0]) ** 2)\n", - " ax[0].plot(g[\"time\"], distance, \".-\", label=f\"P{g['particle_id'][0]}\")\n", + "for traj in df_particles.partition_by(\"particle_id\", maintain_order=True):\n", + " distance = np.sqrt(\n", + " (traj[\"lon\"] - traj[\"lon\"][0]) ** 2 + (traj[\"lat\"] - traj[\"lat\"][0]) ** 2\n", + " )\n", + " ax[0].plot(traj[\"time\"], distance, \".-\", label=f\"P{traj['particle_id'][0]}\")\n", "\n", - " rel_time = (g[\"time\"] - g[\"time\"][0]).dt.total_hours()\n", - " ax[1].plot(rel_time, distance, \".-\", label=f\"P{g['particle_id'][0]}\")\n", + " rel_time = (traj[\"time\"] - traj[\"time\"][0]).dt.total_hours()\n", + " 
ax[1].plot(rel_time, distance, \".-\", label=f\"P{traj['particle_id'][0]}\")\n", "\n", "ax[0].set_xlabel(\"Date\")\n", "ax[0].set_ylabel(\"Distance travelled [degrees]\")\n", @@ -435,9 +437,12 @@ "ax.add_feature(cfeature.LAND)\n", "\n", "# --> plot first timestep\n", - "g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "particles = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", "scatter = ax.scatter(\n", - " g[\"lon\"], g[\"lat\"], s=10, c=[trajectory_to_color[rt] for rt in g[\"particle_id\"]]\n", + " particles[\"lon\"],\n", + " particles[\"lat\"],\n", + " s=10,\n", + " c=[trajectory_to_color[p] for p in particles[\"particle_id\"]],\n", ")\n", "\n", "# --> initialize trails\n", @@ -456,13 +461,11 @@ " title.set_text(f\"Particles at t = {t_str}\")\n", "\n", " # Find particles at current time\n", - " trajs_at_timestep = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " particles = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", "\n", - " if len(trajs_at_timestep) > 0:\n", - " scatter.set_offsets(np.c_[trajs_at_timestep[\"lon\"], trajs_at_timestep[\"lat\"]])\n", - " scatter.set_color(\n", - " [trajectory_to_color[traj] for traj in trajs_at_timestep[\"particle_id\"]]\n", - " )\n", + " if len(particles) > 0:\n", + " scatter.set_offsets(np.c_[particles[\"lon\"], particles[\"lat\"]])\n", + " scatter.set_color([trajectory_to_color[p] for p in particles[\"particle_id\"]])\n", "\n", " # --> reset trails\n", " for trail in trail_plot:\n", @@ -470,7 +473,7 @@ " trail_plot.clear()\n", " trail_length = min(10, i) # trails will have max length of 10 time steps\n", " if trail_length > 0:\n", - " for traj in trajs_at_timestep[\"particle_id\"].unique():\n", + " for traj in particles[\"particle_id\"].unique():\n", " traj_trail = df_particles.filter(\n", " (pl.col(\"particle_id\") == traj)\n", " & (pl.col(\"time\") >= pl.lit(timerange[max(0, i - trail_length)]))\n", diff --git a/docs/user_guide/examples/tutorial_croco_3D.ipynb b/docs/user_guide/examples/tutorial_croco_3D.ipynb index c54508a58..9e6ad1865 100644 --- a/docs/user_guide/examples/tutorial_croco_3D.ipynb +++ b/docs/user_guide/examples/tutorial_croco_3D.ipynb @@ -162,8 +162,8 @@ "df = pl.read_parquet(\"croco_particles3D.parquet\")\n", "\n", "ax.plot(X / 1e3, Z, \"k.\", label=\"Initial positions\")\n", - "for g in df.partition_by(\"trajectory\", maintain_order=True):\n", - " ax.plot(g[\"lon\"] / 1e3, g[\"z\"], \".-\")\n", + "for traj in df.partition_by(\"trajectory\", maintain_order=True):\n", + " ax.plot(traj[\"lon\"] / 1e3, traj[\"z\"], \".-\")\n", "\n", "for z in ds_fields.s_w.values:\n", " ax.plot(\n", @@ -233,8 +233,8 @@ "df = pl.read_parquet(\"croco_particles_noW.parquet\")\n", "\n", "ax.plot(X / 1e3, Z, \"k.\", label=\"Initial positions\")\n", - "for g in df.partition_by(\"trajectory\", maintain_order=True):\n", - " ax.plot(g[\"lon\"] / 1e3, g[\"z\"], \".-\")\n", + "for traj in df.partition_by(\"trajectory\", maintain_order=True):\n", + " ax.plot(traj[\"lon\"] / 1e3, traj[\"z\"], \".-\")\n", "\n", "for z in ds_fields.s_w.values:\n", " ax.plot(\n", diff --git a/docs/user_guide/examples/tutorial_delaystart.ipynb b/docs/user_guide/examples/tutorial_delaystart.ipynb index 565187415..7678e2a17 100644 --- a/docs/user_guide/examples/tutorial_delaystart.ipynb +++ b/docs/user_guide/examples/tutorial_delaystart.ipynb @@ -154,17 +154,16 @@ "\n", "timerange = df_particles[\"time\"].unique()\n", "\n", - "g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", - "sc 
= ax.scatter(g[\"lon\"], g[\"lat\"])\n", + "particles = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "sc = ax.scatter(particles[\"lon\"], particles[\"lat\"])\n", "title = ax.set_title(f\"Particles at t = {timerange[0]}\")\n", "\n", "\n", "def animate(i):\n", + " particles = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " sc.set_offsets(np.c_[particles[\"lon\"], particles[\"lat\"]])\n", " title.set_text(f\"Particles at t = {timerange[i]}\")\n", "\n", - " g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", - " sc.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", - "\n", "\n", "anim = FuncAnimation(fig, animate, frames=len(timerange), interval=100)\n", "plt.close()\n", @@ -284,17 +283,16 @@ "\n", "timerange = df_particles[\"time\"].unique()\n", "\n", - "g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", - "sc = ax.scatter(g[\"lon\"], g[\"lat\"])\n", + "particles = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "sc = ax.scatter(particles[\"lon\"], particles[\"lat\"])\n", "title = ax.set_title(f\"Particles at t = {timerange[0]}\")\n", "\n", "\n", "def animate(i):\n", + " particles = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " sc.set_offsets(np.c_[particles[\"lon\"], particles[\"lat\"]])\n", " title.set_text(f\"Particles at t = {timerange[i]}\")\n", "\n", - " g = df_particles.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", - " sc.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", - "\n", "\n", "anim = FuncAnimation(fig, animate, frames=len(timerange), interval=100)\n", "plt.close(fig)\n", diff --git a/docs/user_guide/examples/tutorial_dt_integrators.ipynb b/docs/user_guide/examples/tutorial_dt_integrators.ipynb index ef86954a9..3dee169a4 100644 --- a/docs/user_guide/examples/tutorial_dt_integrators.ipynb +++ b/docs/user_guide/examples/tutorial_dt_integrators.ipynb @@ -293,10 +293,10 @@ " df = parcels.read_particlefile(\n", " f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", - " for i, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " for i, traj in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", " ax.plot(\n", - " g[\"lon\"],\n", - " g[\"lat\"],\n", + " traj[\"lon\"],\n", + " traj[\"lat\"],\n", " alpha=0.75,\n", " color=plt.cm.viridis(dt_colours[j]),\n", " label=f\"dt = {dt}\" if i == 0 else None,\n", @@ -391,17 +391,17 @@ " df = df.with_columns(pl.Series(\"dist\", dist))\n", "\n", " # plot\n", - " for j, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " for j, traj in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", " axs[0].plot(\n", - " g[\"time\"],\n", - " g[\"dist\"],\n", + " traj[\"time\"],\n", + " traj[\"dist\"],\n", " alpha=0.75,\n", " color=plt.cm.viridis(dt_colours[i]),\n", " label=f\"dt = {dt}\" if j == 0 else None,\n", " )\n", " axs[1].plot(\n", - " g[\"time\"],\n", - " g[\"dist\"],\n", + " traj[\"time\"],\n", + " traj[\"dist\"],\n", " alpha=0.75,\n", " color=plt.cm.viridis(dt_colours[i]),\n", " label=f\"dt = {dt}\" if j == 0 else None,\n", @@ -622,10 +622,10 @@ " df = parcels.read_particlefile(\n", " f\"output/KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", - " for i, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " for i, traj in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", " axs[m, n].plot(\n", - " g[\"lon\"],\n", - 
" g[\"lat\"],\n", + " traj[\"lon\"],\n", + " traj[\"lat\"],\n", " alpha=0.75,\n", " color=plt.cm.viridis(scheme_colours[j]),\n", " label=f\"{advection_scheme.__name__}\" if i == 0 else None,\n", @@ -679,10 +679,10 @@ "\n", " dist = dist_km(df[\"lon\"], df_RK4[\"lon\"], df[\"lat\"], df_RK4[\"lat\"])\n", " df = df.with_columns(pl.Series(\"dist\", dist))\n", - " for k, g in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", + " for k, traj in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", " axs[m, n].plot(\n", - " g[\"time\"],\n", - " g[\"dist\"],\n", + " traj[\"time\"],\n", + " traj[\"dist\"],\n", " alpha=0.75,\n", " color=plt.cm.viridis(scheme_colours[j]),\n", " label=f\"|{advection_scheme.__name__} - AdvectionRK4|\"\n", diff --git a/docs/user_guide/examples/tutorial_interaction.ipynb b/docs/user_guide/examples/tutorial_interaction.ipynb index a7ac853ba..5da9bbbe8 100644 --- a/docs/user_guide/examples/tutorial_interaction.ipynb +++ b/docs/user_guide/examples/tutorial_interaction.ipynb @@ -169,12 +169,14 @@ "ax.set_xlim(-1.1, 1.1)\n", "ax.set_ylim(-1.1, 1.1)\n", "\n", - "g = df_other.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", - "scatter = ax.scatter(g[\"lon\"], g[\"lat\"], c=\"b\", s=5, zorder=1)\n", - "g_attr = df_attr.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", - "scatter_attr = ax.scatter(g_attr[\"lon\"], g_attr[\"lat\"], c=\"r\", s=40, zorder=2)\n", + "particles = df_other.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "scatter = ax.scatter(particles[\"lon\"], particles[\"lat\"], c=\"b\", s=5, zorder=1)\n", + "particles_attr = df_attr.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "scatter_attr = ax.scatter(\n", + " particles_attr[\"lon\"], particles_attr[\"lat\"], c=\"r\", s=40, zorder=2\n", + ")\n", "circs = []\n", - "for lon_a, lat_a in zip(g_attr[\"lon\"], g_attr[\"lat\"], strict=True):\n", + "for lon_a, lat_a in zip(particles_attr[\"lon\"], particles_attr[\"lat\"], strict=True):\n", " circs.append(\n", " ax.add_patch(\n", " plt.Circle(\n", @@ -189,11 +191,13 @@ "\n", "\n", "def animate(i):\n", - " g = df_other.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", - " scatter.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", - " g_attr = df_attr.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", - " scatter_attr.set_offsets(np.c_[g_attr[\"lon\"], g_attr[\"lat\"]])\n", - " for c, lon_a, lat_a in zip(circs, g_attr[\"lon\"], g_attr[\"lat\"], strict=True):\n", + " particles = df_other.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " scatter.set_offsets(np.c_[particles[\"lon\"], particles[\"lat\"]])\n", + " particles_attr = df_attr.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " scatter_attr.set_offsets(np.c_[particles_attr[\"lon\"], particles_attr[\"lat\"]])\n", + " for c, lon_a, lat_a in zip(\n", + " circs, particles_attr[\"lon\"], particles_attr[\"lat\"], strict=True\n", + " ):\n", " c.center = (lon_a, lat_a)\n", " title.set_text(\n", " f\"Particles at t = {timerange[i].total_seconds()}s\\n(Red particles are attractors)\"\n", @@ -329,15 +333,17 @@ "ax.set_xlim(-1.1, 1.1)\n", "ax.set_ylim(-1.1, 1.1)\n", "\n", - "g = df.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", - "scatter = ax.scatter(g[\"lon\"], g[\"lat\"], c=\"b\", s=g[\"mass\"], zorder=1)\n", + "particles = df.filter(pl.col(\"time\") == pl.lit(timerange[0]))\n", + "scatter = ax.scatter(\n", + " particles[\"lon\"], particles[\"lat\"], c=\"b\", s=particles[\"mass\"], zorder=1\n", + ")\n", "title = ax.set_title(f\"Particles at t = 
{timerange[0].total_seconds()}s\")\n", "\n", "\n", "def animate(i):\n", - " g = df.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", - " scatter.set_offsets(np.c_[g[\"lon\"], g[\"lat\"]])\n", - " scatter.set_sizes(g[\"mass\"])\n", + " particles = df.filter(pl.col(\"time\") == pl.lit(timerange[i]))\n", + " scatter.set_offsets(np.c_[particles[\"lon\"], particles[\"lat\"]])\n", + " scatter.set_sizes(particles[\"mass\"])\n", " title.set_text(f\"Particles at t = {timerange[i].total_seconds()}s\")\n", "\n", "\n", diff --git a/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb b/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb index aabc59ee7..cf02db372 100644 --- a/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb +++ b/docs/user_guide/examples/tutorial_manipulating_field_data.ipynb @@ -158,8 +158,8 @@ "source": [ "# Plot the resulting particle trajectories overlapped for both cases\n", "summed_advection_wind = parcels.read_particlefile(\"summed_advection_wind.parquet\")\n", - "for g in summed_advection_wind.partition_by(\"particle_id\", maintain_order=True):\n", - " plt.plot(g[\"lon\"], g[\"lat\"], \"-\")\n", + "for traj in summed_advection_wind.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.plot(traj[\"lon\"], traj[\"lat\"], \"-\")\n", "plt.show()" ] } diff --git a/docs/user_guide/examples/tutorial_mitgcm.ipynb b/docs/user_guide/examples/tutorial_mitgcm.ipynb index 7f48af15c..92258c6ca 100644 --- a/docs/user_guide/examples/tutorial_mitgcm.ipynb +++ b/docs/user_guide/examples/tutorial_mitgcm.ipynb @@ -122,8 +122,8 @@ "source": [ "df = parcels.read_particlefile(\"mitgcm_particles.parquet\")\n", "\n", - "for g in df.partition_by(\"particle_id\", maintain_order=True):\n", - " plt.plot(g[\"lon\"], g[\"lat\"], \".-\")\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.plot(traj[\"lon\"], traj[\"lat\"], \".-\")\n", "plt.show()" ] } From b57e78f2b4fb930a851fb54aaedd1203a71f3c33 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 10:45:30 +0200 Subject: [PATCH 60/69] Fixing repr of particleset --- src/parcels/_reprs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parcels/_reprs.py b/src/parcels/_reprs.py index bd8379936..e87d4dc4c 100644 --- a/src/parcels/_reprs.py +++ b/src/parcels/_reprs.py @@ -97,7 +97,7 @@ def particleset_repr(pset: ParticleSet) -> str: def particlesetview_repr(pview: Any) -> str: """Return a pretty repr for ParticleSetView""" time_string = "not_yet_set" if pview.time is None or np.isnan(pview.time) else f"{pview.time:f}" - out = f"P[{pview.trajectory}]: time={time_string}, z={pview.z:f}, lat={pview.lat:f}, lon={pview.lon:f}" + out = f"P[{pview.particle_id}]: time={time_string}, z={pview.z:f}, lat={pview.lat:f}, lon={pview.lon:f}" vars = [v.name for v in pview._ptype.variables if v.to_write is True and v.name not in ["lon", "lat", "z", "time"]] for var in vars: out += f", {var}={getattr(pview, var):f}" From 9aa14596fb0d3a674e4d30bd7f53a8ee3f2502a7 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 10:50:49 +0200 Subject: [PATCH 61/69] Update tutorial_Argofloats.ipynb --- .../examples/tutorial_Argofloats.ipynb | 29 +++++-------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/docs/user_guide/examples/tutorial_Argofloats.ipynb b/docs/user_guide/examples/tutorial_Argofloats.ipynb index 1e6abb9a9..6704bf262 100644 --- a/docs/user_guide/examples/tutorial_Argofloats.ipynb +++ 
b/docs/user_guide/examples/tutorial_Argofloats.ipynb @@ -182,21 +182,6 @@ "First plot the depth as a function of time, with the temperature as color (only on the upcast)." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = parcels.read_particlefile(\"argo_float.parquet\")\n", - "\n", - "x = df[\"lon\"].values\n", - "y = df[\"lat\"].values\n", - "z = df[\"z\"].values\n", - "time = df[\"time\"].values\n", - "temp = df[\"temp\"].values" - ] - }, { "cell_type": "code", "execution_count": null, @@ -205,10 +190,12 @@ "source": [ "import matplotlib.pyplot as plt\n", "\n", + "df = parcels.read_particlefile(\"argo_float.parquet\")\n", + "\n", "fig = plt.figure(figsize=(13, 6))\n", "ax = plt.axes()\n", - "ax.plot(time, z, color=\"gray\")\n", - "cb = ax.scatter(time, z, c=temp, s=20, marker=\"o\", zorder=2)\n", + "ax.plot(df[\"time\"], df[\"z\"], color=\"gray\")\n", + "cb = ax.scatter(df[\"time\"], df[\"z\"], c=df[\"temp\"], s=20, marker=\"o\", zorder=2)\n", "ax.set_xlabel(\"Time [days]\")\n", "ax.set_ylabel(\"Depth (m)\")\n", "ax.invert_yaxis()\n", @@ -234,12 +221,12 @@ "fig = plt.figure(figsize=(13, 8))\n", "ax = plt.axes(projection=\"3d\")\n", "ax.view_init(azim=-145)\n", - "ax.plot3D(x, y, z, color=\"gray\")\n", - "cb = ax.scatter(x, y, z, c=temp, s=20, marker=\"o\", zorder=2)\n", + "ax.plot3D(df[\"lon\"], df[\"lat\"], df[\"z\"], color=\"gray\")\n", + "cb = ax.scatter(df[\"lon\"], df[\"lat\"], df[\"z\"], c=df[\"temp\"], s=20, marker=\"o\", zorder=2)\n", "ax.set_xlabel(\"Longitude\")\n", "ax.set_ylabel(\"Latitude\")\n", "ax.set_zlabel(\"Depth (m)\")\n", - "ax.set_zlim(np.max(z), 0)\n", + "ax.set_zlim(df[\"z\"].max(), 0)\n", "fig.colorbar(cb, label=\"Temperature (°C)\")\n", "plt.show()" ] @@ -261,7 +248,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.14.2" + "version": "3.14.4" } }, "nbformat": 4, From d521ad37f8f2ae1430feaa5cce7e35b41ff72d50 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 10:51:44 +0200 Subject: [PATCH 62/69] Update tutorial_quickstart to use parquet --- docs/getting_started/tutorial_quickstart.md | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/docs/getting_started/tutorial_quickstart.md b/docs/getting_started/tutorial_quickstart.md index 318625271..5f96a5791 100644 --- a/docs/getting_started/tutorial_quickstart.md +++ b/docs/getting_started/tutorial_quickstart.md @@ -155,8 +155,8 @@ pset.execute( -To start analyzing the trajectories computed by **Parcels**, we can open the `ParticleFile` using `xarray`: +To start analyzing the trajectories computed by **Parcels**, we can open the `ParticleFile` using `parcels.read_particlefile`: ```{code-cell} -df_particles = pl.read_parquet("output-quickstart.parquet") -df_particles +df = parcels.read_particlefile("output-quickstart.parquet") +df ``` The file contains 250 rows: 25 observations for the 10 particle trajectories. @@ -167,9 +167,9 @@ Let's verify that Parcels has computed the advection of the virtual particles! 
```{code-cell} import matplotlib.pyplot as plt -# plot positions and color particles by number of observation -scatter = plt.scatter(df_particles['lon'], df_particles['lat'], c=np.repeat(df_particles['obs'].values, npart)) -plt.scatter(df_particles['lon'][:npart], df_particles['lat'][:npart], facecolors="none", edgecolors='r') # starting positions +# plot positions and color particles by time +scatter = plt.scatter(df['lon'], df['lat'], c=df['time']) +plt.scatter(df['lon'][:npart], df['lat'][:npart], facecolors="none", edgecolors='r') # starting positions plt.scatter(lon, lat, facecolors="none", edgecolors='r') # starting positions plt.xlim(31,33) plt.ylabel("Latitude [deg N]") @@ -209,10 +209,11 @@ pset.execute( When we check the output, we can see that the particles have returned to their original position! ```{code-cell} -df_particles_back = pl.read_parquet("output-backwards.parquet") +df_back = parcels.read_particlefile("output-backwards.parquet") -scatter = plt.scatter(df_particles_back['lon'], df_particles_back['lat'], c=np.repeat(df_particles_back['obs'].values, npart)) -plt.scatter(df_particles_back['lon'][:npart], df_particles_back['lat'][:npart], facecolors="none", edgecolors='r') # starting positions +scatter = plt.scatter(df_back['lon'], df_back['lat'], c=df_back['time']) +particles_at_start = df_back.filter(pl.col("time") == df_back["time"].min()) +plt.scatter(particles_at_start['lon'], particles_at_start['lat'], facecolors="none", edgecolors='r') # starting positions plt.xlabel("Longitude [deg E]") plt.xlim(31,33) plt.ylabel("Latitude [deg N]") @@ -225,6 +226,6 @@ Using Euler forward advection, the final positions are equal to the original pos ```{code-cell} # testing that final location == original location -np.testing.assert_almost_equal(df_particles_back['lat'].values[:,-1],df_particles['lat'].values[:,0], 2) -np.testing.assert_almost_equal(df_particles_back['lon'].values[:,-1],df_particles['lon'].values[:,0], 2) +np.testing.assert_almost_equal(particles_at_start["lat"], lat, 2) +np.testing.assert_almost_equal(particles_at_start['lon'], lon, 2) ``` From 3d0c55dfb0a09cd36d1569aecc84008fd82c9bef Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 10:53:44 +0200 Subject: [PATCH 63/69] Update tutorial_croco_3D.ipynb --- docs/user_guide/examples/tutorial_croco_3D.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/examples/tutorial_croco_3D.ipynb b/docs/user_guide/examples/tutorial_croco_3D.ipynb index 9e6ad1865..50917272e 100644 --- a/docs/user_guide/examples/tutorial_croco_3D.ipynb +++ b/docs/user_guide/examples/tutorial_croco_3D.ipynb @@ -162,7 +162,7 @@ "df = pl.read_parquet(\"croco_particles3D.parquet\")\n", "\n", "ax.plot(X / 1e3, Z, \"k.\", label=\"Initial positions\")\n", - "for traj in df.partition_by(\"trajectory\", maintain_order=True):\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", " ax.plot(traj[\"lon\"] / 1e3, traj[\"z\"], \".-\")\n", "\n", "for z in ds_fields.s_w.values:\n", @@ -233,7 +233,7 @@ "df = pl.read_parquet(\"croco_particles_noW.parquet\")\n", "\n", "ax.plot(X / 1e3, Z, \"k.\", label=\"Initial positions\")\n", - "for traj in df.partition_by(\"trajectory\", maintain_order=True):\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", " ax.plot(traj[\"lon\"] / 1e3, traj[\"z\"], \".-\")\n", "\n", "for z in ds_fields.s_w.values:\n", From 580d5fb03f55cc6e0c403f2e21addb5a33fd4aff Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 
11:49:12 +0200 Subject: [PATCH 64/69] Using polars in tutorial_diffusion --- .../examples/tutorial_diffusion.ipynb | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/user_guide/examples/tutorial_diffusion.ipynb b/docs/user_guide/examples/tutorial_diffusion.ipynb index b05dd7166..8541433d5 100644 --- a/docs/user_guide/examples/tutorial_diffusion.ipynb +++ b/docs/user_guide/examples/tutorial_diffusion.ipynb @@ -289,14 +289,14 @@ "fig.set_figwidth(12)\n", "\n", "x = np.arange(0, 0.3, 0.001)\n", - "for _, g in M1_out.groupby(\"trajectory\"):\n", - " ax[0].plot(x, g.lat[: len(x)], alpha=0.3)\n", + "for traj in M1_out.partition_by(\"particle_id\", maintain_order=True):\n", + " ax[0].plot(x, traj[\"lat\"][: len(x)], alpha=0.3)\n", "ax[0].scatter(0, 0.75, s=20, c=\"r\", zorder=3)\n", "ax[0].set_ylabel(\"y\")\n", "ax[0].set_ylim(0, 1)\n", "\n", - "for _, g in M1_out.groupby(\"trajectory\"):\n", - " ax[1].plot(x, g.lon[: len(x)], alpha=0.3)\n", + "for traj in M1_out.partition_by(\"particle_id\", maintain_order=True):\n", + " ax[1].plot(x, traj[\"lon\"][: len(x)], alpha=0.3)\n", "ax[1].scatter(0, 0, s=20, c=\"r\", zorder=3)\n", "ax[1].set_ylabel(\"x\")\n", "ax[1].set_ylim(-1, 1)\n", @@ -362,14 +362,14 @@ "\n", "x = np.arange(0, 0.3, 0.001)\n", "x = np.arange(0, 0.3, 0.001)\n", - "for _, g in EM_out.groupby(\"trajectory\"):\n", - " ax[0].plot(x, g.lat[: len(x)], alpha=0.3)\n", + "for traj in EM_out.partition_by(\"particle_id\", maintain_order=True):\n", + " ax[0].plot(x, traj[\"lat\"][: len(x)], alpha=0.3)\n", "ax[0].scatter(0, 0.75, s=20, c=\"r\", zorder=3)\n", "ax[0].set_ylabel(\"y\")\n", "ax[0].set_ylim(0, 1)\n", "\n", - "for _, g in EM_out.groupby(\"trajectory\"):\n", - " ax[1].plot(x, g.lon[: len(x)], alpha=0.3)\n", + "for traj in EM_out.partition_by(\"particle_id\", maintain_order=True):\n", + " ax[1].plot(x, traj[\"lon\"][: len(x)], alpha=0.3)\n", "ax[1].scatter(0, 0, s=20, c=\"r\", zorder=3)\n", "ax[1].set_ylabel(\"x\")\n", "ax[1].set_ylim(-1, 1)\n", @@ -596,14 +596,14 @@ "metadata": {}, "outputs": [], "source": [ - "df_particles = parcels.read_particlefile(\"smagdiff.parquet\")\n", + "df = parcels.read_particlefile(\"smagdiff.parquet\")\n", "\n", "temperature = ds_fields.isel(time=0, depth=0).thetao.plot(cmap=\"magma\")\n", "velocity = ds_fields.isel(time=0, depth=0).plot.quiver(\n", " x=\"longitude\", y=\"latitude\", u=\"uo\", v=\"vo\"\n", ")\n", - "for _, g in df_particles.groupby(\"trajectory\"):\n", - " plt.plot(g.lon, g.lat, color=\"blue\")\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.plot(traj[\"lon\"], traj[\"lat\"], color=\"blue\")\n", "plt.ylim(-31, -30)\n", "plt.xlim(31, 32.1)\n", "plt.show()" From 4571bcae687697dd3d38c446ebb41c221c9cfcaa Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 12:06:08 +0200 Subject: [PATCH 65/69] Use polars in tutorial_nemo --- docs/user_guide/examples/tutorial_nemo.ipynb | 27 ++++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/docs/user_guide/examples/tutorial_nemo.ipynb b/docs/user_guide/examples/tutorial_nemo.ipynb index 77de0c5e5..628f04649 100644 --- a/docs/user_guide/examples/tutorial_nemo.ipynb +++ b/docs/user_guide/examples/tutorial_nemo.ipynb @@ -48,7 +48,7 @@ "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", - "import xarray as xr\n", + "import polars as pl\n", "\n", "import parcels\n", "import parcels.tutorial" @@ -178,8 +178,8 @@ "source": [ "df = 
parcels.read_particlefile(\"output_curvilinear.parquet\")\n", "\n", - "for _, g in df.groupby(\"trajectory\"):\n", - " plt.plot(g.lon, g.lat, \".-\")\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.plot(traj[\"lon\"], traj[\"lat\"], \".-\")\n", "\n", "plt.vlines(np.arange(-180, 901, 360), -90, 90, color=\"r\", label=\"antimeridian\")\n", "plt.ylabel(\"Latitude [deg N]\")\n", @@ -204,8 +204,13 @@ "outputs": [], "source": [ "# post processing\n", - "df[\"lon\"] = df[\"lon\"] % 360\n", - "df[\"lon\"] = df[\"lon\"].where(df[\"lon\"] <= 180, df[\"lon\"] - 360)" + "df = df.with_columns((pl.col(\"lon\") % 360).alias(\"lon\"))\n", + "df = df.with_columns(\n", + " pl.when(pl.col(\"lon\") <= 180)\n", + " .then(pl.col(\"lon\"))\n", + " .otherwise(pl.col(\"lon\") - 360)\n", + " .alias(\"lon\")\n", + ")" ] }, { @@ -245,8 +250,8 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(1, 2, figsize=(10, 5))\n", - "for _, g in df.groupby(\"trajectory\"):\n", - " ax[0].plot(g.lon, g.lat, \".-\")\n", + "for traj in df.partition_by(\"particle_id\", maintain_order=True):\n", + " ax[0].plot(traj[\"lon\"], traj[\"lat\"], \".-\")\n", "ax[0].vlines(np.arange(-180, 360, 360), -90, 90, color=\"r\", label=\"antimeridian\")\n", "ax[0].set_ylabel(\"Latitude [deg N]\")\n", "ax[0].set_xlabel(\"Longitude [deg E]\")\n", @@ -256,8 +261,8 @@ "\n", "\n", "df_periodic = parcels.read_particlefile(\"output_curvilinear_periodic.parquet\")\n", - "for _, g in df_periodic.groupby(\"trajectory\"):\n", - " ax[1].plot(g.lon, g.lat, \".-\")\n", + "for traj in df_periodic.partition_by(\"particle_id\", maintain_order=True):\n", + " ax[1].plot(traj[\"lon\"], traj[\"lat\"], \".-\")\n", "\n", "ax[1].vlines(np.arange(-180, 360, 360), -90, 90, color=\"r\", label=\"antimeridian\")\n", "ax[1].set_ylabel(\"Latitude [deg N]\")\n", @@ -358,8 +363,8 @@ "plt.pcolormesh(fieldset.U.grid.lon, fieldset.U.grid.lat, field, cmap=\"RdBu\")\n", "\n", "df_out = parcels.read_particlefile(\"output_nemo3D.parquet\")\n", - "for _, g in df_out.groupby(\"trajectory\"):\n", - " plt.scatter(g.lon, g.lat, c=-g.z, marker=\".\")\n", + "for traj in df_out.partition_by(\"particle_id\", maintain_order=True):\n", + " plt.scatter(traj[\"lon\"], traj[\"lat\"], c=-traj[\"z\"], marker=\".\")\n", "plt.colorbar(label=\"Depth (m)\")\n", "plt.show()" ] From 3d29c72ec9c3459d2a84cf8123ed2436912646bf Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 12:08:25 +0200 Subject: [PATCH 66/69] Use parquet in explanation_kernelloop --- .../examples/explanation_kernelloop.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/user_guide/examples/explanation_kernelloop.md b/docs/user_guide/examples/explanation_kernelloop.md index c4a9a58d1..1bff904e2 100644 --- a/docs/user_guide/examples/explanation_kernelloop.md +++ b/docs/user_guide/examples/explanation_kernelloop.md @@ -108,7 +108,7 @@ lats = np.linspace(-32.5, -30.5, npart) pset = parcels.ParticleSet(fieldset, pclass=parcels.Particle, z=z, lat=lats, lon=lons) output_file = parcels.ParticleFile( - store="advection_then_wind.zarr", outputdt=np.timedelta64(6,'h') + path="advection_then_wind.parquet", outputdt=np.timedelta64(6,'h') ) pset.execute( [parcels.kernels.AdvectionRK2, wind_kernel], @@ -126,7 +126,7 @@ pset_reverse = parcels.ParticleSet( fieldset, pclass=parcels.Particle, z=z, lat=lats, lon=lons ) output_file_reverse = parcels.ParticleFile( - store="wind_then_advection.zarr", outputdt=np.timedelta64(6,"h") + path="wind_then_advection.parquet", 
outputdt=np.timedelta64(6,"h") ) pset_reverse.execute( [wind_kernel, parcels.kernels.AdvectionRK2], @@ -140,10 +140,14 @@ Finally, plot the trajectories to show that they are identical in the two simula ```{code-cell} # Plot the resulting particle trajectories overlapped for both cases -advection_then_wind = xr.open_zarr("advection_then_wind.zarr") -wind_then_advection = xr.open_zarr("wind_then_advection.zarr") -plt.plot(wind_then_advection.lon.T, wind_then_advection.lat.T, "-") -plt.plot(advection_then_wind.lon.T, advection_then_wind.lat.T, "--", c="k", alpha=0.7) +advection_then_wind = parcels.read_particlefile("advection_then_wind.parquet") +wind_then_advection = parcels.read_particlefile("wind_then_advection.parquet") + +fig, ax = plt.subplots(figsize=(5, 3)) +for traj in wind_then_advection.partition_by("particle_id", maintain_order=True): + ax.plot(traj["lon"], traj["lat"], "-") +for traj in advection_then_wind.partition_by("particle_id", maintain_order=True): + ax.plot(traj["lon"], traj["lat"], "--", c="k", alpha=0.7) plt.show() ``` From 45c9cf01bb9eee42925d0cd11e93a0f655475c9b Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 30 Apr 2026 12:11:28 +0200 Subject: [PATCH 67/69] Update policies.md --- docs/development/policies.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/development/policies.md b/docs/development/policies.md index 6cc751240..be7404ed9 100644 --- a/docs/development/policies.md +++ b/docs/development/policies.md @@ -22,7 +22,7 @@ Parcels follows [Intended Effort Versioning (EffVer)](https://jacobtomlinson.dev When making backward incompatible changes, we will make sure these changes and instructions to upgrade are communicated to the user via change logs or migration guides, and (where applicable) informative error messaging. -Note when conducting research we highly recommend documenting which version of Parcels (and other packages) you are using. This can be as easy as doing `conda env export > environment.yml` alongside your project code. The Parcels version used to generate an output file is also stored as metadata entry in the `.zarr` output file. +Note when conducting research we highly recommend documenting which version of Parcels (and other packages) you are using. This can be as easy as doing `conda env export > environment.yml` alongside your project code. The Parcels version used to generate an output file is also stored as metadata entry in the `.parquet` output file. 
## Changes in policies

From d47202bb806a8955dde4c81dc84c7f8c9871af7c Mon Sep 17 00:00:00 2001
From: Erik van Sebille
Date: Thu, 30 Apr 2026 13:16:28 +0200
Subject: [PATCH 68/69] Fix unit tests to use polars in
 parcels.read_particlefile

---
 tests/test_particlefile.py | 25 +++++++++++++------------
 tests/utils.py | 3 ++-
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/tests/test_particlefile.py b/tests/test_particlefile.py
index fa92b0f2a..80c0b2521 100755
--- a/tests/test_particlefile.py
+++ b/tests/test_particlefile.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
+import polars as pl
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pytest
@@ -303,9 +304,10 @@ def IncreaseAge(particles, fieldset):  # pragma: no cover
     df = parcels.read_particlefile(tmp_parquet)

     # Map sorted particle IDs to release times (0, 1, ..., npart-1 seconds)
-    for index, df_traj in df.groupby("particle_id"):
-        release_time = time[index]
-        np.testing.assert_equal(df_traj["age"].astype("timedelta64[s]").values, (df_traj["time"] - release_time).values)
+    for i, df_traj in enumerate(df.partition_by("particle_id", maintain_order=True)):
+        release_time = pd.Timestamp(time[i]).to_pydatetime()
+        traj_time = (df_traj["time"] - release_time).dt.total_seconds()
+        assert (df_traj["age"] == traj_time).all()


 def test_reset_dt(fieldset, tmp_parquet):
@@ -367,9 +369,8 @@ def test_pset_execute_outputdt_forwards(fieldset):
     dt = timedelta(minutes=5)

     df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt))
-    particle_0_times = df[df["particle_id"] == 0].time.values
-
-    np.testing.assert_equal(np.diff(particle_0_times), outputdt.seconds)
+    particle_0_times = df.filter(pl.col("particle_id") == 0)["time"]
+    np.testing.assert_equal(np.diff(particle_0_times) / 1e9, outputdt.seconds)


 def test_pset_execute_output_time_forwards(fieldset):
@@ -379,8 +380,8 @@
     dt = np.timedelta64(5, "m")

     df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt))
-    assert df.time.min() == pd.Timestamp(fieldset.time_interval.left)
-    assert df.time.max() - df.time.min() == runtime
+    assert df["time"].min() == pd.Timestamp(fieldset.time_interval.left)
+    assert df["time"].max() - df["time"].min() == runtime


 def test_pset_execute_outputdt_backwards(fieldset):
@@ -390,8 +391,8 @@
     dt = -timedelta(minutes=5)

     df = setup_pset_execute(fieldset=fieldset, outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt))
-    particle_0_times = df[df["particle_id"] == 0].time.values
-    np.testing.assert_equal(np.diff(particle_0_times), -outputdt.seconds)
+    particle_0_times = df.filter(pl.col("particle_id") == 0)["time"]
+    np.testing.assert_equal(np.diff(particle_0_times) / 1e9, -outputdt.seconds)


 def test_pset_execute_outputdt_backwards_fieldset_timevarying():
@@ -409,8 +410,8 @@
     fieldset = FieldSet.from_sgrid_conventions(ds_fset)

     df = setup_pset_execute(outputdt=outputdt, execute_kwargs=dict(runtime=runtime, dt=dt), fieldset=fieldset)
-    particle_0_times = df[df["particle_id"] == 0].time.values
-    np.testing.assert_equal(np.diff(particle_0_times), -outputdt.seconds)
+    particle_0_times = df.filter(pl.col("particle_id") == 0)["time"]
+    np.testing.assert_equal(np.diff(particle_0_times) / 1e9, -outputdt.seconds)


 def test_particlefile_init(tmp_parquet):
diff --git a/tests/utils.py
b/tests/utils.py index 33d6e0012..c87198fd7 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,6 +4,7 @@ import struct from collections import defaultdict +from datetime import datetime from pathlib import Path import cftime @@ -160,7 +161,7 @@ def assert_cftime_like_particlefile(parquet_path: Path) -> None: df = parcels.read_particlefile(parquet_path, decode_times=True) # check first value (and hence rest of array) is what we expect - assert isinstance(df["time"].values[0], (cftime.datetime, np.datetime64)), ( + assert isinstance(df["time"][0], (cftime.datetime, datetime)), ( "CF-time values in Parquet did not get properly decoded. Are the attributes correct?" ) return From 1b35bf9ff893bd2b667e33e1c0a8082dfc2ef7dc Mon Sep 17 00:00:00 2001 From: Vecko <36369090+VeckoTheGecko@users.noreply.github.com> Date: Fri, 1 May 2026 15:19:21 +0200 Subject: [PATCH 69/69] Doc fix: docs/user_guide/examples/tutorial_dt_integrators.ipynb --- .../examples/tutorial_dt_integrators.ipynb | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/docs/user_guide/examples/tutorial_dt_integrators.ipynb b/docs/user_guide/examples/tutorial_dt_integrators.ipynb index 3dee169a4..2acd3ebda 100644 --- a/docs/user_guide/examples/tutorial_dt_integrators.ipynb +++ b/docs/user_guide/examples/tutorial_dt_integrators.ipynb @@ -53,6 +53,8 @@ "metadata": {}, "outputs": [], "source": [ + "from pathlib import Path\n", + "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", @@ -67,6 +69,9 @@ ")\n", "ds_fields.load() # load the dataset into memory\n", "\n", + "OUTPUT_FOLDER = Path(\"output\")\n", + "OUTPUT_FOLDER.mkdir(exist_ok=True)\n", + "\n", "# Convert to SGRID-compliant dataset and create FieldSet\n", "fields = {\"U\": ds_fields[\"uo\"], \"V\": ds_fields[\"vo\"]}\n", "ds_fset = parcels.convert.copernicusmarine_to_sgrid(fields=fields)\n", @@ -239,7 +244,8 @@ " outputdt = dt\n", "\n", " pfile = parcels.ParticleFile(\n", - " path=f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\",\n", + " path=OUTPUT_FOLDER\n", + " / f\"AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\",\n", " outputdt=outputdt,\n", " )\n", "\n", @@ -291,7 +297,7 @@ "temperature = ds_fields.isel(time=0, depth=0).thetao.plot(cmap=\"Greys\")\n", "for j, dt in enumerate(dt_choices):\n", " df = parcels.read_particlefile(\n", - " f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", + " OUTPUT_FOLDER / f\"AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", " for i, traj in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", " ax.plot(\n", @@ -376,10 +382,10 @@ "axs[1].set_ylim(0, 50)\n", "\n", "# set 5 minute dt as benchmark\n", - "df_5min = parcels.read_particlefile(\"output/AdvectionRK2_dt_300s.parquet\")\n", + "df_5min = parcels.read_particlefile(OUTPUT_FOLDER / \"AdvectionRK2_dt_300s.parquet\")\n", "for i, dt in enumerate(dt_choices[:-1]):\n", " df = parcels.read_particlefile(\n", - " f\"output/AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", + " OUTPUT_FOLDER / f\"AdvectionRK2_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", "\n", " # subset 5 minute data to match dt\n", @@ -581,7 +587,8 @@ " outputdt = dt\n", "\n", " pfile = parcels.ParticleFile(\n", - " path=f\"output/KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\",\n", + " path=OUTPUT_FOLDER\n", + " / f\"KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 
's'))}s.parquet\",\n", " outputdt=outputdt,\n", " )\n", "\n", @@ -620,7 +627,8 @@ " axs[m, n].set_xlabel(\"Longitude\")\n", " for j, advection_scheme in enumerate(advection_schemes):\n", " df = parcels.read_particlefile(\n", - " f\"output/KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", + " OUTPUT_FOLDER\n", + " / f\"KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", " for i, traj in enumerate(df.partition_by(\"particle_id\", maintain_order=True)):\n", " axs[m, n].plot(\n", @@ -670,11 +678,13 @@ " axs[m, n].set_yscale(\"log\")\n", " axs[m, n].set_ylim(1e-4, 1e1)\n", " df_RK4 = parcels.read_particlefile(\n", - " f\"output/KernelCompare_AdvectionRK4_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", + " OUTPUT_FOLDER\n", + " / f\"KernelCompare_AdvectionRK4_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", " for j, advection_scheme in enumerate(advection_schemes[:-1]):\n", " df = parcels.read_particlefile(\n", - " f\"output/KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", + " OUTPUT_FOLDER\n", + " / f\"KernelCompare_{advection_scheme.__name__}_dt_{int(dt / np.timedelta64(1, 's'))}s.parquet\"\n", " )\n", "\n", " dist = dist_km(df[\"lon\"], df_RK4[\"lon\"], df[\"lat\"], df_RK4[\"lat\"])\n",