Skip to content

Commit 094d259

Browse files
Merge pull request #12 from deepesdl/tejas-xxx-implement-custom-linting-for-datasets
Implemented custom linting for datasets using xrlint
2 parents fbfc5da + 5a4b3fd commit 094d259

11 files changed

Lines changed: 578 additions & 235 deletions

CHANGES.md

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,14 @@
1313
- Support publishing to testing, staging, and production repositories of
1414
open-science-metadata.
1515
- Implemented new cli command `generate-config` to generate starter templates for
16-
config files.
16+
config files.
17+
18+
## Changes in 0.1.3
19+
20+
- _Version bump only_; no code or functionality changes. This release was
21+
republished to update the package on PyPI.
22+
23+
## Changes in 0.1.4
24+
25+
- Implemented custom rules using xrlint to validate metadata in datasets, which is necessary to
26+
generate a STAC collection valid for ESA Open Science Catalog.
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright © 2025 Brockmann Consult GmbH.
# This software is distributed under the terms and conditions of the
# MIT license (https://mit-license.org/).

import unittest

import xarray as xr
from xrlint.testing import RuleTest, RuleTester

from deep_code.utils.custom_xrlint_rules import (
    DatasetDescriptionRule,
    VariableGcmdKeywordUrlRule,
)


class TestDeepCodePlugin(unittest.TestCase):
    """Exercise the custom xrlint rules against valid and invalid datasets."""

    def setUp(self):
        """Build one dataset with all required metadata and one without."""
        data_vars = {
            "temperature": (("time", "lat", "lon"), [[[300, 301], [302, 303]]]),
            "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]),
        }
        coords = {"time": [1], "lat": [0, 1], "lon": [0, 1]}

        # Valid dataset: carries description/title plus per-variable metadata.
        self.valid_dataset = xr.Dataset(
            data_vars=data_vars,
            coords=coords,
            attrs={
                "description": "Test climate dataset",
                "title": "Climate Dataset 2025",
            },
        )
        self.valid_dataset["temperature"].attrs.update(
            gcmd_keyword_url="https://gcmd.nasa.gov/KeywordViewer/temperature",
            units="K",
        )
        self.valid_dataset["precipitation"].attrs.update(
            gcmd_keyword_url="https://gcmd.nasa.gov/KeywordViewer/precipitation",
            units="mm",
        )

        # Invalid dataset: no global attrs, and precipitation deliberately
        # lacks gcmd_keyword_url and units so each rule reports one problem.
        self.invalid_dataset = xr.Dataset(
            data_vars=data_vars,
            coords=coords,
            attrs={},
        )
        self.invalid_dataset["temperature"].attrs.update(
            gcmd_keyword_url="https://gcmd.nasa.gov/KeywordViewer/temperature",
            units="K",
        )

        self.tester = RuleTester()

    def test_dataset_description(self):
        """DatasetDescriptionRule passes the valid dataset, flags the invalid one."""
        self.tester.run(
            "dataset-description",
            DatasetDescriptionRule,
            valid=[RuleTest(dataset=self.valid_dataset)],
            invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)],
        )

    def test_variable_gcmd_keyword_url(self):
        """VariableGcmdKeywordUrlRule passes the valid dataset, flags the invalid one."""
        self.tester.run(
            "variable-gcmd-keyword-url",
            VariableGcmdKeywordUrlRule,
            valid=[RuleTest(dataset=self.valid_dataset)],
            invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)],
        )

deep_code/tests/utils/test_dataset_stac_generator.py

Lines changed: 100 additions & 144 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,27 @@
1-
import os
1+
#!/usr/bin/env python3
2+
# Copyright (c) 2025 by Brockmann Consult GmbH
3+
# Permissions are hereby granted under the terms of the MIT License:
4+
# https://opensource.org/licenses/MIT.
5+
26
import unittest
37
from datetime import datetime
48
from unittest.mock import MagicMock, patch
59

610
import numpy as np
7-
from pystac import Collection
8-
from xarray import Dataset
11+
from pystac import Catalog, Collection
12+
from xarray import DataArray, Dataset
913

10-
from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator
14+
from deep_code.constants import (
15+
DEEPESDL_COLLECTION_SELF_HREF,
16+
OSC_THEME_SCHEME,
17+
PRODUCT_BASE_CATALOG_SELF_HREF,
18+
VARIABLE_BASE_CATALOG_SELF_HREF,
19+
)
20+
from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator, Theme
1121

1222

1323
class TestOSCProductSTACGenerator(unittest.TestCase):
14-
@patch("deep_code.utils.dataset_stac_generator.new_data_store")
24+
@patch("deep_code.utils.dataset_stac_generator.open_dataset")
1525
def setUp(self, mock_data_store):
1626
"""Set up a mock dataset and generator."""
1727
self.mock_dataset = Dataset(
@@ -50,7 +60,7 @@ def setUp(self, mock_data_store):
5060
)
5161
mock_store = MagicMock()
5262
mock_store.open_data.return_value = self.mock_dataset
53-
mock_data_store.return_value = mock_store
63+
mock_data_store.return_value = self.mock_dataset
5464

5565
self.generator = OscDatasetStacGenerator(
5666
dataset_id="mock-dataset-id",
@@ -65,9 +75,8 @@ def setUp(self, mock_data_store):
6575
def test_open_dataset(self):
6676
"""Test if the dataset is opened correctly."""
6777
self.assertIsInstance(self.generator.dataset, Dataset)
68-
self.assertIn("lon", self.generator.dataset.coords)
69-
self.assertIn("lat", self.generator.dataset.coords)
70-
self.assertIn("time", self.generator.dataset.coords)
78+
for coord in ("lon", "lat", "time"):
79+
self.assertIn(coord, self.generator.dataset.coords)
7180

7281
def test_get_spatial_extent(self):
7382
"""Test spatial extent extraction."""
@@ -77,146 +86,93 @@ def test_get_spatial_extent(self):
7786
def test_get_temporal_extent(self):
7887
"""Test temporal extent extraction."""
7988
extent = self.generator._get_temporal_extent()
80-
expected_intervals = [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)]
81-
self.assertEqual(extent.intervals[0], expected_intervals)
89+
# TemporalExtent.intervals is a list of [start, end]
90+
interval = extent.intervals[0]
91+
self.assertEqual(interval[0], datetime(2023, 1, 1, 0, 0))
92+
self.assertEqual(interval[1], datetime(2023, 1, 2, 0, 0))
8293

8394
def test_get_variables(self):
84-
"""Test variable extraction."""
85-
variables = self.generator.get_variable_ids()
86-
self.assertEqual(variables, ["var1", "var2"])
95+
"""Test variable ID extraction."""
96+
vars_ = self.generator.get_variable_ids()
97+
self.assertCountEqual(vars_, ["var1", "var2"])
8798

8899
def test_get_general_metadata(self):
89100
"""Test general metadata extraction."""
90-
metadata = self.generator._get_general_metadata()
91-
self.assertEqual(metadata["description"], "Mock dataset for testing.")
92-
93-
@patch("pystac.Collection.add_link")
94-
@patch("pystac.Collection.set_self_href")
95-
def test_build_stac_collection(self, mock_set_self_href, mock_add_link):
96-
"""Test STAC collection creation."""
97-
collection = self.generator.build_dataset_stac_collection()
98-
self.assertIsInstance(collection, Collection)
99-
self.assertEqual(collection.id, "mock-collection-id")
100-
self.assertEqual(collection.description, "Mock dataset for testing.")
101-
self.assertEqual(
102-
collection.extent.spatial.bboxes[0], [-180.0, -90.0, 180.0, 90.0]
103-
)
104-
self.assertEqual(
105-
collection.extent.temporal.intervals[0],
106-
[datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)],
107-
)
108-
mock_set_self_href.assert_called_once()
109-
mock_add_link.assert_called()
110-
111-
def test_invalid_spatial_extent(self):
112-
"""Test spatial extent extraction with missing coordinates."""
113-
self.generator.dataset = Dataset(coords={"x": [], "y": []})
114-
with self.assertRaises(ValueError):
115-
self.generator._get_spatial_extent()
116-
117-
def test_invalid_temporal_extent(self):
118-
"""Test temporal extent extraction with missing time."""
119-
self.generator.dataset = Dataset(coords={})
120-
with self.assertRaises(ValueError):
121-
self.generator._get_temporal_extent()
122-
123-
@patch("deep_code.utils.dataset_stac_generator.new_data_store")
124-
@patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
125-
def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store):
126-
"""Test dataset opening with the public store configuration."""
127-
# Create a mock store and mock its `open_data` method
128-
mock_store = MagicMock()
129-
mock_new_data_store.return_value = mock_store
130-
mock_store.open_data.return_value = self.mock_dataset
131-
132-
# Instantiate the generator (this will implicitly call _open_dataset)
133-
generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
134-
135-
# Validate that the dataset is assigned correctly
136-
self.assertEqual(generator.dataset, "mock_dataset")
137-
138-
# Validate that `new_data_store` was called once with the correct parameters
139-
mock_new_data_store.assert_called_once_with(
140-
"s3", root="deep-esdl-public", storage_options={"anon": True}
141-
)
142-
143-
# Ensure `open_data` was called once on the returned store
144-
mock_store.open_data.assert_called_once_with("mock-dataset-id")
145-
146-
# Validate logging behavior
147-
mock_logger().info.assert_any_call(
148-
"Attempting to open dataset with configuration: Public store"
149-
)
150-
mock_logger().info.assert_any_call(
151-
"Successfully opened dataset with configuration: Public store"
152-
)
153-
154-
@patch("deep_code.utils.dataset_stac_generator.new_data_store")
155-
@patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
156-
def test_open_dataset_success_authenticated_store(
157-
self, mock_logger, mock_new_data_store
158-
):
159-
"""Test dataset opening with the authenticated store configuration."""
160-
# Simulate public store failure
161-
mock_store = MagicMock()
162-
mock_new_data_store.side_effect = [
163-
Exception("Public store failure"),
164-
# First call (public store) raises an exception
165-
mock_store,
166-
# Second call (authenticated store) returns a mock store
167-
]
168-
mock_store.open_data.return_value = self.mock_dataset
169-
170-
os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
171-
os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
172-
os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
173-
174-
generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
175-
176-
# Validate that the dataset was successfully opened with the authenticated store
177-
self.assertEqual(generator.dataset, "mock_dataset")
178-
self.assertEqual(mock_new_data_store.call_count, 2)
179-
180-
# Validate calls to `new_data_store`
181-
mock_new_data_store.assert_any_call(
182-
"s3", root="deep-esdl-public", storage_options={"anon": True}
183-
)
184-
mock_new_data_store.assert_any_call(
185-
"s3",
186-
root="mock-bucket",
187-
storage_options={"anon": False, "key": "mock-key", "secret": "mock-secret"},
188-
)
189-
190-
# Validate logging calls
191-
mock_logger().info.assert_any_call(
192-
"Attempting to open dataset with configuration: Public store"
193-
)
194-
mock_logger().info.assert_any_call(
195-
"Attempting to open dataset with configuration: Authenticated store"
196-
)
197-
mock_logger().info.assert_any_call(
198-
"Successfully opened dataset with configuration: Authenticated store"
199-
)
200-
201-
@patch("deep_code.utils.dataset_stac_generator.new_data_store")
202-
@patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
203-
def test_open_dataset_failure(self, mock_logger, mock_new_data_store):
204-
"""Test dataset opening failure with all configurations."""
205-
# Simulate all store failures
206-
mock_new_data_store.side_effect = Exception("Store failure")
207-
os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
208-
os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
209-
os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
210-
211-
with self.assertRaises(ValueError) as context:
212-
OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
213-
214-
self.assertIn(
215-
"Failed to open Zarr dataset with ID mock-dataset-id",
216-
str(context.exception),
217-
)
218-
self.assertIn("Public store, Authenticated store", str(context.exception))
219-
self.assertEqual(mock_new_data_store.call_count, 2)
101+
meta = self.generator._get_general_metadata()
102+
self.assertEqual(meta.get("description"), "Mock dataset for testing.")
103+
104+
def test_extract_metadata_for_variable(self):
105+
"""Test single variable metadata extraction."""
106+
da: DataArray = self.mock_dataset.data_vars["var1"]
107+
var_meta = self.generator.extract_metadata_for_variable(da)
108+
self.assertEqual(var_meta["variable_id"], "var1")
109+
self.assertEqual(var_meta["description"], "dummy")
110+
self.assertEqual(var_meta["gcmd_keyword_url"], "https://dummy")
111+
112+
def test_get_variables_metadata(self):
113+
"""Test metadata dict for all variables."""
114+
meta_dict = self.generator.get_variables_metadata()
115+
self.assertIn("var1", meta_dict)
116+
self.assertIn("var2", meta_dict)
117+
self.assertIsInstance(meta_dict["var1"], dict)
118+
119+
def test_build_theme(self):
120+
"""Test Theme builder static method."""
121+
themes = ["a", "b"]
122+
theme_obj: Theme = OscDatasetStacGenerator.build_theme(themes)
123+
self.assertEqual(theme_obj.scheme, OSC_THEME_SCHEME)
124+
ids = [tc.id for tc in theme_obj.concepts]
125+
self.assertListEqual(ids, ["a", "b"])
126+
127+
@patch.object(OscDatasetStacGenerator, "_add_gcmd_link_to_var_catalog")
128+
@patch.object(OscDatasetStacGenerator, "add_themes_as_related_links_var_catalog")
129+
def test_build_variable_catalog(self, mock_add_themes, mock_add_gcmd):
130+
"""Test building of variable-level STAC catalog."""
131+
var_meta = self.generator.variables_metadata["var1"]
132+
catalog = self.generator.build_variable_catalog(var_meta)
133+
self.assertIsInstance(catalog, Catalog)
134+
self.assertEqual(catalog.id, "var1")
135+
# Title should be capitalized
136+
self.assertEqual(catalog.title, "Var1")
137+
# Self href ends with var1/catalog.json
138+
self.assertTrue(catalog.self_href.endswith("/var1/catalog.json"))
139+
140+
@patch("pystac.Catalog.from_file")
141+
def test_update_product_base_catalog(self, mock_from_file):
142+
"""Test linking product catalog."""
143+
mock_cat = MagicMock(spec=Catalog)
144+
mock_from_file.return_value = mock_cat
145+
146+
result = self.generator.update_product_base_catalog("path.json")
147+
self.assertIs(result, mock_cat)
148+
mock_cat.add_link.assert_called_once()
149+
mock_cat.set_self_href.assert_called_once_with(PRODUCT_BASE_CATALOG_SELF_HREF)
150+
151+
@patch("pystac.Catalog.from_file")
152+
def test_update_variable_base_catalog(self, mock_from_file):
153+
"""Test linking variable base catalog."""
154+
mock_cat = MagicMock(spec=Catalog)
155+
mock_from_file.return_value = mock_cat
156+
157+
vars_ = ["v1", "v2"]
158+
result = self.generator.update_variable_base_catalog("vars.json", vars_)
159+
self.assertIs(result, mock_cat)
160+
# Expect one add_link per variable
161+
self.assertEqual(mock_cat.add_link.call_count, len(vars_))
162+
mock_cat.set_self_href.assert_called_once_with(VARIABLE_BASE_CATALOG_SELF_HREF)
163+
164+
@patch("pystac.Collection.from_file")
165+
def test_update_deepesdl_collection(self, mock_from_file):
166+
"""Test updating DeepESDL collection."""
167+
mock_coll = MagicMock(spec=Collection)
168+
mock_from_file.return_value = mock_coll
169+
170+
result = self.generator.update_deepesdl_collection("deep.json")
171+
self.assertIs(result, mock_coll)
172+
# Expect child and theme related links for each theme
173+
calls = mock_coll.add_link.call_count
174+
self.assertGreaterEqual(calls, 1 + len(self.generator.osc_themes))
175+
mock_coll.set_self_href.assert_called_once_with(DEEPESDL_COLLECTION_SELF_HREF)
220176

221177

222178
class TestFormatString(unittest.TestCase):

0 commit comments

Comments
 (0)