From 4994bb086a8f75996889ee7deca69cf8aeb1826b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Jir=C3=A1t?= Date: Thu, 26 Feb 2026 14:09:41 +0100 Subject: [PATCH 1/4] Move _copy_input_files into TestScaffolder and add input_files_dir parameter - Add input_files_dir param to scaffold_from_json() so writer component input CSVs can be supplied without going through the CLI layer - Add TestScaffolder._copy_input_files() static method (previously lived only in datadirtest/__main__.py, invisible to API users) - Expand module docstring with tests/setup/ layout convention and a concrete writer example so other sessions can discover the feature Co-Authored-By: Claude Sonnet 4.6 --- src/keboola/vcr/scaffolder.py | 90 +++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/src/keboola/vcr/scaffolder.py b/src/keboola/vcr/scaffolder.py index bd20e19..42aa7a8 100644 --- a/src/keboola/vcr/scaffolder.py +++ b/src/keboola/vcr/scaffolder.py @@ -15,6 +15,37 @@ falls back to a numbered index). An optional *secrets file* can be provided to deep-merge real credentials at recording time while keeping only dummy values in the committed ``config.json``. + +## Standard repo layout (used by the CLI defaults) + +``tests/setup/`` is the conventional home for scaffold inputs: + +- ``tests/setup/configs.json`` — test definitions (wrapped or raw format) +- ``tests/setup/input_files/`` — CSV/files for writer components + +When ``input_files_dir`` is provided (default: ``tests/setup/input_files``), +the scaffolder reads each test's ``config.json`` after folder creation and +copies matching files into the test's ``in/tables/`` or ``in/files/`` based +on the ``storage.input.tables[].destination`` / ``storage.input.files[].destination`` +entries. This is the mechanism used to supply writer components with their +input data during recording without bundling large CSVs in the test tree. + +Example ``configs.json`` entry for a writer:: + + { + "name": "01_write_data", + "config": { + "parameters": {"#api_key": "DUMMY"}, + "storage": { + "input": { + "tables": [{"destination": "my_input_table.csv"}] + } + } + } + } + +Place ``tests/setup/input_files/my_input_table.csv`` and run scaffold — the +file is copied into each test's ``source/data/in/tables/`` automatically. """ from __future__ import annotations @@ -72,6 +103,7 @@ def scaffold_from_json( secrets_file: Path | None = None, chain_state: bool = False, regenerate: bool = False, + input_files_dir: Path | None = None, ) -> list[Path]: """ Create test folders from definitions file. @@ -90,6 +122,12 @@ def scaffold_from_json( regenerate: Delete existing cassettes before recording so fresh interactions are captured from the live API. When False, tests that already have a cassette are skipped. + input_files_dir: Optional directory containing CSV/files for writer + components. After scaffolding each test folder the contents are + copied into ``in/tables/`` or ``in/files/`` based on the + ``storage.input`` mappings in each test's ``config.json``. + Defaults to ``tests/setup/input_files`` when called via the CLI. + If the directory does not exist it is silently skipped. Returns: List of created test folder paths @@ -156,6 +194,9 @@ def scaffold_from_json( chained_state = json.load(f) logger.info(f"Chained state from {test_path.name} for next test") + if input_files_dir is not None: + self._copy_input_files(created_paths, Path(input_files_dir)) + return created_paths def scaffold_from_dict( @@ -373,6 +414,55 @@ def run_component(): # Private helpers # ------------------------------------------------------------------ + @staticmethod + def _copy_input_files(created_paths: list[Path], input_files_dir: Path) -> None: + """Copy input CSV/files into scaffolded test dirs from a shared input_files directory. + + Reads each test's ``config.json`` and copies files listed under + ``storage.input.tables[].destination`` into ``in/tables/`` and + ``storage.input.files[].destination`` into ``in/files/``. + + Silently skips if *input_files_dir* does not exist or a referenced + source file is missing — the user will get a clear runtime error from + the component if a required input is absent. + """ + if not input_files_dir.exists(): + return + + for test_dir in created_paths: + config_path = test_dir / "source" / "data" / "config.json" + if not config_path.exists(): + continue + + try: + config = json.loads(config_path.read_text()) + except (json.JSONDecodeError, OSError): + continue + + storage = config.get("storage", {}) + + for entry in storage.get("input", {}).get("tables", []): + dest = entry.get("destination", "") + if not dest: + continue + src = input_files_dir / dest + if src.exists(): + target_dir = test_dir / "source" / "data" / "in" / "tables" + target_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, target_dir / dest) + logger.info(f"Copied {src} -> {target_dir / dest}") + + for entry in storage.get("input", {}).get("files", []): + dest = entry.get("destination", "") + if not dest: + continue + src = input_files_dir / dest + if src.exists(): + target_dir = test_dir / "source" / "data" / "in" / "files" + target_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, target_dir / dest) + logger.info(f"Copied {src} -> {target_dir / dest}") + def _mask_secrets( self, config: dict[str, Any], From c0e08eab84a55f6b172e2a4807f1a842e02aa0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Jir=C3=A1t?= Date: Thu, 26 Feb 2026 14:30:47 +0100 Subject: [PATCH 2/4] Fix input_files_dir copy order: copy before recording, not after Previously _copy_input_files() was called after all tests were recorded, meaning writer components couldn't find their input CSVs during the live API run. Move the copy into _scaffold_single_test(), after the directory structure is written but before _record_test() is called. The files now live in in/tables/ or in/files/ for both recording and replay. Co-Authored-By: Claude Sonnet 4.6 --- src/keboola/vcr/scaffolder.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/keboola/vcr/scaffolder.py b/src/keboola/vcr/scaffolder.py index 42aa7a8..cb70c3f 100644 --- a/src/keboola/vcr/scaffolder.py +++ b/src/keboola/vcr/scaffolder.py @@ -184,6 +184,7 @@ def scaffold_from_json( secrets_override=secrets_override, input_state=chained_state, regenerate=regenerate, + input_files_dir=Path(input_files_dir) if input_files_dir is not None else None, ) created_paths.append(test_path) @@ -194,9 +195,6 @@ def scaffold_from_json( chained_state = json.load(f) logger.info(f"Chained state from {test_path.name} for next test") - if input_files_dir is not None: - self._copy_input_files(created_paths, Path(input_files_dir)) - return created_paths def scaffold_from_dict( @@ -249,6 +247,7 @@ def _scaffold_single_test( secrets_override: dict[str, Any] | None = None, input_state: dict[str, Any] | None = None, regenerate: bool = False, + input_files_dir: Path | None = None, ) -> Path: """Create folder structure for a single test.""" # Validate definition @@ -292,6 +291,10 @@ def _scaffold_single_test( logger.info(f"Created test folder structure: {test_dir}") + # Copy input files before running the component so the writer can find them + if input_files_dir is not None: + self._copy_input_files([test_dir], input_files_dir) + # Record cassette if requested if record and component_script: cassette_path = source_data_dir / "cassettes" / VCRRecorder.DEFAULT_CASSETTE_FILE From 58bd94c3d32916299515df4f04f91840af676cb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Jir=C3=A1t?= Date: Thu, 26 Feb 2026 15:36:17 +0100 Subject: [PATCH 3/4] Fix create_default_sanitizer to only collect #-prefixed secret values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces extract_values() with _collect_hash_values() so that non-sensitive metadata fields (oauthVersion, id, created, etc.) are skipped. Previously, short values like "2.0" were added to sensitive_values, causing URL path corruption in cassettes (e.g. api.xro/2.0 → api.xro/REDACTED). Co-Authored-By: Claude Sonnet 4.6 --- src/keboola/vcr/sanitizers.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/keboola/vcr/sanitizers.py b/src/keboola/vcr/sanitizers.py index 566f564..5e9206c 100644 --- a/src/keboola/vcr/sanitizers.py +++ b/src/keboola/vcr/sanitizers.py @@ -789,9 +789,12 @@ def create_default_sanitizer(secrets: dict[str, Any]) -> DefaultSanitizer: """ Create a default sanitizer from secrets. - Extracts all string values from the secrets dict and returns a - DefaultSanitizer that handles OAuth bodies, JSON responses, headers, - and URL parameters automatically. + Collects only values under #-prefixed keys (Keboola's encrypted-field + convention) and returns a DefaultSanitizer that handles OAuth bodies, + JSON responses, headers, and URL parameters automatically. + + Non-sensitive metadata fields (oauthVersion, id, created, etc.) are + intentionally skipped to avoid corrupting URL paths in cassettes. Args: secrets: Dictionary of secret values to redact @@ -799,7 +802,8 @@ def create_default_sanitizer(secrets: dict[str, Any]) -> DefaultSanitizer: Returns: A DefaultSanitizer with extracted secret values """ - secret_values = extract_values(secrets, []) + secret_values: list[str] = [] + _collect_hash_values(secrets, secret_values) return DefaultSanitizer(sensitive_values=secret_values) From 35f35346439518aa4d6a6645f15a8f5f79215a36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maty=C3=A1=C5=A1=20Jir=C3=A1t?= Date: Thu, 26 Feb 2026 16:16:20 +0100 Subject: [PATCH 4/4] Fix tests to use #-prefixed keys matching real secrets file format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous tests used plain keys (e.g. {"api_key": "my-key"}) which did not reflect the actual config.secrets.json structure. In Keboola, only #-prefixed keys are encrypted secrets — non-prefixed fields like oauthVersion or id are plain metadata that must not be redacted from cassettes (doing so was the bug fixed in 58bd94c). Update both failing tests to use #-prefixed keys, and add an explicit assertion that non-prefixed metadata values are not collected. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_recorder.py | 2 +- tests/test_sanitizers.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_recorder.py b/tests/test_recorder.py index b400a69..5f91b60 100644 --- a/tests/test_recorder.py +++ b/tests/test_recorder.py @@ -174,7 +174,7 @@ def test_custom_sanitizers_stored_in_composite(self, tmp_cassette_dir): def test_no_sanitizers_uses_create_default_sanitizer(self, tmp_cassette_dir): with patch("keboola.vcr.recorder.vcr") as mock_vcr: mock_vcr.VCR.return_value = MagicMock() - r = VCRRecorder(cassette_dir=tmp_cassette_dir, secrets={"k": "v"}) + r = VCRRecorder(cassette_dir=tmp_cassette_dir, secrets={"#k": "v"}) assert isinstance(r.sanitizer, DefaultSanitizer) assert "v" in r.sanitizer.sensitive_values diff --git a/tests/test_sanitizers.py b/tests/test_sanitizers.py index ee716cf..e4e2ad3 100644 --- a/tests/test_sanitizers.py +++ b/tests/test_sanitizers.py @@ -474,11 +474,12 @@ def test_ignores_non_hash_prefixed_keys(self): class TestCreateDefaultSanitizer: def test_creates_default_sanitizer_with_secret_values(self): - secrets = {"api_key": "my-key", "password": "s3cret"} + secrets = {"#api_key": "my-key", "#password": "s3cret", "oauthVersion": "2.0"} s = create_default_sanitizer(secrets) assert isinstance(s, DefaultSanitizer) assert "my-key" in s.sensitive_values assert "s3cret" in s.sensitive_values + assert "2.0" not in s.sensitive_values def test_empty_secrets(self): s = create_default_sanitizer({})