diff --git a/nodes/src/nodes/ocr/IGlobal.py b/nodes/src/nodes/ocr/IGlobal.py index 667b14a69..9552b6bd5 100644 --- a/nodes/src/nodes/ocr/IGlobal.py +++ b/nodes/src/nodes/ocr/IGlobal.py @@ -40,6 +40,12 @@ # img2table internally imports cv2, so this must come first from ai.common.opencv import cv2 # noqa: F401 - ensures correct opencv +# Same reason for polars: img2table depends on polars, and the default wheel +# crashes on x86_64 CPUs without AVX2. Importing ai.common.polars triggers +# the polars-lts-cpu install + cleanup before img2table loads. The actual +# `pl` symbol is imported in the methods that use it. +import ai.common.polars # noqa: F401 - imported for module-level install side effect + # img2table 2.0 (2026-05-10) rewrote its OCR plug-in API and moved the base # class. Detect which version is installed so this adapter works against both. try: @@ -262,7 +268,7 @@ def to_ocr_dataframe(self, content: List[List]) -> Any: OCRDataframe object """ from img2table.ocr.data import OCRDataframe - import polars as pl + from ai.common.polars import pl def _diag(msg): pass diff --git a/nodes/src/nodes/ocr/requirements.txt b/nodes/src/nodes/ocr/requirements.txt index 0d430a193..19612ca2d 100644 --- a/nodes/src/nodes/ocr/requirements.txt +++ b/nodes/src/nodes/ocr/requirements.txt @@ -7,6 +7,10 @@ # numpy and pillow are base engine dependencies - don't reinstall them here # as that can corrupt already-loaded modules in the server process. # +# List polars-lts-cpu before img2table to replace the AVX2-only `polars` wheel +# that crashes (SEH 0xc000001d / SIGILL) on x86_64 hosts without AVX2/FMA. +# Same import name, drop-in API. img2table may still pull in plain `polars` (ai.common.polars removes it at import). Marker scopes the requirement to x86_64 only. 
+polars-lts-cpu; platform_machine == "x86_64" or platform_machine == "AMD64" img2table pillow numpy diff --git a/packages/ai/src/ai/common/models/__init__.py b/packages/ai/src/ai/common/models/__init__.py index bf92cf9cf..e4773a3c6 100644 --- a/packages/ai/src/ai/common/models/__init__.py +++ b/packages/ai/src/ai/common/models/__init__.py @@ -18,6 +18,11 @@ model type without model-specific branching. """ +# Patch default SSL context to use the OS trust store before any loader +# triggers a model download. Without this, embedded Python on Windows fails +# weight downloads with "unable to get local issuer certificate". +import ai.common.ssl # noqa: F401 + # Base loader class from .base import BaseLoader diff --git a/packages/ai/src/ai/common/polars/__init__.py b/packages/ai/src/ai/common/polars/__init__.py new file mode 100644 index 000000000..4779a1c21 --- /dev/null +++ b/packages/ai/src/ai/common/polars/__init__.py @@ -0,0 +1,81 @@ +# ============================================================================= +# MIT License +# Copyright (c) 2026 Aparavi Software AG +# ============================================================================= +""" +Polars wrapper that ensures polars-lts-cpu is the active install on x86_64. + +The default `polars` PyPI wheel requires AVX2/FMA/BMI1/BMI2/etc. and crashes +(SEH 0xc000001d / SIGILL) on x86_64 hosts without those features. The +`polars-lts-cpu` wheel ships an AVX2-free binary under the same `polars` +import name. Same Python API; GPU acceleration in the engine flows through +PyTorch (ai.common.torch) and is independent of this choice. + +The wrinkle: img2table and other libs declare `polars` as a hard dependency. +Pip/uv resolve them as separate distributions, both writing into the same +`polars/` namespace in site-packages. 
If the regular `polars` wheel ends up +authoritative for the compiled `_polars.pyd` / `_polars.abi3.so`, you crash; +if the .py files come from one version and the binary from another you get +ImportErrors like "cannot import name 'POLARS_STORAGE_CONFIG_KEYS'". + +This module follows the same pattern as ai.common.opencv (which solves the +identical problem for cv2's four conflicting PyPI wheels): + 1. Install polars-lts-cpu via the requirements file. + 2. Uninstall any plain `polars` that came in as a transitive dep. + 3. Force-reinstall polars-lts-cpu so its files are unambiguously on disk. + 4. Reset any cached `polars` modules so the next import is clean. + +ARM hosts (Linux aarch64, macOS arm64) don't need this — their default +`polars` wheel has no AVX requirement — so the cleanup is x86_64-only. + +Usage: + from ai.common.polars import pl + df = pl.DataFrame(...) + +Import this BEFORE any module that touches polars (img2table, deltalake, etc.) +so the right binary is in place when those modules load. +""" + +import os +import platform +import sys + +from depends import depends, pip + +# polars-lts-cpu only matters on x86_64; ARM wheels don't ship AVX2 code paths. +_NEEDS_LTS = platform.machine().lower() in ('x86_64', 'amd64') + +requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt' +depends(requirements) + +if _NEEDS_LTS: + try: + import importlib.metadata as _md + + _has_plain_polars = False + try: + _md.version('polars') + _has_plain_polars = True + except _md.PackageNotFoundError: + # Plain `polars` not installed — only polars-lts-cpu is on disk, + # which is exactly the desired state. No cleanup needed. + pass + + if _has_plain_polars: + # Plain `polars` was pulled in transitively (img2table etc.). + # Drop it and force-reinstall lts-cpu so its binary wins on disk. 
+ pip('uninstall', '-y', 'polars') + pip('install', '--force-reinstall', '--no-deps', 'polars-lts-cpu') + + # Drop any already-loaded polars modules so the next import + # picks up the freshly-written files instead of cached state. + for _mod in [m for m in list(sys.modules) if m == 'polars' or m.startswith('polars.')]: + sys.modules.pop(_mod, None) + except Exception: + # Best-effort cleanup. If it fails, the import below will surface + # the underlying issue with a real traceback. + pass + +import polars as pl # noqa: E402 + +__all__ = ['pl'] diff --git a/packages/ai/src/ai/common/polars/requirements.txt b/packages/ai/src/ai/common/polars/requirements.txt new file mode 100644 index 000000000..e8e4da637 --- /dev/null +++ b/packages/ai/src/ai/common/polars/requirements.txt @@ -0,0 +1 @@ +polars-lts-cpu; platform_machine == "x86_64" or platform_machine == "AMD64" diff --git a/packages/ai/src/ai/common/ssl/__init__.py b/packages/ai/src/ai/common/ssl/__init__.py new file mode 100644 index 000000000..74be3cd4f --- /dev/null +++ b/packages/ai/src/ai/common/ssl/__init__.py @@ -0,0 +1,61 @@ +# ============================================================================= +# MIT License +# Copyright (c) 2026 Aparavi Software AG +# ============================================================================= +""" +SSL trust store integration. + +Embedded Python on Windows ships a default SSL context that loads only a +narrow subset of the Windows ROOT store (often <30 CAs in practice on +locked-down corporate machines). That breaks any model loader that +downloads weights from a public CDN — TLS validation fails with +"unable to get local issuer certificate" because the CA that signed the +server's chain isn't in the loaded subset. + +This module installs `truststore` and patches Python's default SSL context +to use the OS trust store directly (CryptoAPI on Windows, the Security framework on +macOS, OpenSSL system roots on Linux). 
Effects: + - All public CAs in the OS store are trusted, not just the subset + `load_default_certs()` exposes. + - Corporate root CAs deployed via Group Policy / MDM are picked up + automatically — needed for any environment with TLS-intercepting proxies + (Zscaler, Netskope, BlueCoat, etc.). + - urllib, requests, httpx, and anything using a default SSL context all + benefit from the same patch — no per-callsite changes needed. + +Usage: + import ai.common.ssl # noqa: F401 - patches default SSL context + +Import this once, early, in any module that triggers HTTPS downloads. +The `ai.common.models` package imports it at the top of its __init__.py, +so any model loader is covered transitively. + +If truststore can't be imported or injected (e.g. Python older than 3.10, +which truststore does not support), this module falls back to pointing +SSL_CERT_FILE / REQUESTS_CA_BUNDLE (when unset) at certifi's bundle — better than the partial Windows store, but won't pick up corporate CAs. +""" + +import os +from depends import depends + +requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt' +depends(requirements) + +try: + import truststore + + truststore.inject_into_ssl() +except Exception: + # Fallback: point Python at certifi's CA bundle. Catches the "embedded + # Python's default trust store is too small" case but won't help with + # corporate TLS interception. Better than nothing. + try: + import certifi + + os.environ.setdefault('SSL_CERT_FILE', certifi.where()) + os.environ.setdefault('REQUESTS_CA_BUNDLE', certifi.where()) + except Exception: + # Both truststore and certifi fallback failed. Leave the default SSL + # context untouched — downstream HTTPS calls will surface their own + # error with a real traceback if validation fails. + pass diff --git a/packages/ai/src/ai/common/ssl/requirements.txt b/packages/ai/src/ai/common/ssl/requirements.txt new file mode 100644 index 000000000..5c96ae4a0 --- /dev/null +++ b/packages/ai/src/ai/common/ssl/requirements.txt @@ -0,0 +1 @@ +truststore