Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion nodes/src/nodes/ocr/IGlobal.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@
# img2table internally imports cv2, so this must come first
from ai.common.opencv import cv2 # noqa: F401 - ensures correct opencv

# Same reason for polars: img2table depends on polars, and the default wheel
# crashes on x86_64 CPUs without AVX2. Importing ai.common.polars triggers
# the polars-lts-cpu install + cleanup before img2table loads. The actual
# `pl` symbol is imported in the methods that use it.
import ai.common.polars # noqa: F401 - imported for module-level install side effect

# img2table 2.0 (2026-05-10) rewrote its OCR plug-in API and moved the base
# class. Detect which version is installed so this adapter works against both.
try:
Expand Down Expand Up @@ -262,7 +268,7 @@ def to_ocr_dataframe(self, content: List[List]) -> Any:
OCRDataframe object
"""
from img2table.ocr.data import OCRDataframe
import polars as pl
from ai.common.polars import pl

def _diag(msg):
pass
Expand Down
4 changes: 4 additions & 0 deletions nodes/src/nodes/ocr/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
# numpy and pillow are base engine dependencies - don't reinstall them here
# as that can corrupt already-loaded modules in the server process.
#
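# List polars-lts-cpu before img2table so the AVX2-free build is installed
# rather than relying on the default `polars` wheel, which crashes
# (SEH 0xc000001d / SIGILL) on x86_64 hosts without AVX2/FMA. img2table may
# still pull in plain `polars` transitively; ai.common.polars removes it at
# import time. Same import name, drop-in API. Marker scopes the requirement
# to x86_64 only.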
polars-lts-cpu; platform_machine == "x86_64" or platform_machine == "AMD64"
img2table
pillow
numpy
5 changes: 5 additions & 0 deletions packages/ai/src/ai/common/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
model type without model-specific branching.
"""

# Patch default SSL context to use the OS trust store before any loader
# triggers a model download. Without this, embedded Python on Windows fails
# weight downloads with "unable to get local issuer certificate".
import ai.common.ssl # noqa: F401

# Base loader class
from .base import BaseLoader

Expand Down
81 changes: 81 additions & 0 deletions packages/ai/src/ai/common/polars/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# =============================================================================
# MIT License
# Copyright (c) 2026 Aparavi Software AG
# =============================================================================
"""
Polars wrapper that ensures polars-lts-cpu is the active install on x86_64.

The default `polars` PyPI wheel requires AVX2/FMA/BMI1/BMI2/etc. and crashes
(SEH 0xc000001d / SIGILL) on x86_64 hosts without those features. The
`polars-lts-cpu` wheel ships an AVX2-free binary under the same `polars`
import name. Same Python API; GPU acceleration in the engine flows through
PyTorch (ai.common.torch) and is independent of this choice.

The wrinkle: img2table and other libs declare `polars` as a hard dependency.
Pip/uv resolve them as separate distributions, both writing into the same
`polars/` namespace in site-packages. If the regular `polars` wheel ends up
authoritative for the compiled `_polars.pyd` / `_polars.abi3.so`, you crash;
if the .py files come from one version and the binary from another you get
ImportErrors like "cannot import name 'POLARS_STORAGE_CONFIG_KEYS'".

This module follows the same pattern as ai.common.opencv (which solves the
identical problem for cv2's four conflicting PyPI wheels):
1. Install polars-lts-cpu via the requirements file.
2. Uninstall any plain `polars` that came in as a transitive dep.
3. Force-reinstall polars-lts-cpu so its files are unambiguously on disk.
4. Reset any cached `polars` modules so the next import is clean.

ARM hosts (Linux aarch64, macOS arm64) don't need this — their default
`polars` wheel has no AVX requirement — so the cleanup is x86_64-only.

Usage:
from ai.common.polars import pl
df = pl.DataFrame(...)

Import this BEFORE any module that touches polars (img2table, deltalake, etc.)
so the right binary is in place when those modules load.
"""

import os
import platform
import sys

from depends import depends, pip

# Only x86_64 hosts need the AVX2-free polars-lts-cpu build; every other
# architecture keeps whatever polars wheel is already installed.
_NEEDS_LTS = platform.machine().lower() in {'x86_64', 'amd64'}

# Hand the requirements.txt that sits next to this file to the dependency helper.
requirements = f'{os.path.dirname(os.path.realpath(__file__))}/requirements.txt'
depends(requirements)

if _NEEDS_LTS:
    # Best-effort remediation for x86_64 hosts: make sure the files on disk
    # under the `polars` import name come from polars-lts-cpu, not from a
    # plain `polars` wheel that was pulled in as a transitive dependency.
    #
    # Failures never abort this module (the `import polars` that follows will
    # surface any real breakage with a proper traceback), but they are logged
    # so a half-finished cleanup is not silently mistaken for a healthy one.
    import logging as _logging

    _log = _logging.getLogger(__name__)

    try:
        import importlib.metadata as _md

        try:
            _md.version('polars')
        except _md.PackageNotFoundError:
            # Plain `polars` not installed — only polars-lts-cpu is on disk,
            # which is exactly the desired state. No cleanup needed.
            _has_plain_polars = False
        else:
            _has_plain_polars = True

        if _has_plain_polars:
            # Plain `polars` was pulled in transitively (img2table etc.).
            # Drop it and force-reinstall lts-cpu so its binary wins on disk.
            # NOTE(review): assumes pip() reports failure by returning False —
            # confirm against the `depends` helper. Only an explicit False is
            # treated as a failure so any other return convention stays quiet.
            if pip('uninstall', '-y', 'polars') is False:
                _log.warning('pip could not uninstall plain `polars`; an AVX2-only build may remain active')
            if pip('install', '--force-reinstall', '--no-deps', 'polars-lts-cpu') is False:
                _log.warning('pip could not force-reinstall `polars-lts-cpu`; the polars install may be inconsistent')

            # Drop any already-loaded polars modules so the next import
            # picks up the freshly-written files instead of cached state.
            for _mod in [m for m in list(sys.modules) if m == 'polars' or m.startswith('polars.')]:
                sys.modules.pop(_mod, None)
    except Exception:
        # Still best-effort: record what went wrong, then let the import
        # below surface the underlying issue with a real traceback.
        _log.warning('polars-lts-cpu remediation failed; continuing with the installed polars', exc_info=True)
Comment on lines +51 to +77

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Fail fast when polars remediation does not succeed.

The x86_64 remediation path treats uninstall/reinstall as best-effort and swallows failures. Because pip(...) returns a boolean, a failed cleanup can silently leave incompatible polars active, which reintroduces the crash class this module is meant to prevent.

Suggested fix
 if _NEEDS_LTS:
     try:
         import importlib.metadata as _md
@@
         if _has_plain_polars:
@@
-            pip('uninstall', '-y', 'polars')
-            pip('install', '--force-reinstall', '--no-deps', 'polars-lts-cpu')
+            uninstalled = pip('uninstall', '-y', 'polars')
+            installed = pip('install', '--force-reinstall', '--no-deps', 'polars-lts-cpu')
+            if not (uninstalled and installed):
+                raise RuntimeError('Failed to enforce polars-lts-cpu on x86_64 host')
@@
-    except Exception:
-        # Best-effort cleanup. If it fails, the import below will surface
-        # the underlying issue with a real traceback.
-        pass
+    except Exception as exc:
+        raise RuntimeError('Polars runtime remediation failed before import') from exc
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if _NEEDS_LTS:
try:
import importlib.metadata as _md
_has_plain_polars = False
try:
_md.version('polars')
_has_plain_polars = True
except _md.PackageNotFoundError:
# Plain `polars` not installed — only polars-lts-cpu is on disk,
# which is exactly the desired state. No cleanup needed.
pass
if _has_plain_polars:
# Plain `polars` was pulled in transitively (img2table etc.).
# Drop it and force-reinstall lts-cpu so its binary wins on disk.
pip('uninstall', '-y', 'polars')
pip('install', '--force-reinstall', '--no-deps', 'polars-lts-cpu')
# Drop any already-loaded polars modules so the next import
# picks up the freshly-written files instead of cached state.
for _mod in [m for m in list(sys.modules) if m == 'polars' or m.startswith('polars.')]:
sys.modules.pop(_mod, None)
except Exception:
# Best-effort cleanup. If it fails, the import below will surface
# the underlying issue with a real traceback.
pass
if _NEEDS_LTS:
try:
import importlib.metadata as _md
_has_plain_polars = False
try:
_md.version('polars')
_has_plain_polars = True
except _md.PackageNotFoundError:
# Plain `polars` not installed — only polars-lts-cpu is on disk,
# which is exactly the desired state. No cleanup needed.
pass
if _has_plain_polars:
# Plain `polars` was pulled in transitively (img2table etc.).
# Drop it and force-reinstall lts-cpu so its binary wins on disk.
uninstalled = pip('uninstall', '-y', 'polars')
installed = pip('install', '--force-reinstall', '--no-deps', 'polars-lts-cpu')
if not (uninstalled and installed):
raise RuntimeError('Failed to enforce polars-lts-cpu on x86_64 host')
# Drop any already-loaded polars modules so the next import
# picks up the freshly-written files instead of cached state.
for _mod in [m for m in list(sys.modules) if m == 'polars' or m.startswith('polars.')]:
sys.modules.pop(_mod, None)
except Exception as exc:
raise RuntimeError('Polars runtime remediation failed before import') from exc
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/ai/src/ai/common/polars/__init__.py` around lines 51 - 77, The
current x86_64 remediation block (guarded by _NEEDS_LTS) swallows all exceptions
and ignores pip(...) return values, so uninstall/install failures can leave an
incompatible polars present; change the cleanup to fail fast: after each
pip('uninstall'...) and pip('install'...) call in the remediation branch, check
the boolean return and if False raise a clear RuntimeError (or re-raise the
underlying exception) so the import fails fast; also avoid the blanket except
Exception: pass — either remove it or re-raise after logging so failures in the
remediation (within the try around importlib.metadata, pip calls, or sys.modules
manipulations) surface immediately and prevent a silent fallback to an
incompatible polars.


import polars as pl # noqa: E402

__all__ = ['pl']
1 change: 1 addition & 0 deletions packages/ai/src/ai/common/polars/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
polars-lts-cpu; platform_machine == "x86_64" or platform_machine == "AMD64"
61 changes: 61 additions & 0 deletions packages/ai/src/ai/common/ssl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# =============================================================================
# MIT License
# Copyright (c) 2026 Aparavi Software AG
# =============================================================================
"""
SSL trust store integration.

Embedded Python on Windows ships a default SSL context that loads only a
narrow subset of the Windows ROOT store (often <30 CAs in practice on
locked-down corporate machines). That breaks any model loader that
downloads weights from a public CDN — TLS validation fails with
"unable to get local issuer certificate" because the CA that signed the
server's chain isn't in the loaded subset.

This module installs `truststore` and patches Python's default SSL context
to use the OS trust store directly (SChannel on Windows, SecureTransport on
macOS, OpenSSL system roots on Linux). Effects:
- All public CAs in the OS store are trusted, not just the subset
`load_default_certs()` exposes.
- Corporate root CAs deployed via Group Policy / MDM are picked up
automatically — needed for any environment with TLS-intercepting proxies
(Zscaler, Netskope, BlueCoat, etc.).
- urllib, requests, httpx, and anything using a default SSL context all
benefit from the same patch — no per-callsite changes needed.

Usage:
import ai.common.ssl # noqa: F401 - patches default SSL context

Import this once, early, in any module that triggers HTTPS downloads.
The `ai.common.models` package imports it at the top of its __init__.py,
so any model loader is covered transitively.

If truststore can't be installed or injected (e.g. very old Python), this
module falls back to pointing OpenSSL at certifi's bundle — better than
the partial Windows store, but won't pick up corporate CAs.
"""

import os
from depends import depends

requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt'
try:
    depends(requirements)
except Exception as _depends_error:
    # A failed install (offline host, pip error, ...) must not abort this
    # import: the module promises a graceful fallback. With truststore
    # missing, the `import truststore` attempt further down fails and the
    # certifi fallback takes over.
    import logging as _logging

    _logging.getLogger(__name__).warning(
        'Could not install SSL requirements from %s (%s); continuing with fallback trust store handling',
        requirements,
        _depends_error,
    )
Comment on lines +41 to +42

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Move depends(requirements) inside the fallback try.

Lines 41-42 can raise before the fallback logic starts, so a failed truststore install aborts ai.common.ssl import entirely instead of degrading to certifi or leaving the default SSL context unchanged. Because packages/ai/src/ai/common/models/__init__.py imports this module eagerly, that turns a transient pip/network failure into a hard failure for every ai.common.models import.

Suggested fix
-requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt'
-depends(requirements)
-
 try:
+    requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt'
+    depends(requirements)
     import truststore

     truststore.inject_into_ssl()
 except Exception:

Based on learnings, import-time depends(...) in packages/ai/src/ai/**/__init__.py is intentional; the problem here is only that this call sits outside the graceful-fallback path.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt'
depends(requirements)
try:
requirements = os.path.dirname(os.path.realpath(__file__)) + '/requirements.txt'
depends(requirements)
import truststore
truststore.inject_into_ssl()
except Exception:
# fallback logic continues here
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/ai/src/ai/common/ssl/__init__.py` around lines 41 - 42, The call to
depends(requirements) is executed before the fallback try/except and can raise
during import; move the depends(requirements) invocation into the existing try
block that attempts the truststore installation so failures are caught by the
except and the module can fall back to certifi or the default SSL context.
Specifically, relocate the depends(requirements) call so it runs inside the same
try that wraps truststore installation (referencing the requirements variable
and the truststore installation logic) and do not call depends at module import
level outside that try/except.

Source: Learnings


# Preferred path: route Python's default SSL context through the OS trust
# store via truststore. Every failure below is logged rather than raised, so
# importing this module never breaks the caller; it only degrades trust-store
# coverage.
try:
    import truststore

    truststore.inject_into_ssl()
except Exception as _truststore_error:
    import logging as _logging

    _log = _logging.getLogger(__name__)
    _log.warning(
        'truststore could not be injected (%s); falling back to the certifi CA bundle',
        _truststore_error,
    )

    # Fallback: point Python at certifi's CA bundle. Catches the "embedded
    # Python's default trust store is too small" case but won't help with
    # corporate TLS interception. Better than nothing.
    try:
        import certifi

        # Resolve the bundle path once; setdefault keeps any value the
        # operator has already exported.
        _ca_bundle = certifi.where()
        os.environ.setdefault('SSL_CERT_FILE', _ca_bundle)
        os.environ.setdefault('REQUESTS_CA_BUNDLE', _ca_bundle)
    except Exception as _certifi_error:
        # Both truststore and certifi fallback failed. Leave the default SSL
        # context untouched — downstream HTTPS calls will surface their own
        # error with a real traceback if validation fails.
        _log.warning(
            'certifi fallback failed (%s); leaving the default SSL context unchanged',
            _certifi_error,
        )
1 change: 1 addition & 0 deletions packages/ai/src/ai/common/ssl/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
truststore
Loading