diff --git a/packages/ai/src/ai/common/models/audio/whisper.py b/packages/ai/src/ai/common/models/audio/whisper.py
index 40546efa7..7481a0fca 100644
--- a/packages/ai/src/ai/common/models/audio/whisper.py
+++ b/packages/ai/src/ai/common/models/audio/whisper.py
@@ -262,8 +262,20 @@ def load(
                     compute_type=compute_type,
                 )
         except Exception as e:
-            logger.error(f'Failed to load whisper model: {e}')
-            raise Exception(f'Failed to load whisper model {model_name}: {e}')
+            if torch_device != 'cpu':
+                logger.warning(f'Whisper GPU load failed ({e}), falling back to CPU')
+                torch_device = 'cpu'
+                gpu_index = -1
+                if compute_type == 'float16':
+                    compute_type = 'int8'
+                try:
+                    model = WhisperModel(model_name, device='cpu', compute_type=compute_type)
+                except Exception as cpu_e:
+                    logger.error(f'Failed to load whisper model on CPU: {cpu_e}')
+                    raise Exception(f'Failed to load whisper model {model_name}: {cpu_e}')
+            else:
+                logger.error(f'Failed to load whisper model: {e}')
+                raise Exception(f'Failed to load whisper model {model_name}: {e}')
 
         # Bundle model
         model_bundle = {
diff --git a/packages/ai/src/ai/common/models/gliner/gliner.py b/packages/ai/src/ai/common/models/gliner/gliner.py
index e1f1c4ec5..804d33afe 100644
--- a/packages/ai/src/ai/common/models/gliner/gliner.py
+++ b/packages/ai/src/ai/common/models/gliner/gliner.py
@@ -90,12 +90,17 @@ def load(
             model.eval()
         else:
             # === LOCAL MODE: Load directly to specified device ===
-            if device is None:
-                # Auto-detect
-                from ai.common.torch import torch
+            from ai.common.torch import torch, probe_cuda
 
+            if device is None:
                 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 
+            if 'cuda' in str(device):
+                dev_idx = int(device.split(':')[1]) if ':' in str(device) else 0
+                if not probe_cuda(dev_idx):
+                    logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU')
+                    device = 'cpu'
+
             logger.info(f'Loading GLiNER {model_name} to {device}')
             model = GLiNERModel.from_pretrained(model_name)
             model.to(device)
diff --git a/packages/ai/src/ai/common/models/ocr/doctr.py b/packages/ai/src/ai/common/models/ocr/doctr.py
index 3b5f9688b..eba461def 100644
--- a/packages/ai/src/ai/common/models/ocr/doctr.py
+++ b/packages/ai/src/ai/common/models/ocr/doctr.py
@@ -64,7 +64,7 @@ def load(
         from ai.common.opencv import cv2  # noqa: F401
 
         from doctr.models import ocr_predictor
-        from ai.common.torch import torch
+        from ai.common.torch import torch, probe_cuda
 
         exclude_gpus = exclude_gpus or []
         memory_gb = 2.0
@@ -86,6 +86,11 @@ def load(
                 gpu_index = 0
                 torch_device = 'cuda:0'
 
+            if torch_device != 'cpu' and not probe_cuda(gpu_index):
+                logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU')
+                torch_device = 'cpu'
+                gpu_index = -1
+
         logger.info(f'Loading docTR with det={detection_model}, rec={recognition_model}')
 
         try:
diff --git a/packages/ai/src/ai/common/models/ocr/easyocr.py b/packages/ai/src/ai/common/models/ocr/easyocr.py
index 00e08212c..4fbd6e1eb 100644
--- a/packages/ai/src/ai/common/models/ocr/easyocr.py
+++ b/packages/ai/src/ai/common/models/ocr/easyocr.py
@@ -97,7 +97,7 @@ def load(
         from ai.common.opencv import cv2  # noqa: F401
 
         import easyocr
-        from ai.common.torch import torch
+        from ai.common.torch import torch, probe_cuda
 
         languages = languages or ['en']
         exclude_gpus = exclude_gpus or []
@@ -126,6 +126,12 @@ def load(
                 torch_device = 'cuda:0'
                 use_gpu = True
 
+        if use_gpu and not probe_cuda(gpu_index):
+            logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU for EasyOCR')
+            use_gpu = False
+            gpu_index = -1
+            torch_device = 'cpu'
+
         logger.info(f'Loading EasyOCR with languages {languages} on {torch_device}')
 
         try:
@@ -135,8 +141,18 @@ def load(
                 verbose=False,
             )
         except Exception as e:
-            logger.error(f'Failed to load EasyOCR: {e}')
-            raise Exception(f'Failed to load EasyOCR: {e}')
+            if use_gpu:
+                logger.warning(f'EasyOCR GPU load failed ({e}), falling back to CPU')
+                gpu_index = -1
+                torch_device = 'cpu'
+                try:
+                    reader = easyocr.Reader(languages, gpu=False, verbose=False)
+                except Exception as cpu_e:
+                    logger.error(f'Failed to load EasyOCR: {cpu_e}')
+                    raise Exception(f'Failed to load EasyOCR: {cpu_e}')
+            else:
+                logger.error(f'Failed to load EasyOCR: {e}')
+                raise Exception(f'Failed to load EasyOCR: {e}')
 
         # EasyOCR wraps its detector and recognizer in DataParallel, which
         # scatters every batch across ALL visible GPUs via parallel_apply().
diff --git a/packages/ai/src/ai/common/models/ocr/surya.py b/packages/ai/src/ai/common/models/ocr/surya.py
index 2300eb27f..1f1056a52 100644
--- a/packages/ai/src/ai/common/models/ocr/surya.py
+++ b/packages/ai/src/ai/common/models/ocr/surya.py
@@ -68,7 +68,7 @@ def load(
         from surya.foundation import FoundationPredictor  # contract-check: ignore  see comment above
         from surya.recognition import RecognitionPredictor  # contract-check: ignore  see comment above
         from surya.detection import DetectionPredictor  # contract-check: ignore  see comment above
-        from ai.common.torch import torch
+        from ai.common.torch import torch, probe_cuda
 
         languages = languages or ['en']
         exclude_gpus = exclude_gpus or []
@@ -91,6 +91,11 @@ def load(
                 gpu_index = 0
                 torch_device = 'cuda:0'
 
+            if torch_device != 'cpu' and not probe_cuda(gpu_index):
+                logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU')
+                torch_device = 'cpu'
+                gpu_index = -1
+
         logger.info(f'Loading Surya OCR on {torch_device}')
 
         try:
diff --git a/packages/ai/src/ai/common/models/ocr/trocr.py b/packages/ai/src/ai/common/models/ocr/trocr.py
index 27706aa04..729b1cfc9 100644
--- a/packages/ai/src/ai/common/models/ocr/trocr.py
+++ b/packages/ai/src/ai/common/models/ocr/trocr.py
@@ -89,7 +89,7 @@ def load(
         # disable contract check for craft_text_detector due to opencv conflict (see README)
         from craft_text_detector import Craft  # contract-check: ignore  requirements_trocr.txt is `disable`d
         from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-        from ai.common.torch import torch
+        from ai.common.torch import torch, probe_cuda
 
         exclude_gpus = exclude_gpus or []
 
@@ -113,6 +113,11 @@ def load(
                 gpu_index = 0
                 torch_device = 'cuda:0'
 
+            if torch_device != 'cpu' and not probe_cuda(gpu_index):
+                logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU')
+                torch_device = 'cpu'
+                gpu_index = -1
+
         logger.info(f'Loading TrOCR pipeline on {torch_device}')
 
         try:
diff --git a/packages/ai/src/ai/common/models/transformers/sentence_transformers.py b/packages/ai/src/ai/common/models/transformers/sentence_transformers.py
index 2e2617316..3af2adbe8 100644
--- a/packages/ai/src/ai/common/models/transformers/sentence_transformers.py
+++ b/packages/ai/src/ai/common/models/transformers/sentence_transformers.py
@@ -97,12 +97,17 @@ def load(
             model.eval()
         else:
             # === LOCAL MODE: Load directly to specified device ===
-            if device is None:
-                # Auto-detect
-                from ai.common.torch import torch
+            from ai.common.torch import torch, probe_cuda
 
+            if device is None:
                 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 
+            if 'cuda' in str(device):
+                dev_idx = int(device.split(':')[1]) if ':' in str(device) else 0
+                if not probe_cuda(dev_idx):
+                    logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU')
+                    device = 'cpu'
+
             logger.info(f'Loading SentenceTransformer {model_name} to {device}')
             model = ST(model_name_or_path=model_name, device=device, **kwargs)
             model.eval()
diff --git a/packages/ai/src/ai/common/models/transformers/transformers.py b/packages/ai/src/ai/common/models/transformers/transformers.py
index 840f2d727..729848ff6 100644
--- a/packages/ai/src/ai/common/models/transformers/transformers.py
+++ b/packages/ai/src/ai/common/models/transformers/transformers.py
@@ -118,7 +118,7 @@ def _load_model(
     ) -> Tuple[Any, Dict[str, Any], int]:
         """Load a transformers model with CPU-first loading."""
         from transformers import AutoModel
-        from ai.common.torch import torch
+        from ai.common.torch import torch, probe_cuda
 
         # Enable trust_remote_code by default (can be overridden via kwargs)
         kwargs.setdefault('trust_remote_code', True)
@@ -157,6 +157,12 @@ def _load_model(
             if device is None:
                 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
 
+            if 'cuda' in str(device):
+                dev_idx = int(device.split(':')[1]) if ':' in str(device) else 0
+                if not probe_cuda(dev_idx):
+                    logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU')
+                    device = 'cpu'
+
             # Load directly to device
             model = ModelClass.from_pretrained(model_name, **kwargs)
             model = model.to(device)
@@ -190,7 +196,7 @@ def _load_pipeline(
     ) -> Tuple[Any, Dict[str, Any], int]:
         """Load a transformers pipeline."""
         from transformers import pipeline as hf_pipeline
-        from ai.common.torch import torch
+        from ai.common.torch import torch, probe_cuda
 
         # Enable trust_remote_code by default (can be overridden via kwargs)
         kwargs.setdefault('trust_remote_code', True)
@@ -217,6 +223,10 @@ def _load_pipeline(
                 elif device == 'cuda':
                     device = 0
 
+            if device >= 0 and not probe_cuda(device):
+                logger.warning(f'CUDA device {device} kernel probe failed, falling back to CPU')
+                device = -1
+
             pipe = hf_pipeline(task=task, model=model_name, device=device, **kwargs)
             gpu_index = device if device >= 0 else -1
             memory_gb = TransformersLoader._estimate_memory(model_name, task=task)
diff --git a/packages/ai/src/ai/common/models/vision/vision.py b/packages/ai/src/ai/common/models/vision/vision.py
index 172e25c53..5898c662f 100644
--- a/packages/ai/src/ai/common/models/vision/vision.py
+++ b/packages/ai/src/ai/common/models/vision/vision.py
@@ -59,7 +59,7 @@ def load(
         """
         VisionLoader._ensure_dependencies()
 
-        from ai.common.torch import torch
+        from ai.common.torch import torch, probe_cuda
 
         variant = (variant or 'clip').lower()
         if variant not in ('clip', 'vit'):
@@ -77,6 +77,11 @@ def load(
         else:
             if device is None:
                 device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
+            if 'cuda' in str(device):
+                dev_idx = int(device.split(':')[1]) if ':' in str(device) else 0
+                if not probe_cuda(dev_idx):
+                    logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU')
+                    device = 'cpu'
             gpu_index = int(device.split(':')[1]) if ':' in str(device) else (0 if device == 'cuda' else -1)
 
         if variant == 'clip':
diff --git a/packages/ai/src/ai/common/torch/__init__.py b/packages/ai/src/ai/common/torch/__init__.py
index 1492eb538..02779d322 100644
--- a/packages/ai/src/ai/common/torch/__init__.py
+++ b/packages/ai/src/ai/common/torch/__init__.py
@@ -14,4 +14,26 @@
 else:
     debug('    GPU processing disabled. Recommend using GPU for better performance.')
 
-__all__ = ['torch']
+
+def probe_cuda(device_index: int = 0) -> bool:
+    """Return True if CUDA compute kernels work on device_index, False otherwise.
+
+    A tiny GEMM followed by synchronize() surfaces asynchronous CUDA errors such
+    as cudaErrorNoKernelImageForDevice (PyTorch build lacks a kernel binary for
+    the device's compute capability, e.g. Pascal sm_61 on a Quadro P620) here
+    instead of at the first inference call; the reason is reported via debug().
+    """
+    if not torch.cuda.is_available():
+        return False
+    try:
+        d = f'cuda:{device_index}'
+        a = torch.randn(2, 2, device=d)
+        _ = a @ a  # GEMM forces a compute kernel onto the device
+        torch.cuda.synchronize(d)
+        return True
+    except Exception as e:  # best-effort probe: any failure means "avoid this GPU"
+        debug(f'    CUDA probe failed on cuda:{device_index}: {e}')
+        return False
+
+
+__all__ = ['torch', 'probe_cuda']