diff --git a/packages/ai/src/ai/common/models/audio/whisper.py b/packages/ai/src/ai/common/models/audio/whisper.py index 40546efa7..7481a0fca 100644 --- a/packages/ai/src/ai/common/models/audio/whisper.py +++ b/packages/ai/src/ai/common/models/audio/whisper.py @@ -262,8 +262,20 @@ def load( compute_type=compute_type, ) except Exception as e: - logger.error(f'Failed to load whisper model: {e}') - raise Exception(f'Failed to load whisper model {model_name}: {e}') + if torch_device != 'cpu': + logger.warning(f'Whisper GPU load failed ({e}), falling back to CPU') + torch_device = 'cpu' + gpu_index = -1 + if compute_type == 'float16': + compute_type = 'int8' + try: + model = WhisperModel(model_name, device='cpu', compute_type=compute_type) + except Exception as cpu_e: + logger.error(f'Failed to load whisper model on CPU: {cpu_e}') + raise Exception(f'Failed to load whisper model {model_name}: {cpu_e}') + else: + logger.error(f'Failed to load whisper model: {e}') + raise Exception(f'Failed to load whisper model {model_name}: {e}') # Bundle model model_bundle = { diff --git a/packages/ai/src/ai/common/models/gliner/gliner.py b/packages/ai/src/ai/common/models/gliner/gliner.py index e1f1c4ec5..804d33afe 100644 --- a/packages/ai/src/ai/common/models/gliner/gliner.py +++ b/packages/ai/src/ai/common/models/gliner/gliner.py @@ -90,12 +90,17 @@ def load( model.eval() else: # === LOCAL MODE: Load directly to specified device === - if device is None: - # Auto-detect - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda + if device is None: device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + if 'cuda' in str(device): + dev_idx = int(device.split(':')[1]) if ':' in str(device) else 0 + if not probe_cuda(dev_idx): + logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU') + device = 'cpu' + logger.info(f'Loading GLiNER {model_name} to {device}') model = GLiNERModel.from_pretrained(model_name) model.to(device) diff --git 
a/packages/ai/src/ai/common/models/ocr/doctr.py b/packages/ai/src/ai/common/models/ocr/doctr.py index 3b5f9688b..eba461def 100644 --- a/packages/ai/src/ai/common/models/ocr/doctr.py +++ b/packages/ai/src/ai/common/models/ocr/doctr.py @@ -64,7 +64,7 @@ def load( from ai.common.opencv import cv2 # noqa: F401 from doctr.models import ocr_predictor - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda exclude_gpus = exclude_gpus or [] memory_gb = 2.0 @@ -86,6 +86,11 @@ def load( gpu_index = 0 torch_device = 'cuda:0' + if torch_device != 'cpu' and not probe_cuda(gpu_index): + logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU') + torch_device = 'cpu' + gpu_index = -1 + logger.info(f'Loading docTR with det={detection_model}, rec={recognition_model}') try: diff --git a/packages/ai/src/ai/common/models/ocr/easyocr.py b/packages/ai/src/ai/common/models/ocr/easyocr.py index 00e08212c..4fbd6e1eb 100644 --- a/packages/ai/src/ai/common/models/ocr/easyocr.py +++ b/packages/ai/src/ai/common/models/ocr/easyocr.py @@ -97,7 +97,7 @@ def load( from ai.common.opencv import cv2 # noqa: F401 import easyocr - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda languages = languages or ['en'] exclude_gpus = exclude_gpus or [] @@ -126,6 +126,12 @@ def load( torch_device = 'cuda:0' use_gpu = True + if use_gpu and not probe_cuda(gpu_index): + logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU for EasyOCR') + use_gpu = False + gpu_index = -1 + torch_device = 'cpu' + logger.info(f'Loading EasyOCR with languages {languages} on {torch_device}') try: @@ -135,8 +141,18 @@ def load( verbose=False, ) except Exception as e: - logger.error(f'Failed to load EasyOCR: {e}') - raise Exception(f'Failed to load EasyOCR: {e}') + if use_gpu: + logger.warning(f'EasyOCR GPU load failed ({e}), falling back to CPU') + gpu_index = -1 + torch_device = 'cpu' + try: + reader = 
easyocr.Reader(languages, gpu=False, verbose=False) + except Exception as cpu_e: + logger.error(f'Failed to load EasyOCR: {cpu_e}') + raise Exception(f'Failed to load EasyOCR: {cpu_e}') + else: + logger.error(f'Failed to load EasyOCR: {e}') + raise Exception(f'Failed to load EasyOCR: {e}') # EasyOCR wraps its detector and recognizer in DataParallel, which # scatters every batch across ALL visible GPUs via parallel_apply(). diff --git a/packages/ai/src/ai/common/models/ocr/surya.py b/packages/ai/src/ai/common/models/ocr/surya.py index 2300eb27f..1f1056a52 100644 --- a/packages/ai/src/ai/common/models/ocr/surya.py +++ b/packages/ai/src/ai/common/models/ocr/surya.py @@ -68,7 +68,7 @@ def load( from surya.foundation import FoundationPredictor # contract-check: ignore see comment above from surya.recognition import RecognitionPredictor # contract-check: ignore see comment above from surya.detection import DetectionPredictor # contract-check: ignore see comment above - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda languages = languages or ['en'] exclude_gpus = exclude_gpus or [] @@ -91,6 +91,11 @@ def load( gpu_index = 0 torch_device = 'cuda:0' + if torch_device != 'cpu' and not probe_cuda(gpu_index): + logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU') + torch_device = 'cpu' + gpu_index = -1 + logger.info(f'Loading Surya OCR on {torch_device}') try: diff --git a/packages/ai/src/ai/common/models/ocr/trocr.py b/packages/ai/src/ai/common/models/ocr/trocr.py index 27706aa04..729b1cfc9 100644 --- a/packages/ai/src/ai/common/models/ocr/trocr.py +++ b/packages/ai/src/ai/common/models/ocr/trocr.py @@ -89,7 +89,7 @@ def load( # disable contract check for craft_text_detector due to opencv conflict (see README) from craft_text_detector import Craft # contract-check: ignore requirements_trocr.txt is `disable`d from transformers import TrOCRProcessor, VisionEncoderDecoderModel - from ai.common.torch import torch + 
from ai.common.torch import torch, probe_cuda exclude_gpus = exclude_gpus or [] @@ -113,6 +113,11 @@ def load( gpu_index = 0 torch_device = 'cuda:0' + if torch_device != 'cpu' and not probe_cuda(gpu_index): + logger.warning(f'CUDA device {gpu_index} kernel probe failed, falling back to CPU') + torch_device = 'cpu' + gpu_index = -1 + logger.info(f'Loading TrOCR pipeline on {torch_device}') try: diff --git a/packages/ai/src/ai/common/models/transformers/sentence_transformers.py b/packages/ai/src/ai/common/models/transformers/sentence_transformers.py index 2e2617316..3af2adbe8 100644 --- a/packages/ai/src/ai/common/models/transformers/sentence_transformers.py +++ b/packages/ai/src/ai/common/models/transformers/sentence_transformers.py @@ -97,12 +97,17 @@ def load( model.eval() else: # === LOCAL MODE: Load directly to specified device === - if device is None: - # Auto-detect - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda + if device is None: device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + if 'cuda' in str(device): + dev_idx = int(device.split(':')[1]) if ':' in str(device) else 0 + if not probe_cuda(dev_idx): + logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU') + device = 'cpu' + logger.info(f'Loading SentenceTransformer {model_name} to {device}') model = ST(model_name_or_path=model_name, device=device, **kwargs) model.eval() diff --git a/packages/ai/src/ai/common/models/transformers/transformers.py b/packages/ai/src/ai/common/models/transformers/transformers.py index 840f2d727..729848ff6 100644 --- a/packages/ai/src/ai/common/models/transformers/transformers.py +++ b/packages/ai/src/ai/common/models/transformers/transformers.py @@ -118,7 +118,7 @@ def _load_model( ) -> Tuple[Any, Dict[str, Any], int]: """Load a transformers model with CPU-first loading.""" from transformers import AutoModel - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda # Enable 
trust_remote_code by default (can be overridden via kwargs) kwargs.setdefault('trust_remote_code', True) @@ -157,6 +157,12 @@ def _load_model( if device is None: device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + if 'cuda' in str(device): + dev_idx = int(device.split(':')[1]) if ':' in str(device) else 0 + if not probe_cuda(dev_idx): + logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU') + device = 'cpu' + # Load directly to device model = ModelClass.from_pretrained(model_name, **kwargs) model = model.to(device) @@ -190,7 +196,7 @@ def _load_pipeline( ) -> Tuple[Any, Dict[str, Any], int]: """Load a transformers pipeline.""" from transformers import pipeline as hf_pipeline - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda # Enable trust_remote_code by default (can be overridden via kwargs) kwargs.setdefault('trust_remote_code', True) @@ -217,6 +223,10 @@ def _load_pipeline( elif device == 'cuda': device = 0 + if device >= 0 and not probe_cuda(device): + logger.warning(f'CUDA device {device} kernel probe failed, falling back to CPU') + device = -1 + pipe = hf_pipeline(task=task, model=model_name, device=device, **kwargs) gpu_index = device if device >= 0 else -1 memory_gb = TransformersLoader._estimate_memory(model_name, task=task) diff --git a/packages/ai/src/ai/common/models/vision/vision.py b/packages/ai/src/ai/common/models/vision/vision.py index 172e25c53..5898c662f 100644 --- a/packages/ai/src/ai/common/models/vision/vision.py +++ b/packages/ai/src/ai/common/models/vision/vision.py @@ -59,7 +59,7 @@ def load( """ VisionLoader._ensure_dependencies() - from ai.common.torch import torch + from ai.common.torch import torch, probe_cuda variant = (variant or 'clip').lower() if variant not in ('clip', 'vit'): @@ -77,6 +77,11 @@ def load( else: if device is None: device = 'cuda:0' if torch.cuda.is_available() else 'cpu' + if 'cuda' in str(device): + dev_idx = int(device.split(':')[1]) if ':' in 
str(device) else 0 + if not probe_cuda(dev_idx): + logger.warning(f'CUDA device {dev_idx} kernel probe failed, falling back to CPU') + device = 'cpu' gpu_index = int(device.split(':')[1]) if ':' in str(device) else (0 if device == 'cuda' else -1) if variant == 'clip':
diff --git a/packages/ai/src/ai/common/torch/__init__.py b/packages/ai/src/ai/common/torch/__init__.py
index 1492eb538..02779d322 100644
--- a/packages/ai/src/ai/common/torch/__init__.py
+++ b/packages/ai/src/ai/common/torch/__init__.py
@@ -14,4 +14,36 @@
 else:
     debug(' GPU processing disabled. Recommend using GPU for better performance.')
 
-__all__ = ['torch']
+
+def probe_cuda(device_index: int = 0) -> bool:
+    """Return True if CUDA compute kernels work on device_index, False otherwise.
+
+    Catches cudaErrorNoKernelImageForDevice that surfaces when the PyTorch build
+    does not include a kernel binary for the device's compute capability (e.g.
+    Pascal sm_61 on a Quadro P620). The probe executes a tiny GEMM and then
+    calls synchronize() so any async CUDA error is raised here rather than
+    silently deferred to the first real inference call.
+
+    Args:
+        device_index: CUDA device ordinal to probe; used to build 'cuda:<index>'.
+
+    Returns:
+        True when the GEMM and synchronize() complete without raising. False
+        when torch.cuda.is_available() is False or when any probe step raises.
+    """
+    if not torch.cuda.is_available():
+        return False
+    try:
+        d = f'cuda:{device_index}'
+        a = torch.randn(2, 2, device=d)
+        _ = a @ a  # GEMM forces a compute kernel onto the device
+        torch.cuda.synchronize(d)
+    except Exception as e:
+        # Every failure still maps to False (callers fall back to CPU), but the
+        # cause is reported instead of being discarded.
+        debug(f'CUDA probe failed on device {device_index}: {e}')
+        return False
+    return True
+
+
+__all__ = ['torch', 'probe_cuda']