dimensionalOS · octo-patch · Jun 3, 2026 · greptile-apps · Jun 3, 2026 · greptile-apps
@@ -4,6 +4,8 @@ ALIBABA_API_KEY=
 ANTHROPIC_API_KEY=
 HF_TOKEN=
 HUGGINGFACE_PRV_ENDPOINT=
+MINIMAX_API_KEY=
+MINIMAX_BASE_URL=
 ROBOT_IP=
 CONN_TYPE=webrtc
 WEBRTC_SERVER_HOST=0.0.0.0

@@ -96,6 +96,7 @@ def pytest_configure(config):
     config.addinivalue_line("markers", "skipif_in_ci: skip when CI env var is set")
     config.addinivalue_line("markers", "skipif_no_openai: skip when OPENAI_API_KEY is not set")
     config.addinivalue_line("markers", "skipif_no_alibaba: skip when ALIBABA_API_KEY is not set")
+    config.addinivalue_line("markers", "skipif_no_minimax: skip when MINIMAX_API_KEY is not set")
     config.addinivalue_line("markers", "skipif_no_ros: skip when ROS dependencies are not present")
     config.addinivalue_line("markers", "skipif_macos_bug: skip known-buggy tests on macOS")
     config.addinivalue_line("markers", "skipif_macos: skip tests not intended to run on macOS")
@@ -135,6 +136,7 @@ def pytest_collection_modifyitems(config, items):
         "skipif_in_ci": (bool(os.getenv("CI")), "Skipped in CI"),
         "skipif_no_openai": (not os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not set"),
         "skipif_no_alibaba": (not os.getenv("ALIBABA_API_KEY"), "ALIBABA_API_KEY not set"),
+        "skipif_no_minimax": (not os.getenv("MINIMAX_API_KEY"), "MINIMAX_API_KEY not set"),
         "skipif_no_ros": (not _has_ros(), "ROS dependencies are not present"),
         "skipif_macos_bug": (_is_macos(), "Some tests are buggy on Mac OS"),
         "skipif_macos": (_is_macos(), "Not intended to run on macOS"),

@@ -2,6 +2,39 @@
 
 This provides vision language model implementations for processing images and text queries.
 
+## MiniMax VLM
+
+The `MiniMaxVlModel` class provides access to MiniMax's M3 chat model (image
+input + text) via the OpenAI-compatible endpoint.
+
+**Prerequisites:**
+
+```bash
+export MINIMAX_API_KEY="your_api_key_here"
+# Optional: override the default base URL (https://api.minimax.io/v1)
+# export MINIMAX_BASE_URL="https://api.minimax.io/v1"
+```
+
+### Example Usage
+
+```python
+from dimos.models.vl.minimax import MiniMaxVlModel
+from dimos.msgs.sensor_msgs.Image import Image
+
+# Initialize the model (requires MINIMAX_API_KEY)
+model = MiniMaxVlModel()
+
+image = Image.from_file("path/to/your/image.jpg")
+response = model.query(image, "What do you see in this image?")
+print(response)
+```
+
+Notes:
+- `response_format` is not supported by MiniMax and is dropped with a warning
+  (use prompt-driven JSON extraction at a higher layer instead).
+- `temperature` is hard-pinned to `1.0`: MiniMax only accepts values in
+  `(0.0, 1.0]`, so `0` is rejected. A `temperature` passed by the caller is ignored.
+- The default model is `MiniMax-M3` (512K context, image input).
+
 ## QwenVL Model
 
 The `QwenVlModel` class provides access to Alibaba's Qwen2.5-VL model for vision-language tasks.

@@ -0,0 +1,170 @@
+# Copyright 2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""MiniMax vision-language model.
+
+Uses MiniMax's OpenAI-compatible chat completions API. MiniMax exposes
+``https://api.minimax.io/v1`` and accepts the same ``client.chat.completions.create``
+shape as OpenAI, so the same ``openai`` SDK can target it via a custom
+``base_url``.
+
+Reference: https://platform.minimax.io/docs/api-reference/text-openai-api
+"""
+
+from functools import cached_property
+import os
+from typing import Any
+
+import numpy as np
+from openai import OpenAI
+
+from dimos.models.vl.base import VlModel, VlModelConfig
+from dimos.msgs.sensor_msgs.Image import Image
+from dimos.utils.logging_config import setup_logger
+
+logger = setup_logger()
+
+# Default MiniMax API base URL. Override with ``MINIMAX_BASE_URL`` if you
+# need to use the domestic endpoint or a proxy.
+DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1"
+
+# Only M3 is supported — the latest MiniMax chat model with image input.
+DEFAULT_MINIMAX_MODEL = "MiniMax-M3"
+
+
+class MiniMaxVlModelConfig(VlModelConfig):
+    """Configuration for the MiniMax VLM."""
+
+    # Model identifier sent as ``model`` in every chat-completions request.
+    model_name: str = DEFAULT_MINIMAX_MODEL
+    # Explicit API key; when unset the model reads the MINIMAX_API_KEY env var.
+    api_key: str | None = None
+    # Explicit endpoint; when unset the model reads MINIMAX_BASE_URL, then the default URL.
+    base_url: str | None = None
+
+
+class MiniMaxVlModel(VlModel):
+    """Vision-language model backed by MiniMax's OpenAI-compatible endpoint.
+
+    Auth: set ``MINIMAX_API_KEY`` (or pass ``api_key`` explicitly).
+    Optional override: ``MINIMAX_BASE_URL`` (default: ``https://api.minimax.io/v1``).
+
+    ``response_format`` and any extra keyword arguments are accepted for
+    interface compatibility but are never forwarded to the API, and
+    ``temperature`` is always sent as ``1.0``.
+    """
+
+    config: MiniMaxVlModelConfig
+
+    @cached_property
+    def _client(self) -> OpenAI:
+        """Build the OpenAI SDK client pointed at MiniMax (created on first access).
+
+        Raises:
+            ValueError: if no API key is configured and ``MINIMAX_API_KEY`` is unset or empty.
+        """
+        api_key = self.config.api_key or os.getenv("MINIMAX_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "MiniMax API key must be provided or set in MINIMAX_API_KEY environment variable"
+            )
+
+        # Chain with ``or`` instead of a getenv default: a variable that is set but
+        # empty (e.g. a blank ``MINIMAX_BASE_URL=`` line loaded from .env) must fall
+        # through to the default rather than reach the SDK as "".
+        base_url = (
+            self.config.base_url or os.getenv("MINIMAX_BASE_URL") or DEFAULT_MINIMAX_BASE_URL
+        )
+
+        return OpenAI(api_key=api_key, base_url=base_url)
+
+    def _image_part(self, image: Image) -> dict[str, Any]:
+        """Return the ``image_url`` content part for a single image."""
+        # Apply auto_resize if configured
+        prepared, _ = self._prepare_image(image)
+        # NOTE(review): the data URL declares image/png — confirm Image.to_base64() emits PNG.
+        return {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/png;base64,{prepared.to_base64()}"},
+        }
+
+    def _chat(self, content: list[dict[str, Any]]) -> str:
+        """Send one user message and return the reply text ("" when the reply has none)."""
+        api_kwargs: dict[str, Any] = {
+            "model": self.config.model_name,
+            "messages": [{"role": "user", "content": content}],
+            # MiniMax requires temperature in (0.0, 1.0]; 0 is rejected.
+            "temperature": 1.0,
+        }
+
+        response = self._client.chat.completions.create(**api_kwargs)
+
+        # Normalise a missing message body so the declared ``str`` return always holds.
+        return response.choices[0].message.content or ""
+
+    def query(
+        self,
+        image: Image | np.ndarray,
+        query: str,
+        response_format: dict[str, Any] | None = None,
+        **kwargs: Any,
+    ) -> str:
+        """Ask a question about one image and return the model's text answer.
+
+        Args:
+            image: Image to analyse. A numpy array is still accepted but deprecated.
+            query: Text prompt sent alongside the image.
+            response_format: Not supported by MiniMax; ignored with a warning.
+            **kwargs: Accepted for interface compatibility; not forwarded to the API.
+
+        Returns:
+            The reply text, or an empty string when the reply carries no content.
+        """
+        if isinstance(image, np.ndarray):
+            import warnings
+
+            warnings.warn(
+                "MiniMaxVlModel.query should receive standard dimos Image type, not a numpy array",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
+            image = Image.from_numpy(image)
+
+        image_part = self._image_part(image)
+
+        # ``response_format`` is not supported by MiniMax and will be rejected
+        # with a 4xx; if a caller asked for JSON output we have to fall back
+        # to a prompt-driven approach at a higher layer.
+        if response_format:
+            logger.warning(
+                "MiniMax does not support response_format; ignoring and relying on prompt."
+            )
+
+        return self._chat([image_part, {"type": "text", "text": query}])
+
+    def query_batch(
+        self,
+        images: list[Image],
+        query: str,
+        response_format: dict[str, Any] | None = None,
+        **kwargs: Any,
+    ) -> list[str]:
+        """Query VLM with multiple images using a single API call.
+
+        Args:
+            images: Images sent together in one user message.
+            query: Text prompt appended after the images.
+            response_format: Not supported by MiniMax; ignored with a warning.
+            **kwargs: Accepted for interface compatibility; not forwarded to the API.
+
+        Returns:
+            One entry per input image, each holding the same reply text; an
+            empty list when ``images`` is empty (no API call is made).
+        """
+        if not images:
+            return []
+
+        if response_format:
+            logger.warning(
+                "MiniMax does not support response_format; ignoring and relying on prompt."
+            )
+
+        content: list[dict[str, Any]] = [self._image_part(img) for img in images]
+        content.append({"type": "text", "text": query})
+
+        response_text = self._chat(content)
+        # Return one response per image (same response since API analyzes all images together)
+        return [response_text] * len(images)
+
+    def stop(self) -> None:
+        """Release the OpenAI client."""
+        # Pop the cached_property value so a later access builds a fresh client,
+        # and close the old one so its HTTP connections are actually released.
+        client = self.__dict__.pop("_client", None)
+        if client is not None:
+            client.close()
@@ -0,0 +1,104 @@
+# Copyright 2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for the MiniMax VLM provider."""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from dimos.models.vl.base import VlModel
+from dimos.models.vl.minimax import (
+    DEFAULT_MINIMAX_BASE_URL,
+    DEFAULT_MINIMAX_MODEL,
+    MiniMaxVlModel,
+    MiniMaxVlModelConfig,
+)
+from dimos.msgs.sensor_msgs.Image import Image
+from dimos.utils.data import get_data
+
+
+def test_config_defaults() -> None:
+    """A default config selects MiniMax-M3; the module constants keep their published values."""
+    default_config = MiniMaxVlModelConfig()
+
+    # Pin the literal values first, then check the config picks up the model constant.
+    assert DEFAULT_MINIMAX_MODEL == "MiniMax-M3"
+    assert DEFAULT_MINIMAX_BASE_URL == "https://api.minimax.io/v1"
+    assert default_config.model_name == DEFAULT_MINIMAX_MODEL
+
+
+def test_missing_api_key_raises() -> None:
+    """Building the client with no key configured raises ``ValueError``."""
+    # Start from an empty environment so MINIMAX_API_KEY cannot leak in from the host.
+    with patch.dict("os.environ", {}, clear=True):
+        keyless_model = MiniMaxVlModel()
+        with pytest.raises(ValueError, match="MINIMAX_API_KEY"):
+            # The client is a cached_property, so it is only constructed (and the
+            # key only checked) when the attribute is first read.
+            _ = keyless_model._client
+
+
+def test_query_uses_openai_compatible_client() -> None:
+    """A single-image query calls ``chat.completions.create`` with the MiniMax defaults."""
+    image = Image.from_file(get_data("cafe.jpg"))
+
+    model = MiniMaxVlModel(api_key="test-key")
+
+    fake_response = MagicMock()
+    fake_response.choices = [MagicMock(message=MagicMock(content="a person sitting"))]
+
+    # ``_client`` is a cached_property (a non-data descriptor), so an entry in the
+    # instance ``__dict__`` shadows it and no real OpenAI client is ever built.
+    mock_client = MagicMock()
+    mock_client.chat.completions.create.return_value = fake_response
+    model.__dict__["_client"] = mock_client
+
+    result = model.query(image, "What is in the image?")
+
+    assert result == "a person sitting"
+    mock_client.chat.completions.create.assert_called_once()
+    call_kwargs = mock_client.chat.completions.create.call_args.kwargs
+    assert call_kwargs["model"] == "MiniMax-M3"
+    # MiniMax rejects temperature=0 — default to 1.0.
+    assert call_kwargs["temperature"] == 1.0
+    # Should NOT pass response_format (unsupported on MiniMax).
+    assert "response_format" not in call_kwargs
+
+
+def test_query_batch_returns_one_response_per_image() -> None:
+    """A batch of N images yields N copies of the single API reply."""
+    images = [Image.from_file(get_data("cafe.jpg")) for _ in range(2)]
+
+    # Canned completion whose first choice carries the text we expect back.
+    reply = MagicMock()
+    reply.choices = [MagicMock(message=MagicMock(content="cafe interior"))]
+    stub_client = MagicMock()
+    stub_client.chat.completions.create.return_value = reply
+
+    model = MiniMaxVlModel(api_key="test-key")
+    # Seed the cached_property slot so the stub is used instead of a real client.
+    model.__dict__["_client"] = stub_client
+
+    results = model.query_batch(images, "describe the scene")
+
+    stub_client.chat.completions.create.assert_called_once()
+    assert results == ["cafe interior", "cafe interior"]
+
+
+def test_minimax_inherits_vlmodel_interface() -> None:
+    """An instantiated MiniMaxVlModel is a ``VlModel``."""
+    # Instantiation itself would fail if an abstract method were left unimplemented.
+    assert isinstance(MiniMaxVlModel(api_key="test-key"), VlModel)