Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions default.env
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ ALIBABA_API_KEY=
ANTHROPIC_API_KEY=
HF_TOKEN=
HUGGINGFACE_PRV_ENDPOINT=
MINIMAX_API_KEY=
MINIMAX_BASE_URL=
ROBOT_IP=
CONN_TYPE=webrtc
WEBRTC_SERVER_HOST=0.0.0.0
Expand Down
2 changes: 2 additions & 0 deletions dimos/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def pytest_configure(config):
config.addinivalue_line("markers", "skipif_in_ci: skip when CI env var is set")
config.addinivalue_line("markers", "skipif_no_openai: skip when OPENAI_API_KEY is not set")
config.addinivalue_line("markers", "skipif_no_alibaba: skip when ALIBABA_API_KEY is not set")
config.addinivalue_line("markers", "skipif_no_minimax: skip when MINIMAX_API_KEY is not set")
config.addinivalue_line("markers", "skipif_no_ros: skip when ROS dependencies are not present")
config.addinivalue_line("markers", "skipif_macos_bug: skip known-buggy tests on macOS")
config.addinivalue_line("markers", "skipif_macos: skip tests not intended to run on macOS")
Expand Down Expand Up @@ -135,6 +136,7 @@ def pytest_collection_modifyitems(config, items):
"skipif_in_ci": (bool(os.getenv("CI")), "Skipped in CI"),
"skipif_no_openai": (not os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not set"),
"skipif_no_alibaba": (not os.getenv("ALIBABA_API_KEY"), "ALIBABA_API_KEY not set"),
"skipif_no_minimax": (not os.getenv("MINIMAX_API_KEY"), "MINIMAX_API_KEY not set"),
"skipif_no_ros": (not _has_ros(), "ROS dependencies are not present"),
"skipif_macos_bug": (_is_macos(), "Some tests are buggy on Mac OS"),
"skipif_macos": (_is_macos(), "Not intended to run on macOS"),
Expand Down
33 changes: 33 additions & 0 deletions dimos/models/vl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,39 @@

This provides vision language model implementations for processing images and text queries.

## MiniMax VLM

The `MiniMaxVlModel` class provides access to MiniMax's M3 chat model (image
input + text) via the OpenAI-compatible endpoint.

**Prerequisites:**

```bash
export MINIMAX_API_KEY="your_api_key_here"
# Optional: override the default base URL (https://api.minimax.io/v1)
# export MINIMAX_BASE_URL="https://api.minimax.io/v1"
```

### Example Usage

```python
from dimos.models.vl.minimax import MiniMaxVlModel
from dimos.msgs.sensor_msgs.Image import Image

# Initialize the model (requires MINIMAX_API_KEY)
model = MiniMaxVlModel()

image = Image.from_file("path/to/your/image.jpg")
response = model.query(image, "What do you see in this image?")
print(response)
```

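Notes:
- `response_format` is not supported by MiniMax and is dropped with a warning
(use prompt-driven JSON extraction at a higher layer instead).
- `temperature` is hard-pinned to `1.0` (MiniMax rejects `0`).
- The default model is `MiniMax-M3` (512K context, image input).
- `query_batch` sends all images in a single request and returns that one
response repeated once per image; it does not produce independent per-image answers.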

## QwenVL Model

The `QwenVlModel` class provides access to Alibaba's Qwen2.5-VL model for vision-language tasks.
Expand Down
170 changes: 170 additions & 0 deletions dimos/models/vl/minimax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""MiniMax vision-language model.

Uses MiniMax's OpenAI-compatible chat completions API. MiniMax exposes
``https://api.minimax.io/v1`` and accepts the same ``client.chat.completions.create``
shape as OpenAI, so the same ``openai`` SDK can target it via a custom
``base_url``.

Reference: https://platform.minimax.io/docs/api-reference/text-openai-api
"""

from functools import cached_property
import os
from typing import Any

import numpy as np
from openai import OpenAI

from dimos.models.vl.base import VlModel, VlModelConfig
from dimos.msgs.sensor_msgs.Image import Image
from dimos.utils.logging_config import setup_logger

logger = setup_logger()

# Default MiniMax API base URL. Override with ``MINIMAX_BASE_URL`` if you
# need to use the domestic endpoint or a proxy.
DEFAULT_MINIMAX_BASE_URL = "https://api.minimax.io/v1"

# Only M3 is supported — the latest MiniMax chat model with image input.
DEFAULT_MINIMAX_MODEL = "MiniMax-M3"


class MiniMaxVlModelConfig(VlModelConfig):
    """Configuration for the MiniMax VLM."""

    # Model identifier sent as ``model`` in every chat-completions request.
    model_name: str = DEFAULT_MINIMAX_MODEL
    # Explicit API key; when unset, the model falls back to ``MINIMAX_API_KEY``.
    api_key: str | None = None
    # Explicit endpoint; when unset, the model falls back to ``MINIMAX_BASE_URL``
    # and then to ``DEFAULT_MINIMAX_BASE_URL``.
    base_url: str | None = None


class MiniMaxVlModel(VlModel):
"""Vision-language model backed by MiniMax's OpenAI-compatible endpoint.

Auth: set ``MINIMAX_API_KEY`` (or pass ``api_key`` explicitly).
Optional override: ``MINIMAX_BASE_URL`` (default: ``https://api.minimax.io/v1``).
"""

config: MiniMaxVlModelConfig

@cached_property
def _client(self) -> OpenAI:
    """Build the OpenAI SDK client pointed at the MiniMax endpoint.

    As a ``cached_property`` the client is constructed on first access and
    then reused for the lifetime of the cached value.

    Resolution order:
      * API key: ``config.api_key``, then the ``MINIMAX_API_KEY`` variable.
      * Base URL: ``config.base_url``, then the ``MINIMAX_BASE_URL`` variable,
        then ``DEFAULT_MINIMAX_BASE_URL``.

    Raises:
        ValueError: If no API key is configured or present in the environment.
    """
    api_key = self.config.api_key or os.getenv("MINIMAX_API_KEY")
    if not api_key:
        raise ValueError(
            "MiniMax API key must be provided or set in MINIMAX_API_KEY environment variable"
        )

    # Chain with ``or`` instead of using ``os.getenv``'s default argument: a
    # variable that is set but empty (e.g. a bare ``MINIMAX_BASE_URL=`` line in
    # an env file) would otherwise yield "" and be passed to the SDK as the URL.
    base_url = (
        self.config.base_url or os.getenv("MINIMAX_BASE_URL") or DEFAULT_MINIMAX_BASE_URL
    )

    return OpenAI(api_key=api_key, base_url=base_url)

def query(
    self,
    image: Image | np.ndarray,
    query: str,
    response_format: dict[str, Any] | None = None,
    **kwargs: Any,
) -> str:
    """Ask MiniMax a question about a single image.

    Args:
        image: Image to analyse. A numpy array is converted with
            ``Image.from_numpy`` after emitting a ``DeprecationWarning``.
        query: Text prompt sent alongside the image.
        response_format: Never forwarded to the API; a warning is logged
            when it is set.
        **kwargs: Accepted but not forwarded to the API.

    Returns:
        The text of the first completion choice, or ``""`` when the API
        returns no text content (``message.content`` is ``None``).
    """
    if isinstance(image, np.ndarray):
        import warnings

        warnings.warn(
            "MiniMaxVlModel.query should receive standard dimos Image type, not a numpy array",
            DeprecationWarning,
            stacklevel=2,
        )

        image = Image.from_numpy(image)

    # Apply auto_resize if configured
    image, _ = self._prepare_image(image)

    img_base64 = image.to_base64()

    # ``response_format`` is not supported by MiniMax and will be rejected
    # with a 4xx; if a caller asked for JSON output we have to fall back
    # to a prompt-driven approach at a higher layer.
    if response_format:
        logger.warning(
            "MiniMax does not support response_format; ignoring and relying on prompt."
        )

    api_kwargs: dict[str, Any] = {
        "model": self.config.model_name,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_base64}"},
                    },
                    {"type": "text", "text": query},
                ],
            }
        ],
        # MiniMax requires temperature in (0.0, 1.0]; 0 is rejected.
        "temperature": 1.0,
    }

    response = self._client.chat.completions.create(**api_kwargs)

    # ``content`` may be None (e.g. an empty completion); honour the ``-> str``
    # contract the same way ``query_batch`` does instead of leaking None.
    text: str = response.choices[0].message.content or ""
    return text
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 response.choices[0].message.content is typed str | None in the OpenAI SDK. If the model returns a tool call or an empty completion the value will be None, and any downstream caller that treats the return as a plain str (e.g., query_json, caption, query_detections) will raise AttributeError. The sibling query_batch already guards against this with or ""; query should do the same.

Suggested change
return response.choices[0].message.content # type: ignore[no-any-return]
return response.choices[0].message.content or ""


def query_batch(
    self,
    images: list[Image],
    query: str,
    response_format: dict[str, Any] | None = None,
    **kwargs: Any,
) -> list[str]:
    """Send every image plus the prompt to MiniMax in one request.

    All images travel in a single user message, so the model produces one
    answer for the whole set. That answer is repeated so the returned list
    has the same length as ``images``. An empty ``images`` list returns
    ``[]`` without calling the API.

    ``response_format`` is never forwarded (a warning is logged when it is
    set) and ``**kwargs`` are accepted but unused.
    """
    if not images:
        return []

    if response_format:
        logger.warning(
            "MiniMax does not support response_format; ignoring and relying on prompt."
        )

    # One ``image_url`` part per image, in input order, followed by the prompt.
    parts: list[dict[str, Any]] = []
    for frame in images:
        encoded = self._prepare_image(frame)[0].to_base64()
        parts.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{encoded}"},
            }
        )
    parts.append({"type": "text", "text": query})

    request: dict[str, Any] = {
        "model": self.config.model_name,
        "messages": [{"role": "user", "content": parts}],
        "temperature": 1.0,
    }

    completion = self._client.chat.completions.create(**request)
    answer = completion.choices[0].message.content or ""
    # The API analysed all images together, so each image gets the same text.
    return [answer for _ in images]
Comment on lines +128 to +165
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 query_batch sends all images to the model in a single call and then replicates the one response — [response_text] * len(images) — for every image. Callers that follow the standard VlModel.query_batch contract (one independent answer per image) will silently receive the same string for every element. The comment acknowledges this, but the behavior is meaningfully different from the base-class contract and could produce incorrect results in any pipeline that routes results per-image (e.g., object-detection scoring).


def stop(self) -> None:
    """Close and discard the cached OpenAI client, if one was created.

    Safe to call repeatedly, and a no-op when ``_client`` has not been
    cached on this instance.
    """
    # Pop from ``__dict__`` directly: reading ``self._client`` would trigger
    # the cached_property and build a client only to throw it away.
    client = self.__dict__.pop("_client", None)
    if client is not None:
        # Dropping the reference alone leaves the HTTP connection pool open
        # until garbage collection; close it explicitly.
        client.close()
104 changes: 104 additions & 0 deletions dimos/models/vl/test_minimax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2026 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for the MiniMax VLM provider."""

from unittest.mock import MagicMock, patch

import pytest

from dimos.models.vl.base import VlModel
from dimos.models.vl.minimax import (
DEFAULT_MINIMAX_BASE_URL,
DEFAULT_MINIMAX_MODEL,
MiniMaxVlModel,
MiniMaxVlModelConfig,
)
from dimos.msgs.sensor_msgs.Image import Image
from dimos.utils.data import get_data


def test_config_defaults() -> None:
    """A default config selects MiniMax-M3; the module constants hold the documented values."""
    # Pin the literal constants first, then check the config picks the model up.
    assert DEFAULT_MINIMAX_MODEL == "MiniMax-M3"
    assert DEFAULT_MINIMAX_BASE_URL == "https://api.minimax.io/v1"

    default_config = MiniMaxVlModelConfig()
    assert default_config.model_name == DEFAULT_MINIMAX_MODEL


def test_missing_api_key_raises() -> None:
    """With no key in the config or the environment, building the client raises ``ValueError``."""
    with patch.dict("os.environ", {}, clear=True):
        keyless_model = MiniMaxVlModel()
        # The client is a lazily built cached_property, so the key check only
        # runs once the attribute is actually read.
        with pytest.raises(ValueError, match="MINIMAX_API_KEY"):
            _ = keyless_model._client


def test_query_uses_openai_compatible_client() -> None:
    """``query`` should make one ``chat.completions.create`` call with MiniMax-safe arguments."""
    image = Image.from_file(get_data("cafe.jpg"))

    model = MiniMaxVlModel(api_key="test-key")

    fake_response = MagicMock()
    fake_response.choices = [MagicMock(message=MagicMock(content="a person sitting"))]

    # ``_client`` is a cached_property, so an entry in the instance ``__dict__``
    # is what attribute lookup returns; no class-level patching is needed and
    # no real OpenAI client is ever constructed.
    mock_client = MagicMock()
    mock_client.chat.completions.create.return_value = fake_response
    model.__dict__["_client"] = mock_client

    result = model.query(image, "What is in the image?")

    assert result == "a person sitting"
    mock_client.chat.completions.create.assert_called_once()
    call_kwargs = mock_client.chat.completions.create.call_args.kwargs
    assert call_kwargs["model"] == "MiniMax-M3"
    # MiniMax rejects temperature=0 — default to 1.0.
    assert call_kwargs["temperature"] == 1.0
    # Should NOT pass response_format (unsupported on MiniMax).
    assert "response_format" not in call_kwargs


def test_query_batch_returns_one_response_per_image() -> None:
    """A two-image batch makes one API call and yields that answer twice."""
    batch = [Image.from_file(get_data("cafe.jpg")) for _ in range(2)]

    model = MiniMaxVlModel(api_key="test-key")

    # Stand-in completion whose first choice carries the canned answer.
    canned_reply = MagicMock()
    canned_reply.choices = [MagicMock(message=MagicMock(content="cafe interior"))]

    # Seed the cached_property slot so no real client is built.
    stub_client = MagicMock()
    stub_client.chat.completions.create.return_value = canned_reply
    model.__dict__["_client"] = stub_client

    answers = model.query_batch(batch, "describe the scene")

    assert answers == ["cafe interior", "cafe interior"]
    stub_client.chat.completions.create.assert_called_once()


def test_minimax_inherits_vlmodel_interface() -> None:
    """An instantiated ``MiniMaxVlModel`` is a ``VlModel``."""
    # Instantiation itself would fail if an abstract method were left unimplemented.
    assert isinstance(MiniMaxVlModel(api_key="test-key"), VlModel)
Loading