diff --git a/README.md b/README.md
index 6c1275928..c597da528 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@
 
 ## Latest News
 
+* 06/30/2026 7.1.0-dev `main`: ✨ Added `minimax_m3_vl` / MiniMax M3 model support
 * 06/25/2026 7.1.0-dev `main`: ✨ Added `cohere2_moe` model support
 * 05/25/2026 7.1.0-dev `main`: ✨ Added `hy_3` and `ministral3` model support
 * 05/25/2026 7.1.0-dev `main`: ✨ Added `hunyuan_v1_dense` and `hunyuan_v1_moe` model support
@@ -266,7 +267,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode
 | Dream                    | ✅ | GRIN-MoE                        | ✅ | Instella         | ✅ | Phi 1-4                         | ✅ | Voxtral                | ✅ |
 | ERNIE 4.5 / MoE / VL MoE | ✅ | GLM 4/4V/4.5V/4.6V/5/5.1/OCR/ASR | ✅ | GLM4 MoE / Lite / 4.5V MoE | ✅ | MiniCPM 3/O/V/V 4_6             | ✅ | PanGu-α                | ✅ |
 | XVERSE                   | ✅ | Brumby                          | ✅ | Hymba            | ✅ | Mistral                         | ✅ | Qwen 1/2/3/3.5         | ✅ |
-| MiniMax M2               | ✅ | AfMoE                           | ✅ | Bailing-MoE      | ✅ | LFM2-MoE                        | ✅ | Marin                  | ✅ |
+| MiniMax M2/M3            | ✅ | AfMoE                           | ✅ | Bailing-MoE      | ✅ | LFM2-MoE                        | ✅ | Marin                  | ✅ |
 | InternVL Chat            | ✅ | Laguna                          | ✅ | Mimo / Mimo V2   | ✅ | Zamba / Zamba2                  | ✅ | Intern S1              | ✅ |
 | HunYuan V1 Dense / MoE   | ✅ | HY-V3                           | ✅ |    |  |                  |  |                |   |
 
diff --git a/gptqmodel/models/auto.py b/gptqmodel/models/auto.py
index 58be756de..f2103551f 100644
--- a/gptqmodel/models/auto.py
+++ b/gptqmodel/models/auto.py
@@ -142,6 +142,7 @@
 from .definitions.minicpmv import MiniCPMVQModel  # noqa: E402
 from .definitions.minicpmv_4_6 import MiniCPMV4_6QModel  # noqa: E402
 from .definitions.minimax_m2 import MiniMaxM2GPTQ  # noqa: E402
+from .definitions.minimax_m3_vl import MiniMaxM3VLGPTQ  # noqa: E402
 from .definitions.ministral3 import Ministral3GPTQ  # noqa: E402
 from .definitions.mistral3 import Mistral3GPTQ
 from .definitions.mixtral import MixtralQModel  # noqa: E402
@@ -277,6 +278,7 @@
     "minicpmv4_6": MiniCPMV4_6QModel,
     "minimax": MiniMaxM2GPTQ,
     "minimax_m2": MiniMaxM2GPTQ,
+    "minimax_m3_vl": MiniMaxM3VLGPTQ,
     "ministral3": Ministral3GPTQ,
     "qwen2_moe": Qwen2MoeQModel,
     "qwen3_moe": Qwen3MoeQModel,
diff --git a/gptqmodel/models/definitions/__init__.py b/gptqmodel/models/definitions/__init__.py
index e7bd5d9cf..26490e4e8 100644
--- a/gptqmodel/models/definitions/__init__.py
+++ b/gptqmodel/models/definitions/__init__.py
@@ -59,6 +59,7 @@
 from .minicpmv import MiniCPMVQModel
 from .minicpmv_4_6 import MiniCPMV4_6QModel
 from .minimax_m2 import MiniMaxM2GPTQ
+from .minimax_m3_vl import MiniMaxM3VLGPTQ
 from .ministral3 import Ministral3GPTQ
 from .mimo_v2 import MimoV2QModel
 from .mixtral import MixtralQModel
diff --git a/gptqmodel/models/definitions/minimax_m3_vl.py b/gptqmodel/models/definitions/minimax_m3_vl.py
new file mode 100644
index 000000000..d66e1fee6
--- /dev/null
+++ b/gptqmodel/models/definitions/minimax_m3_vl.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-FileCopyrightText: 2026 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from transformers import AutoModelForImageTextToText
+
+from ..base import BaseQModel
+from ..moe_lifecycle import GateUpDownMoELifecycleHooks
+
+
+class MiniMaxM3VLGPTQ(BaseQModel):
+    """Model definition for the MiniMax M3 (``minimax_m3_vl``) architecture.
+
+    The decoder stack is addressed under ``model.language_model``. Early blocks use a
+    dense MLP and later blocks a sparse MoE, so ``module_tree`` lists both layouts.
+    """
+
+    loader = AutoModelForImageTextToText
+    require_load_processor = False
+    support_batch_quantize = False
+
+    pre_lm_head_norm_module = "model.language_model.norm"
+    rotary_embedding = "model.language_model.rotary_emb"
+
+    # MiniMax-M3 starts with dense MLP layers, then switches to sparse MoE.
+    layer_modules_strict = False
+    dynamic_expert_index = "num_local_experts"
+
+    # Defuser splits MiniMax-M3 packed expert tensors into gate/up/down modules.
+    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()
+
+    module_tree = [
+        "model",
+        "language_model",
+        "layers",
+        "#",
+        {
+            "input_layernorm": ("input_layernorm:!",),
+            "self_attn": (
+                "q_proj:0",
+                "q_norm:0:!",
+                "k_proj:0",
+                "k_norm:0:!",
+                "v_proj:0",
+                "indexer.q_proj:0",
+                "indexer.q_norm:0:!",
+                "indexer.k_proj:0",
+                "indexer.k_norm:0:!",
+                "o_proj:1",
+            ),
+            "post_attention_layernorm": ("post_attention_layernorm:!",),
+            "mlp:moe": {
+                # Dense fallback used by early decoder blocks.
+                "": ("gate_up_proj:0", "down_proj:1"),
+                "gate": ("gate:!", "e_score_correction_bias:!"),
+                "shared_experts": ("gate_up_proj:0", "down_proj:1"),
+                "experts": {
+                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
+                },
+            },
+        },
+    ]
+
+
+__all__ = ["MiniMaxM3VLGPTQ"]
diff --git a/gptqmodel/utils/structure.py b/gptqmodel/utils/structure.py
index 5b68e61cd..00015fd20 100644
--- a/gptqmodel/utils/structure.py
+++ b/gptqmodel/utils/structure.py
@@ -1855,6 +1855,57 @@ def _resolve_converter_tensor_source(
 
         return None, None, None, None
 
+    def _resolve_concat_checkpoint_tensor_sources(
+        self,
+        module_path: str,
+        rel_name: str,
+    ) -> Optional[tuple[list[str], int]]:
+        """Find the split checkpoint tensors that concatenate into one fused runtime tensor.
+
+        Returns ``(checkpoint_names, concat_dim)`` from the first converter whose split
+        names all resolve in the weight map, or ``None`` when no converter qualifies.
+        """
+        runtime_name = self._join_tensor_name(module_path, rel_name)
+
+        for converter in self._runtime_to_checkpoint_converters:
+            # Only Concatenate converters with one runtime pattern and 2+ checkpoint patterns apply.
+            if "Concatenate" not in converter.operation_names:
+                continue
+            if not (len(converter.source_patterns) == 1 and len(converter.target_patterns) >= 2):
+                continue
+
+            # The single runtime pattern has to match the requested tensor name.
+            fused_pattern = converter.source_patterns[0]
+            _, self_match = _LazyWeightRenaming(fused_pattern, fused_pattern).rename_source_key(runtime_name)
+            if self_match is None:
+                continue
+
+            # Rewrite the runtime name once per checkpoint pattern and look each result up.
+            split_names: list[str] = []
+            for split_pattern in converter.target_patterns:
+                candidate_name, hit = _LazyWeightRenaming(
+                    fused_pattern,
+                    split_pattern,
+                ).rename_source_key(runtime_name)
+                if hit is None or candidate_name == runtime_name:
+                    break  # the pattern did not rewrite the name
+
+                aliases = self._runtime_to_checkpoint_alias_candidates(candidate_name)
+                stored_name = next((alias for alias in aliases if alias in self._weight_map), None)
+                if stored_name is None:
+                    break  # this split part is not present in the weight map
+                split_names.append(stored_name)
+            else:
+                # Every part resolved; take the axis from the first Concatenate operation.
+                concat_op = next(
+                    (op for op in converter.operations if type(op).__name__ == "Concatenate"),
+                    None,
+                )
+                concat_dim = 0 if concat_op is None else getattr(concat_op, "dim", 0)
+                return split_names, concat_dim
+
+        return None
+
     def _resolve_direct_checkpoint_tensor_source(
         self,
         module_path: str,
@@ -2130,7 +2181,14 @@ def _copy_checkpoint_tensors_into_submodule(
         missing_nonpersistent_buffers: list[tuple[str, str]] = []
 
         grouped_names: Dict[str, list[tuple[str, str, str, Optional[int], Optional[int], Optional[int]]]] = {}
+        concat_entries: list[tuple[str, str, list[str], int]] = []
         for rel_name in t_params:
+            concat_source = self._resolve_concat_checkpoint_tensor_sources(module_path, rel_name)
+            if concat_source is not None:
+                full_names, concat_dim = concat_source
+                concat_entries.append(("param", rel_name, full_names, concat_dim))
+                continue
+
             full_name, expert_index, split_index, split_dim = self._resolve_checkpoint_tensor_source(module_path, rel_name)
             if full_name is None:
                 continue
@@ -2153,6 +2211,12 @@ def _copy_checkpoint_tensors_into_submodule(
             grouped_names.setdefault(shard, []).append(("param", rel_name, full_name, expert_index, split_index, split_dim))
 
         for rel_name, target_buffer in list(t_bufs.items()):
+            concat_source = self._resolve_concat_checkpoint_tensor_sources(module_path, rel_name)
+            if concat_source is not None:
+                full_names, concat_dim = concat_source
+                concat_entries.append(("buffer", rel_name, full_names, concat_dim))
+                continue
+
             full_name, expert_index, split_index, split_dim = self._resolve_checkpoint_tensor_source(module_path, rel_name)
             if full_name is None:
                 full_name = self._resolve_checkpoint_tensor_name(module_path, rel_name)
@@ -2177,7 +2241,7 @@ def _copy_checkpoint_tensors_into_submodule(
                 continue
             grouped_names.setdefault(shard, []).append(("buffer", rel_name, full_name, expert_index, split_index, split_dim))
 
-        total_entries = sum(len(entries) for entries in grouped_names.values())
+        total_entries = sum(len(entries) for entries in grouped_names.values()) + len(concat_entries)
         progress = None
         loaded_entries = 0
         if total_entries:
@@ -2189,6 +2253,112 @@ def _copy_checkpoint_tensors_into_submodule(
 
         try:
             with torch.inference_mode():
+                for kind, rel_name, full_names, concat_dim in concat_entries:
+                    if progress is not None:
+                        progress.current_iter_step = loaded_entries
+                        progress.subtitle(f"{rel_name}: {loaded_entries + 1}/{total_entries}")
+                        progress.draw()
+
+                    target_tensor = t_params.get(rel_name) if kind == "param" else t_bufs.get(rel_name)
+                    expected_shape = tuple(target_tensor.shape) if target_tensor is not None else None
+                    parts = []
+                    for full_name in full_names:
+                        shard = self._weight_map.get(full_name)
+                        if shard is None:
+                            raise RuntimeError(
+                                self._materialization_issue_message(
+                                    phase="submodule materialization",
+                                    kind=kind,
+                                    module_path=module_path,
+                                    rel_name=rel_name,
+                                    reason="checkpoint tensor mapping resolved to a missing shard",
+                                    full_name=full_name,
+                                    target_shape=expected_shape,
+                                )
+                            )
+                        shard_path = os.path.join(self.model_local_path, shard)
+                        with safe_open(shard_path, framework="pt", device="cpu") as handler:
+                            parts.append(handler.get_tensor(full_name))
+
+                    try:
+                        tensor = torch.cat(parts, dim=concat_dim).contiguous()
+                    except Exception as exc:
+                        raise RuntimeError(
+                            self._materialization_issue_message(
+                                phase="submodule materialization",
+                                kind=kind,
+                                module_path=module_path,
+                                rel_name=rel_name,
+                                reason=f"checkpoint tensors could not be concatenated: {exc}",
+                                full_name=", ".join(full_names),
+                                target_shape=expected_shape,
+                            )
+                        ) from exc
+
+                    if expected_shape is not None and tuple(tensor.shape) != expected_shape:
+                        raise RuntimeError(
+                            self._materialization_issue_message(
+                                phase="submodule materialization",
+                                kind=kind,
+                                module_path=module_path,
+                                rel_name=rel_name,
+                                reason="concatenated checkpoint tensor shape does not match target tensor",
+                                full_name=", ".join(full_names),
+                                source_shape=tuple(tensor.shape),
+                                target_shape=expected_shape,
+                            )
+                        )
+
+                    if kind == "param":
+                        target_param = t_params.get(rel_name)
+                        if target_param is None:
+                            raise RuntimeError(
+                                self._materialization_issue_message(
+                                    phase="submodule materialization",
+                                    kind=kind,
+                                    module_path=module_path,
+                                    rel_name=rel_name,
+                                    reason="target tensor disappeared before materialization",
+                                    full_name=", ".join(full_names),
+                                    source_shape=tuple(tensor.shape),
+                                )
+                            )
+                        target_param_new = _ensure_target_storage_on_device_(target_param, device)
+                        if target_param_new is not target_param:
+                            t_parent, leaf = _get_parent_and_leaf_by_path(target_submodule, rel_name)
+                            setattr(t_parent, leaf, target_param_new)
+                            target_param = target_param_new
+                        source = tensor.detach()
+                        if source.dtype != target_param.dtype:
+                            source = source.to(dtype=target_param.dtype)
+                        target_param.detach().copy_(source, non_blocking=(non_blocking and source.is_pinned()))
+                    else:
+                        target_buffer = t_bufs.get(rel_name)
+                        t_parent, leaf = _get_parent_and_leaf_by_path(target_submodule, rel_name)
+                        persistent = leaf not in getattr(t_parent, "_non_persistent_buffers_set", set())
+                        source = tensor.detach()
+                        if target_buffer is None:
+                            new_buffer = source.to(device=device)
+                            t_parent.register_buffer(leaf, new_buffer, persistent=persistent)
+                            t_bufs[rel_name] = new_buffer
+                        elif getattr(target_buffer, "is_meta", False) or target_buffer.device.type == "meta":
+                            new_buffer = torch.empty_like(target_buffer, device=device)
+                            new_buffer.copy_(
+                                source.to(dtype=new_buffer.dtype),
+                                non_blocking=(non_blocking and source.is_pinned()),
+                            )
+                            t_parent.register_buffer(leaf, new_buffer, persistent=persistent)
+                            t_bufs[rel_name] = new_buffer
+                        else:
+                            if source.dtype != target_buffer.dtype:
+                                source = source.to(dtype=target_buffer.dtype)
+                            target_buffer.copy_(source, non_blocking=(non_blocking and source.is_pinned()))
+
+                    loaded_entries += 1
+                    if progress is not None:
+                        progress.current_iter_step = loaded_entries
+                        progress.draw()
+
                 for shard, entries in grouped_names.items():
                     shard_path = os.path.join(self.model_local_path, shard)
                     with safe_open(shard_path, framework="pt", device="cpu") as handler:
@@ -2457,6 +2627,66 @@ def _materialize_direct_meta_tensors(
                 if not _is_meta_tensor(shell_param):
                     continue
 
+                concat_source = self._resolve_concat_checkpoint_tensor_sources(module_path, name)
+                if concat_source is not None:
+                    full_names, concat_dim = concat_source
+                    parts = []
+                    for full_name in full_names:
+                        shard = self._weight_map.get(full_name)
+                        if shard is None:
+                            raise RuntimeError(self._materialization_issue_message(
+                                phase="direct-meta sync",
+                                kind="param",
+                                module_path=module_path,
+                                rel_name=name,
+                                reason="checkpoint tensor mapping resolved to a missing shard",
+                                full_name=full_name,
+                                target_shape=tuple(shell_param.shape),
+                            ))
+                        source_path = os.path.join(self.model_local_path, shard)
+                        with safe_open(source_path, framework="pt", device="cpu") as handler:
+                            parts.append(handler.get_tensor(full_name))
+
+                    try:
+                        source_param = torch.cat(parts, dim=concat_dim).contiguous()
+                    except Exception as exc:
+                        raise RuntimeError(self._materialization_issue_message(
+                            phase="direct-meta sync",
+                            kind="param",
+                            module_path=module_path,
+                            rel_name=name,
+                            reason=f"checkpoint tensors could not be concatenated: {exc}",
+                            full_name=", ".join(full_names),
+                            target_shape=tuple(shell_param.shape),
+                        )) from exc
+
+                    if shell_param.shape != source_param.shape:
+                        raise RuntimeError(self._materialization_issue_message(
+                            phase="direct-meta sync",
+                            kind="param",
+                            module_path=module_path,
+                            rel_name=name,
+                            reason="concatenated checkpoint tensor shape does not match target tensor",
+                            full_name=", ".join(full_names),
+                            source_shape=tuple(source_param.shape),
+                            target_shape=tuple(shell_param.shape),
+                        ))
+
+                    cache_key = (",".join(full_names), None, None, concat_dim, shell_param.dtype, shell_param.requires_grad)
+                    new_param = param_cache.get(cache_key)
+                    if new_param is None:
+                        if source_param.dtype != shell_param.dtype:
+                            source_param = source_param.to(dtype=shell_param.dtype)
+                        new_param = nn.Parameter(
+                            source_param.clone(),
+                            requires_grad=shell_param.requires_grad,
+                        )
+                        param_cache[cache_key] = new_param
+
+                    shell_sub.register_parameter(name, new_param)
+                    synced += 1
+                    continue
+
                 full_name, expert_index, split_index, split_dim = self._resolve_checkpoint_tensor_source(module_path, name)
                 if full_name is None:
                     continue
@@ -2534,6 +2764,64 @@ def _materialize_direct_meta_tensors(
                 if not _is_meta_tensor(shell_buffer):
                     continue
 
+                concat_source = self._resolve_concat_checkpoint_tensor_sources(module_path, name)
+                if concat_source is not None:
+                    full_names, concat_dim = concat_source
+                    parts = []
+                    for full_name in full_names:
+                        shard = self._weight_map.get(full_name)
+                        if shard is None:
+                            raise RuntimeError(self._materialization_issue_message(
+                                phase="direct-meta sync",
+                                kind="buffer",
+                                module_path=module_path,
+                                rel_name=name,
+                                reason="checkpoint tensor mapping resolved to a missing shard",
+                                full_name=full_name,
+                                target_shape=tuple(shell_buffer.shape),
+                            ))
+                        source_path = os.path.join(self.model_local_path, shard)
+                        with safe_open(source_path, framework="pt", device="cpu") as handler:
+                            parts.append(handler.get_tensor(full_name))
+
+                    try:
+                        source_buffer = torch.cat(parts, dim=concat_dim).contiguous()
+                    except Exception as exc:
+                        raise RuntimeError(self._materialization_issue_message(
+                            phase="direct-meta sync",
+                            kind="buffer",
+                            module_path=module_path,
+                            rel_name=name,
+                            reason=f"checkpoint tensors could not be concatenated: {exc}",
+                            full_name=", ".join(full_names),
+                            target_shape=tuple(shell_buffer.shape),
+                        )) from exc
+
+                    if shell_buffer.shape != source_buffer.shape:
+                        raise RuntimeError(self._materialization_issue_message(
+                            phase="direct-meta sync",
+                            kind="buffer",
+                            module_path=module_path,
+                            rel_name=name,
+                            reason="concatenated checkpoint tensor shape does not match target tensor",
+                            full_name=", ".join(full_names),
+                            source_shape=tuple(source_buffer.shape),
+                            target_shape=tuple(shell_buffer.shape),
+                        ))
+
+                    persistent = name not in getattr(shell_sub, "_non_persistent_buffers_set", set())
+                    cache_key = (",".join(full_names), None, None, concat_dim, shell_buffer.dtype)
+                    new_buffer = buffer_cache.get(cache_key)
+                    if new_buffer is None:
+                        if source_buffer.dtype != shell_buffer.dtype:
+                            source_buffer = source_buffer.to(dtype=shell_buffer.dtype)
+                        new_buffer = source_buffer.clone()
+                        buffer_cache[cache_key] = new_buffer
+
+                    shell_sub.register_buffer(name, new_buffer, persistent=persistent)
+                    synced += 1
+                    continue
+
                 full_name, expert_index, split_index, split_dim = self._resolve_checkpoint_tensor_source(module_path, name)
                 if full_name is None:
                     continue
diff --git a/tests/models/test_minimax_m3.py b/tests/models/test_minimax_m3.py
new file mode 100644
index 000000000..3efa51dc9
--- /dev/null
+++ b/tests/models/test_minimax_m3.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-FileCopyrightText: 2026 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+from gptqmodel import BACKEND
+from model_test import ModelTest
+
+
+class TestMinimaxM3(ModelTest):
+    # Quantize-and-evaluate test for the MiniMax-M3 checkpoint at a fixed local path.
+    NATIVE_MODEL_ID = "/monster/data/model/MiniMax-M3"
+    USE_FLASH_ATTN = False
+    TRUST_REMOTE_CODE = False
+    DELETE_QUANTIZED_MODEL = False  # NOTE(review): presumably keeps the quantized output on disk; confirm intended
+    DATASET_SIZE = 1024
+    GROUP_SIZE = 32
+    EVAL_TASKS_SLOW = {
+        "arc_challenge": {  # NOTE(review): confirm these baselines were measured on MiniMax-M3
+            "acc": {"value": 0.5026, "floor_pct": 0.04},
+            "acc_norm": {"value": 0.5171, "floor_pct": 0.04},
+        },
+    }
+    EVAL_BATCH_SIZE = 1
+    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)  # fast task set is derived from the slow one
+    LOAD_BACKEND = BACKEND.AUTO
+
+    def test_minimax_m3(self):
+        self.quantize_and_evaluate()
diff --git a/tests/test_lazy_turtle_conversion_mapping.py b/tests/test_lazy_turtle_conversion_mapping.py
index c003283c5..49b5d715c 100644
--- a/tests/test_lazy_turtle_conversion_mapping.py
+++ b/tests/test_lazy_turtle_conversion_mapping.py
@@ -124,6 +124,50 @@ def __init__(self):
         self.mlp = _MlpShell()
 
 
+class _FusedDenseMlpShell(nn.Module):
+    def __init__(self, hidden_dim: int = 4, intermediate_dim: int = 3):
+        super().__init__()
+        self.gate_up_proj = nn.Linear(hidden_dim, 2 * intermediate_dim, bias=False, device="meta")
+
+
+class _FusedDenseLayerShell(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.mlp = _FusedDenseMlpShell()
+
+
+class _MiniMaxM3Shell(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.config = SimpleNamespace(model_type="minimax_m3_vl")
+        self.model = nn.Module()
+        self.model.language_model = nn.Module()
+        self.model.language_model.layers = nn.ModuleList([_FusedDenseLayerShell()])
+
+
+class _MiniMaxM3SharedExpertsMlpShell(nn.Module):
+    def __init__(self, hidden_dim: int = 4, intermediate_dim: int = 3):
+        super().__init__()
+        self.shared_experts = nn.Module()
+        self.shared_experts.gate_up_proj = nn.Linear(hidden_dim, 2 * intermediate_dim, bias=False, device="meta")
+
+
+class _MiniMaxM3SharedExpertsLayerShell(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.mlp = _MiniMaxM3SharedExpertsMlpShell()
+
+
+class _MiniMaxM3SharedExpertsShell(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.config = SimpleNamespace(model_type="minimax_m3_vl")
+        self.model = nn.Module()
+        self.model.language_model = nn.Module()
+        self.model.language_model.layers = nn.ModuleList([nn.Identity() for _ in range(4)])
+        self.model.language_model.layers.append(_MiniMaxM3SharedExpertsLayerShell())
+
+
 class _DeepseekV4Shell(nn.Module):
     base_model_prefix = "model"
 
@@ -970,6 +1014,137 @@ def test_lazy_turtle_materializes_defused_deepseek_v4_expert_linears_from_w123_a
     assert torch.equal(expert.down_proj.weight, checkpoint_tensors["model.layers.0.mlp.experts.0.w2.weight"])
 
 
+def test_lazy_turtle_materializes_fused_dense_mlp_from_split_gate_up_checkpoint(tmp_path):
+    reversed_map = LazyTurtle.reverse_hf_conversion_map(
+        [
+            _WeightRenamingStub(
+                r"^language_model\.model\.",
+                r"model.language_model.",
+            ),
+            _WeightConverterStub(
+                source_patterns=[
+                    "mlp.gate_proj.weight",
+                    "mlp.up_proj.weight",
+                ],
+                target_patterns="mlp.gate_up_proj.weight",
+                operations=[Concatenate(dim=0)],
+            ),
+        ]
+    )
+    assert reversed_map is not None
+    # The checkpoint stores gate and up separately, under the "language_model.model." prefix.
+    gate = torch.arange(12, dtype=torch.float32).reshape(3, 4)
+    up = torch.arange(12, 24, dtype=torch.float32).reshape(3, 4)
+    turtle = _build_lazy_turtle(
+        tmp_path,
+        {
+            "language_model.model.layers.0.mlp.gate_proj.weight": gate,
+            "language_model.model.layers.0.mlp.up_proj.weight": up,
+        },
+        hf_conversion_map_reversed=reversed_map,
+    )
+    # Copy checkpoint tensors into layer 0 of a shell whose fused gate_up_proj starts on the meta device.
+    shell = _MiniMaxM3Shell()
+    target_submodule = shell.model.language_model.layers[0]
+    turtle._copy_checkpoint_tensors_into_submodule(
+        target_model=shell,
+        target_submodule=target_submodule,
+        module_path="model.language_model.layers.0",
+        device=torch.device("cpu"),
+        recurse=True,
+        non_blocking=False,
+    )
+    # The fused weight must leave the meta device and equal gate stacked over up along dim 0.
+    weight = target_submodule.mlp.gate_up_proj.weight
+    assert weight.device.type != "meta"
+    assert torch.equal(weight, torch.cat([gate, up], dim=0))
+
+
+def test_lazy_turtle_sync_all_meta_materializes_fused_dense_mlp_from_split_gate_up_checkpoint(tmp_path):
+    reversed_map = LazyTurtle.reverse_hf_conversion_map(
+        [
+            _WeightRenamingStub(
+                r"^language_model\.model\.",
+                r"model.language_model.",
+            ),
+            _WeightConverterStub(
+                source_patterns=[
+                    "mlp.gate_proj.weight",
+                    "mlp.up_proj.weight",
+                ],
+                target_patterns="mlp.gate_up_proj.weight",
+                operations=[Concatenate(dim=0)],
+            ),
+        ]
+    )
+    assert reversed_map is not None
+    # The checkpoint stores gate and up separately, under the "language_model.model." prefix.
+    gate = torch.arange(12, dtype=torch.float32).reshape(3, 4)
+    up = torch.arange(12, 24, dtype=torch.float32).reshape(3, 4)
+    turtle = _build_lazy_turtle(
+        tmp_path,
+        {
+            "language_model.model.layers.0.mlp.gate_proj.weight": gate,
+            "language_model.model.layers.0.mlp.up_proj.weight": up,
+        },
+        hf_conversion_map_reversed=reversed_map,
+    )
+    # Drive the same fused-tensor case through sync_all_meta on the whole shell model.
+    shell = _MiniMaxM3Shell()
+    materialized = turtle.sync_all_meta(shell_model=shell, tie_after=False)
+    # sync_all_meta must return 1, and the fused weight must equal gate stacked over up along dim 0.
+    weight = shell.model.language_model.layers[0].mlp.gate_up_proj.weight
+    assert materialized == 1
+    assert weight.device.type != "meta"
+    assert torch.equal(weight, torch.cat([gate, up], dim=0))
+
+
+def test_lazy_turtle_direct_meta_sync_materializes_minimax_m3_shared_expert_gate_up(tmp_path):
+    reversed_map = LazyTurtle.reverse_hf_conversion_map(
+        [
+            _WeightRenamingStub(
+                r"^language_model\.model\.",
+                r"model.language_model.",
+            ),
+            _WeightRenamingStub(
+                r"\.block_sparse_moe\.shared_experts\.",
+                r".mlp.shared_experts.",
+            ),
+            _WeightConverterStub(
+                source_patterns=[
+                    "mlp.shared_experts.gate_proj.weight",
+                    "mlp.shared_experts.up_proj.weight",
+                ],
+                target_patterns="mlp.shared_experts.gate_up_proj.weight",
+                operations=[Concatenate(dim=0)],
+            ),
+        ]
+    )
+    assert reversed_map is not None
+    # Checkpoint names use block_sparse_moe.shared_experts on layer 4, with gate and up stored separately.
+    gate = torch.arange(12, dtype=torch.float32).reshape(3, 4)
+    up = torch.arange(12, 24, dtype=torch.float32).reshape(3, 4)
+    turtle = _build_lazy_turtle(
+        tmp_path,
+        {
+            "language_model.model.layers.4.block_sparse_moe.shared_experts.gate_proj.weight": gate,
+            "language_model.model.layers.4.block_sparse_moe.shared_experts.up_proj.weight": up,
+        },
+        hf_conversion_map_reversed=reversed_map,
+    )
+    # Materialize only the shared-experts gate_up_proj leaf; shell layers 0-3 are nn.Identity placeholders.
+    shell = _MiniMaxM3SharedExpertsShell()
+    leaf = shell.model.language_model.layers[4].mlp.shared_experts.gate_up_proj
+    turtle.materialize_direct_meta_tensors(
+        target_model=shell,
+        target_submodule=leaf,
+        device=torch.device("cpu"),
+    )
+    # The leaf weight must leave the meta device and equal gate stacked over up along dim 0.
+    assert leaf.weight.device.type != "meta"
+    assert torch.equal(leaf.weight, torch.cat([gate, up], dim=0))
+
+
 def test_lazy_turtle_falls_back_to_legacy_checkpoint_conversion_mapping(tmp_path, monkeypatch):
     def _raise_import_error(_name: str):
         raise ImportError("transformers.conversion_mapping is unavailable")
diff --git a/tests/test_minimax_m3_support.py b/tests/test_minimax_m3_support.py
new file mode 100644
index 000000000..d7f724a35
--- /dev/null
+++ b/tests/test_minimax_m3_support.py
@@ -0,0 +1,146 @@
+# SPDX-FileCopyrightText: 2026 ModelCloud.ai
+# SPDX-FileCopyrightText: 2026 qubitium@modelcloud.ai
+# SPDX-License-Identifier: Apache-2.0
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+from torch import nn
+from transformers import AutoConfig, AutoModelForImageTextToText
+
+from gptqmodel.models import auto
+from gptqmodel.models.definitions.minimax_m3_vl import MiniMaxM3VLGPTQ
+
+
+MODEL_PATH = Path("/monster/data/model/MiniMax-M3")  # local checkpoint location; tests gated on it skip when absent
+
+
+def test_minimax_m3_model_type_selects_definition(monkeypatch):
+    """Resolve the definition from a stubbed config whose model_type is "minimax_m3_vl"."""
+    stub_config = SimpleNamespace(model_type="minimax_m3_vl")
+    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", lambda *args, **kwargs: stub_config)
+    monkeypatch.setattr(auto, "resolve_trust_remote_code", lambda path, trust_remote_code=False: trust_remote_code)
+    resolved = auto.check_and_get_model_definition("/tmp/minimax-m3")
+    assert resolved is MiniMaxM3VLGPTQ
+
+
+@pytest.mark.skipif(not MODEL_PATH.exists(), reason="MiniMax-M3 model not found")
+def test_minimax_m3_local_config_selects_definition():
+    """Load the config found at MODEL_PATH and check it resolves to MiniMaxM3VLGPTQ."""
+    local_config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=False)
+    assert local_config.model_type == "minimax_m3_vl"
+    assert auto.check_and_get_model_definition(MODEL_PATH) is MiniMaxM3VLGPTQ
+
+
+def test_minimax_m3_module_tree_covers_dense_sparse_and_indexer_paths():
+    """Check the definition's class flags and the module names its layer tree exposes."""
+    module_tree = MiniMaxM3VLGPTQ.simple_layer_modules(
+        model_config=SimpleNamespace(text_config=SimpleNamespace(num_local_experts=2)),
+        quantize_config=SimpleNamespace(dynamic=None),
+    )
+    flat_modules = {module_name for group in module_tree for module_name in group}
+
+    assert MiniMaxM3VLGPTQ.layer_modules_strict is False
+    assert MiniMaxM3VLGPTQ.require_load_processor is False
+    assert MiniMaxM3VLGPTQ.pre_lm_head_norm_module == "model.language_model.norm"
+    assert MiniMaxM3VLGPTQ.rotary_embedding == "model.language_model.rotary_emb"
+
+    expected_modules = (
+        "self_attn.q_proj", "self_attn.indexer.q_proj", "self_attn.indexer.k_proj", "self_attn.o_proj",
+        "mlp.gate_up_proj", "mlp.down_proj",
+        "mlp.shared_experts.gate_up_proj", "mlp.shared_experts.down_proj",
+        "mlp.experts.0.gate_proj", "mlp.experts.1.up_proj", "mlp.experts.0.down_proj",
+    )
+    for expected in expected_modules:
+        assert expected in flat_modules
+
+    # No flattened module name may still use the "block_sparse_moe" prefix.
+    assert all("block_sparse_moe" not in module_name for module_name in flat_modules)
+
+
+def test_minimax_m3_multimodal_base_modules_include_non_language_children():
+    """Feed a stub wrapper to get_base_modules and check which module paths it reports."""
+    class _StubLanguageModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.layers = nn.ModuleList([nn.Identity()])
+            self.embed_tokens = nn.Embedding(4, 4)
+            self.norm = nn.LayerNorm(4)
+            self.rotary_emb = nn.Identity()
+
+    class _StubCore(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.language_model = _StubLanguageModel()
+            self.vision_tower = nn.Identity()
+            self.multi_modal_projector = nn.Identity()
+
+    class _StubWrapper(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.model = _StubCore()
+            self.lm_head = nn.Linear(4, 4, bias=False)
+
+    base_modules = set(MiniMaxM3VLGPTQ.get_base_modules(_StubWrapper()))
+    assert MiniMaxM3VLGPTQ.extract_layers_node() == ["model.language_model.layers"]
+
+    non_language = ("model.vision_tower", "model.multi_modal_projector")
+    language = ("model.language_model.embed_tokens", "model.language_model.norm", "model.language_model.rotary_emb")
+    for expected in non_language + language:
+        assert expected in base_modules
+
+
+@pytest.mark.skipif(not MODEL_PATH.exists(), reason="MiniMax-M3 model not found")
+def test_minimax_m3_defuser_splits_native_packed_experts():
+    """Convert a tiny from-config model and check layer 3's packed experts get split."""
+    from defuser import convert_model
+    from defuser.model_registry import MODEL_CONFIG
+
+    model = AutoModelForImageTextToText.from_config(_tiny_minimax_m3_config(), trust_remote_code=False)
+
+    packed = model.model.language_model.layers[3].mlp.experts
+    assert "minimax_m3_vl" in MODEL_CONFIG
+    for packed_attr in ("gate_up_proj", "down_proj"):
+        assert hasattr(packed, packed_attr)
+
+    converted = convert_model(model, cleanup_original=False)
+    assert converted is True
+
+    split = model.model.language_model.layers[3].mlp.experts
+    first_expert = getattr(split, "0")
+    assert not hasattr(split, "gate_up_proj")
+    for split_attr in ("gate_proj", "up_proj", "down_proj"):
+        assert hasattr(first_expert, split_attr)
+
+
+def _tiny_minimax_m3_config():
+    """Load the config at MODEL_PATH and overwrite its text and vision sizes with small values."""
+    config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=False)
+    text_config = config.text_config
+    text_overrides = {
+        "hidden_size": 16, "intermediate_size": 8, "dense_intermediate_size": 32,
+        "shared_intermediate_size": 8, "num_hidden_layers": 4,
+        "num_attention_heads": 2, "num_key_value_heads": 1, "head_dim": 8,
+        "num_local_experts": 2, "num_experts_per_tok": 1, "n_shared_experts": 1,
+        "moe_layer_freq": [0, 0, 0, 1],
+        "vocab_size": 128,
+    }
+    for field, value in text_overrides.items():
+        setattr(text_config, field, value)
+    # The top-level config has its own vocab_size; it is set to the same value as the text config.
+    config.vocab_size = 128
+
+    vision_config = config.vision_config
+    vision_overrides = {
+        "hidden_size": 8, "intermediate_size": 16, "num_hidden_layers": 1,
+        "num_attention_heads": 2, "image_size": 16, "patch_size": 8,
+    }
+    for field, value in vision_overrides.items():
+        setattr(vision_config, field, value)
+    # spatial_merge_size is only overridden when the vision config already defines it.
+    if hasattr(vision_config, "spatial_merge_size"):
+        vision_config.spatial_merge_size = 1
+
+    return config