Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

## Latest News

* 05/25/2026 7.1.0-dev `main`: ✨ Added `hy_v3` and `ministral3` model support
* 05/25/2026 7.1.0-dev `main`: ✨ Added `hunyuan_v1_dense` and `hunyuan_v1_moe` model support
* 05/21/2026 7.1.0-dev `main`: ✨ Added `nemotron_labs_diffusion` model support
* 05/20/2026 7.1.0-dev `main`: ✨ Added `interns1`, `ovis2_5`, `ovis2_6_moe` and `ovis2_6_next` model support
Expand Down Expand Up @@ -251,7 +252,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode

| Model | | | | | | | | | |
|--------------------------|---|---------------------------------|--|------------------|--|---------------------------------|--|------------------------|---|
| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ |
| Apertus | ✅ | EXAONE 3/4 | ✅ | Dots1 | ✅ | Mistral3 / Ministral3 | ✅ | Qwen 2/3/3.5 (Next/MoE) | ✅ |
| Baichuan | ✅ | Falcon (H1 / Mamba) | ✅ | InternLM 1/2/2.5 | ✅ | Mixtral | ✅ | Qwen 2/2.5/3 VL | ✅ |
| Bloom | ✅ | FastVLM | ✅ | Kimi K2 | ✅ | MobileLLM | ✅ | Qwen 2.5/3 Omni | ✅ |
| ChatGLM | ✅ | Gemma 1-4 / 3n | ✅ | Klear | ✅ | MOSS | ✅ | RefinedWeb | ✅ |
Expand All @@ -266,7 +267,7 @@ Selected public references where teams or companies explicitly mention GPT-QMode
| XVERSE | ✅ | Brumby | ✅ | Hymba | ✅ | Mistral | ✅ | Qwen 1/2/3/3.5 | ✅ |
| MiniMax M2 | ✅ | AfMoE | ✅ | Bailing-MoE | ✅ | LFM2-MoE | ✅ | Marin | ✅ |
| InternVL Chat | ✅ | Laguna | ✅ | Mimo / Mimo V2 | ✅ | Zamba / Zamba2 | ✅ | Intern S1 | ✅ |
| HunYuan V1 Dense / MoE | ✅ | | | | | | | | |
| HunYuan V1 Dense / MoE | ✅ | HY-V3 | ✅ | | | | | | |

Prism Bonsai GGUF checkpoints are supported for inference only through GPT-QModel's native GGUF path and internal GGUF runtime. Bonsai checkpoints load through the normal model path or repo argument and do not require the external `gguf` package. Prism model quantization is not included.

Expand Down
4 changes: 4 additions & 0 deletions gptqmodel/models/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@
from .definitions.hrm_text import HrmTextQModel # noqa: E402
from .definitions.hunyuan_v1_dense import HunYuanDenseV1QModel # noqa: E402
from .definitions.hunyuan_v1_moe import HunYuanMoEV1QModel # noqa: E402
from .definitions.hy_v3 import HYV3QModel # noqa: E402
from .definitions.hymba import HymbaQModel # noqa: E402
from .definitions.instella import InstellaQModel # noqa: E402
from .definitions.internlm import InternLMQModel # noqa: E402
Expand All @@ -140,6 +141,7 @@
from .definitions.minicpmv import MiniCPMVQModel # noqa: E402
from .definitions.minicpmv_4_6 import MiniCPMV4_6QModel # noqa: E402
from .definitions.minimax_m2 import MiniMaxM2GPTQ # noqa: E402
from .definitions.ministral3 import Ministral3GPTQ # noqa: E402
from .definitions.mistral3 import Mistral3GPTQ
from .definitions.mixtral import MixtralQModel # noqa: E402
from .definitions.mllama import MLlamaQModel, MLlamaTextQModel # noqa: E402
Expand Down Expand Up @@ -237,6 +239,7 @@
"hrm_text": HrmTextQModel,
"hunyuan_v1_dense": HunYuanDenseV1QModel,
"hunyuan_v1_moe": HunYuanMoEV1QModel,
"hy_v3": HYV3QModel,
"qwen": QwenQModel,
"mistral": LlamaQModel, # 100% llama clone
"yi": LlamaQModel, # 100% llama clone
Expand Down Expand Up @@ -272,6 +275,7 @@
"minicpmv4_6": MiniCPMV4_6QModel,
"minimax": MiniMaxM2GPTQ,
"minimax_m2": MiniMaxM2GPTQ,
"ministral3": Ministral3GPTQ,
"qwen2_moe": Qwen2MoeQModel,
"qwen3_moe": Qwen3MoeQModel,
"qwen3_next": Qwen3NextGPTQ,
Expand Down
2 changes: 2 additions & 0 deletions gptqmodel/models/definitions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
from .hrm_text import HrmTextQModel
from .hunyuan_v1_dense import HunYuanDenseV1QModel
from .hunyuan_v1_moe import HunYuanMoEV1QModel
from .hy_v3 import HYV3QModel
from .hymba import HymbaQModel
from .instella import InstellaQModel
from .internlm import InternLMQModel
Expand All @@ -57,6 +58,7 @@
from .minicpmv import MiniCPMVQModel
from .minicpmv_4_6 import MiniCPMV4_6QModel
from .minimax_m2 import MiniMaxM2GPTQ
from .ministral3 import Ministral3GPTQ
from .mimo_v2 import MimoV2QModel
from .mixtral import MixtralQModel
from .mllama import MLlamaQModel, MLlamaTextQModel
Expand Down
46 changes: 46 additions & 0 deletions gptqmodel/models/definitions/hy_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
# SPDX-License-Identifier: Apache-2.0

from ..base import BaseQModel
from ..moe_lifecycle import GateUpDownMoELifecycleHooks


# gate/up/down projection triple reused by the dense MLP, the shared experts
# and every routed expert in the module tree below.
_HYV3_MLP_PROJECTIONS = ("gate_proj:0", "up_proj:0", "down_proj:1")


class HYV3QModel(BaseQModel):
    # HYV3 uses a dense first MLP layer and sparse MoE layers after it, so the
    # decoder layers do not all expose the same set of modules.
    layer_modules_strict = False

    # Name of the config attribute used to expand the "#" expert placeholder.
    dynamic_expert_index = "num_experts"

    pre_lm_head_norm_module = "model.norm"

    # NOTE(review): presumably the modules whose AWQ scale search depends on
    # tensor shape; confirm against BaseQModel.
    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]

    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()

    # Path to the decoder layers ("model" -> "layers" -> per-layer index "#"),
    # followed by the per-layer module layout.
    # NOTE(review): entries suffixed ":!" appear to be walked but not quantized,
    # and the numeric suffixes appear to be grouping indices; confirm the exact
    # flag semantics in BaseQModel.
    module_tree = [
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": (
                # Query/key norms are listed with the non-quantized marker.
                "q_norm:!",
                "k_norm:!",
                "q_proj:0",
                "k_proj:0",
                "v_proj:0",
                "o_proj:1",
            ),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp:moe": {
                # Router gate.
                "gate": ("gate:!",),
                # Routed experts, one entry per expert index.
                "experts": {
                    "#": _HYV3_MLP_PROJECTIONS,
                },
                "shared_experts": _HYV3_MLP_PROJECTIONS,
                # Empty key: projections that sit directly under `mlp`
                # (the dense MLP layout).
                "": _HYV3_MLP_PROJECTIONS,
            },
        },
    ]


__all__ = ["HYV3QModel"]
23 changes: 23 additions & 0 deletions gptqmodel/models/definitions/ministral3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# SPDX-FileCopyrightText: 2026 ModelCloud.ai
# SPDX-License-Identifier: Apache-2.0

from ..base import BaseQModel


class Ministral3GPTQ(BaseQModel):
    pre_lm_head_norm_module = "model.norm"

    # Path to the decoder layers ("model" -> "layers" -> per-layer index "#"),
    # followed by the modules inside each layer.
    # NOTE(review): ":!" entries appear to be walked but not quantized, and the
    # numeric suffixes appear to be grouping indices; confirm in BaseQModel.
    module_tree = [
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": (
                "q_proj:0",
                "k_proj:0",
                "v_proj:0",
                "o_proj:1",
            ),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp": (
                "gate_proj:0",
                "up_proj:0",
                "down_proj:1",
            ),
        },
    ]


__all__ = ["Ministral3GPTQ"]
7 changes: 3 additions & 4 deletions tests/models/test_hunyuan_v1_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from model_test import ModelTest


class TestNemotronUltra(ModelTest):
class TestHunyuanV1Dense(ModelTest):
NATIVE_MODEL_ID = "/monster/data/model/HY-MT1.5-1.8B" # tencent/HY-MT1.5-1.8B
EVAL_TASKS_SLOW = {
"arc_challenge": {
Expand All @@ -24,6 +24,5 @@ class TestNemotronUltra(ModelTest):
}
EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)

def test_nemotron_ultra(self):
# self.quantize_and_evaluate()
print(self.evaluate_model(self.SAVE_PATH))
def test_hunyuan_v1_dense(self):
self.quantize_and_evaluate()
4 changes: 2 additions & 2 deletions tests/models/test_hunyuan_v1_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from model_test import ModelTest


class TestNemotronUltra(ModelTest):
class TestHunyuanV1MoE(ModelTest):
NATIVE_MODEL_ID = "/monster/data/model/Hunyuan-A13B-Instruct" # tencent/Hunyuan-A13B-Instruct
EVAL_TASKS_SLOW = {
"arc_challenge": {
Expand All @@ -24,5 +24,5 @@ class TestNemotronUltra(ModelTest):
}
EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)

def test_nemotron_ultra(self):
def test_hunyuan_v1_moe(self):
self.quantize_and_evaluate()
21 changes: 21 additions & 0 deletions tests/models/test_hy_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium

from model_test import ModelTest


class TestHyV3(ModelTest):
    # Local checkpoint path; upstream repo id: tencent/Hy-MT2-30B-A3B
    NATIVE_MODEL_ID = "/monster/data/model/Hy-MT2-30B-A3B"

    # Reference arc_challenge scores, evaluated with the chat template enabled.
    EVAL_TASKS_SLOW = {
        "arc_challenge": {
            "chat_template": True,
            "acc": {
                "value": 0.5324,
                "floor_pct": 0.04,
            },
            "acc_norm": {
                "value": 0.5341,
                "floor_pct": 0.04,
            },
        },
    }

    # The fast variant is derived from the slow task table by ModelTest.
    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)

    def test_hy_v3(self):
        # Full pipeline: quantize the checkpoint, then run the evaluation.
        self.quantize_and_evaluate()
29 changes: 29 additions & 0 deletions tests/models/test_ministral3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
# SPDX-License-Identifier: Apache-2.0
# Contact: qubitium@modelcloud.ai, x.com/qubitium

from model_test import ModelTest


class TestMinistral3(ModelTest):
    # Local text-only BF16 checkpoint; upstream repo id:
    # "Aratako/Ministral-3-3B-Instruct-2512-TextOnly"
    NATIVE_MODEL_ID = "/monster/data/model/Ministral-3-3B-Instruct-2512-TextOnly-BF16"

    # Reference arc_challenge scores. The SLOW and FAST names below are plain
    # aliases of these two values.
    NATIVE_ARC_CHALLENGE_ACC = 0.5870
    NATIVE_ARC_CHALLENGE_ACC_NORM = 0.6032
    NATIVE_ARC_CHALLENGE_ACC_SLOW = NATIVE_ARC_CHALLENGE_ACC
    NATIVE_ARC_CHALLENGE_ACC_NORM_SLOW = NATIVE_ARC_CHALLENGE_ACC_NORM
    NATIVE_ARC_CHALLENGE_ACC_FAST = NATIVE_ARC_CHALLENGE_ACC_SLOW
    NATIVE_ARC_CHALLENGE_ACC_NORM_FAST = NATIVE_ARC_CHALLENGE_ACC_NORM_SLOW

    TRUST_REMOTE_CODE = False
    EVAL_BATCH_SIZE = 6

    # arc_challenge is evaluated without the chat template for this model.
    EVAL_TASKS_SLOW = {
        "arc_challenge": {
            "chat_template": False,
            "acc": {
                "value": NATIVE_ARC_CHALLENGE_ACC,
            },
            "acc_norm": {
                "value": NATIVE_ARC_CHALLENGE_ACC_NORM,
            },
        },
    }

    # The fast variant is derived from the slow task table by ModelTest.
    EVAL_TASKS_FAST = ModelTest.derive_fast_eval_tasks(EVAL_TASKS_SLOW)

    def test_ministral3(self):
        # Full pipeline: quantize the checkpoint, then run the evaluation.
        self.quantize_and_evaluate()
43 changes: 43 additions & 0 deletions tests/test_hy_v3_support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from types import SimpleNamespace

from gptqmodel.models import auto
from gptqmodel.models.definitions.hy_v3 import HYV3QModel

def test_hy_v3_model_type_selects_definition(monkeypatch):
    """A config whose ``model_type`` is ``"hy_v3"`` resolves to ``HYV3QModel``."""
    stub_config = SimpleNamespace(model_type="hy_v3")

    def echo_trust_remote_code(path, trust_remote_code=False):
        # Stand-in resolver: hand back the flag untouched.
        return trust_remote_code

    def load_stub_config(*args, **kwargs):
        # Stand-in for AutoConfig.from_pretrained: always return the stub config.
        return stub_config

    monkeypatch.setattr(auto, "resolve_trust_remote_code", echo_trust_remote_code)
    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", load_stub_config)

    resolved = auto.check_and_get_model_definition("/tmp/hy_v3")
    assert resolved is HYV3QModel


def test_hy_v3_module_tree_expands_dense_and_sparse_moe_paths():
    """Check the flattened HYV3 layout for a config with three experts."""
    blocks = HYV3QModel.simple_layer_modules(
        model_config=SimpleNamespace(num_experts=3),
        quantize_config=SimpleNamespace(dynamic=None),
    )

    def first_block_with(module_name):
        # Index of the earliest block that lists `module_name`.
        return next(idx for idx, block in enumerate(blocks) if module_name in block)

    flat_modules = set()
    for block in blocks:
        flat_modules.update(block)

    first_expert_block = first_block_with("mlp.experts.0.gate_proj")
    shared_block = first_block_with("mlp.shared_experts.gate_proj")

    assert HYV3QModel.layer_modules_strict is False
    assert HYV3QModel.dynamic_expert_index == "num_experts"

    # Attention projections, dense MLP, shared experts and all three routed
    # experts must appear in the flattened module list.
    expected_present = (
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
        "mlp.shared_experts.gate_proj",
        "mlp.shared_experts.up_proj",
        "mlp.shared_experts.down_proj",
        "mlp.experts.0.gate_proj",
        "mlp.experts.1.up_proj",
        "mlp.experts.2.down_proj",
    )
    for module_name in expected_present:
        assert module_name in flat_modules

    # Norms, the router gate and the score-correction bias must not appear.
    expected_absent = (
        "self_attn.q_norm",
        "self_attn.k_norm",
        "mlp.gate",
        "mlp.e_score_correction_bias",
    )
    for module_name in expected_absent:
        assert module_name not in flat_modules

    # Routed experts are scheduled before the shared experts.
    assert first_expert_block < shared_block
31 changes: 31 additions & 0 deletions tests/test_ministral3_support.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
from types import SimpleNamespace

from gptqmodel.models import auto
from gptqmodel.models.definitions.ministral3 import Ministral3GPTQ


def test_ministral3_model_type_selects_definition(monkeypatch):
    """A config whose ``model_type`` is ``"ministral3"`` resolves to ``Ministral3GPTQ``."""
    stub_config = SimpleNamespace(model_type="ministral3")

    def echo_trust_remote_code(path, trust_remote_code=False):
        # Stand-in resolver: hand back the flag untouched.
        return trust_remote_code

    def load_stub_config(*args, **kwargs):
        # Stand-in for AutoConfig.from_pretrained: always return the stub config.
        return stub_config

    monkeypatch.setattr(auto, "resolve_trust_remote_code", echo_trust_remote_code)
    monkeypatch.setattr(auto.AutoConfig, "from_pretrained", load_stub_config)

    resolved = auto.check_and_get_model_definition("/tmp/ministral3")
    assert resolved is Ministral3GPTQ


def test_ministral3_module_tree_matches_text_only_layout():
    """Check the flattened Ministral3 layout exposes attention and MLP projections."""
    blocks = Ministral3GPTQ.simple_layer_modules(
        model_config=SimpleNamespace(),
        quantize_config=SimpleNamespace(dynamic=None),
    )
    flat_modules = set()
    for block in blocks:
        flat_modules.update(block)

    # The tree must be rooted at model -> layers -> <layer index>.
    assert Ministral3GPTQ.module_tree[:3] == ["model", "layers", "#"]

    expected_present = (
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "mlp.gate_proj",
        "mlp.up_proj",
        "mlp.down_proj",
    )
    for module_name in expected_present:
        assert module_name in flat_modules