对 LLM 进行性能测试,包括 F1-Score、ROUGE-L、困惑度等
以下内容均以 uv 为例
uv pip install git+ssh://git@github.com/SJTU-DDST/LLMTest.git

下载并放入 3rd 文件夹
git submodule add git@github.com:SJTU-DDST/LLMTest.git 3rd/llmtest
# git submodule update --init --recursive

安装
# uv venv / uv sync
uv pip install -e 3rd/llmtest

创建 test.py,写入
from LLMTest import LLMTest
# from LLMTest import change_log_level
# change_log_level("DEBUG")
def LLM(prompts):
    """Placeholder model used by the example: answer every prompt identically.

    Replace this with a call into a real model. It receives the batch of
    prompts and must return one answer string per prompt, in the same order.

    Args:
        prompts: A sized collection of prompts (only its length is used here).

    Returns:
        A list containing "The Answer is C" once for each prompt.
    """
    return ["The Answer is C"] * len(prompts)
# Build a tester for one benchmark. The two arguments look like a dataset
# identifier and a subset name — NOTE(review): confirm against the LLMTest docs.
tester = LLMTest("cais/mmlu", 'high_school_biology')
# Fetch a batch: an id identifying the batch plus the prompts to answer.
batch_id, prompts = tester.get()
# Run the model under test on the prompts (here: the placeholder LLM above).
answers = LLM(prompts)
# Hand the answers back, tagged with the batch id they belong to, to get a score.
score = tester.score(batch_id, answers)
print(score)

uv run test.py

uv pip install -e .
uv run tests/test.py