2 Commits

Author SHA1 Message Date
  zhulinJulia24 c45970c3d1
[ci] add mllm eval (#4194) 10 hours ago
  littlegy 9756373638
Add test for "generate" endpoint (#4181) 15 hours ago
22 changed files with 2102 additions and 260 deletions
Split View
  1. +1
    -1
      .github/workflows/api_eval.yml
  2. +1
    -1
      .github/workflows/api_eval_h800.yml
  3. +2
    -2
      .github/workflows/daily_ete_test.yml
  4. +2
    -2
      .github/workflows/daily_ete_test_3090.yml
  5. +2
    -2
      .github/workflows/daily_ete_test_5080.yml
  6. +2
    -2
      .github/workflows/daily_ete_test_h800.yml
  7. +1
    -1
      .github/workflows/evaluate_remote.yml
  8. +223
    -0
      .github/workflows/mllm_api_eval.yml
  9. +1
    -0
      autotest/config-ascend.yaml
  10. +5
    -0
      autotest/config-h.yaml
  11. +40
    -7
      autotest/config.yaml
  12. +42
    -64
      autotest/evaluate/test_api_evaluate.py
  13. +290
    -0
      autotest/evaluate/test_mllm_api_evaluate.py
  14. +1169
    -0
      autotest/interface/restful/test_restful_generate.py
  15. +7
    -35
      autotest/utils/benchmark_utils.py
  16. +60
    -0
      autotest/utils/common_utils.py
  17. +34
    -109
      autotest/utils/config_utils.py
  18. +22
    -0
      autotest/utils/constant.py
  19. +149
    -33
      autotest/utils/evaluate_utils.py
  20. +39
    -0
      autotest/utils/toolkit.py
  21. +1
    -1
      lmdeploy/serve/openai/api_server.py
  22. +9
    -0
      lmdeploy/serve/openai/serving_generate.py

+ 1
- 1
.github/workflows/api_eval.yml View File

@@ -15,7 +15,7 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:


+ 1
- 1
.github/workflows/api_eval_h800.yml View File

@@ -15,7 +15,7 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:


+ 2
- 2
.github/workflows/daily_ete_test.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 2
- 2
.github/workflows/daily_ete_test_3090.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 2
- 2
.github/workflows/daily_ete_test_5080.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 2
- 2
.github/workflows/daily_ete_test_h800.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 1
- 1
.github/workflows/evaluate_remote.yml View File

@@ -35,7 +35,7 @@ on:
default: "['chat_models','base_models']"
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"



+ 223
- 0
.github/workflows/mllm_api_eval.yml View File

@@ -0,0 +1,223 @@
name: mllm_api_eval

on:
workflow_dispatch:
inputs:
repo_org:
required: false
description: 'Tested repository organization name. Default is InternLM/lmdeploy'
type: string
default: 'InternLM/lmdeploy'
repo_ref:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
backend:
required: true
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:
required: false
description: 'Select execution mode: infer, eval, or both. Default is "both"'
type: choice
options:
- both
- infer
- eval
default: 'both'
run_id:
required: false
description: 'Set custom run ID. If not provided, github.run_id will be used'
type: string
default: ''


env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ github.run_id }}
COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
FAIL_CONFIG: '--lf'
TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL
LMUData: /nvme/qa_test_models/LMUData
LOCAL_LLM: Qwen2.5-32B-Instruct
OPENAI_API_KEY: sk-empty
HF_DATASETS_OFFLINE: 1
HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets
HF_HUB_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1

jobs:
linux-build:
if: ${{ !cancelled() }}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
DOCKER_TAG: cuda12.8
OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }}
steps:
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
# This might remove tools that are actually needed, if set to "true" but frees about 6 GB
tool-cache: false
docker-images: false
# All of these default to true, but feel free to set to "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: false
- name: Checkout repository
uses: actions/checkout@v3
with:
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Build
run: |
echo ${PYTHON_VERSION}
echo ${PLAT_NAME}
echo ${DOCKER_TAG}
echo ${OUTPUT_FOLDER}
echo ${GITHUB_RUN_ID}
# remove -it
sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
retention-days: 1
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}

download_pkgs:
needs: linux-build
if: ${{!cancelled()}}
runs-on: [self-hosted, linux-a100]
timeout-minutes: 50
container:
image: openmmlab/lmdeploy:latest-cu12.8
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
volumes:
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/121:/mnt/121
- /mnt/104:/mnt/104
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Clone repository
uses: actions/checkout@v2
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
with:
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Copy repository
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}}
- name: Copy repository - offline
if: ${{inputs.offline_mode}}
run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}}
- name: Download Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Copy Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
- name: Copy Artifacts - offline
if: ${{inputs.offline_mode}}
run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
- name: Mark as start
run: |
chmod -R 777 ${{env.TEST_CODE_PATH}}
mkdir ${{env.REPORT_DIR}} -p
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt

test_evaluation:
needs: download_pkgs
if: ${{ !cancelled() }}
runs-on: [self-hosted, test-140]
timeout-minutes: 2400
strategy:
fail-fast: false
matrix:
backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}}
gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8']
include:
- n: 8
gpu_num: gpu_num_1
- n: 4
gpu_num: gpu_num_2
- n: 2
gpu_num: gpu_num_4
- n: 1
gpu_num: gpu_num_8
container:
image: openmmlab/lmdeploy:latest-cu12.8
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
volumes:
- /nvme/github-actions/pip-cache:/root/.cache/pip
- /nvme/github-actions/packages:/root/packages
- /nvme/github-actions/resources:/root/resources
- /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports
- /nvme/qa_test_models:/nvme/qa_test_models
- /nvme/huggingface_hub:/nvme/huggingface_hub
- /mnt/121:/mnt/121
- /mnt/104:/mnt/104
- /mnt/bigdisk:/mnt/bigdisk
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Copy repository and Artifacts
run: |
cp -r ${{env.TEST_CODE_PATH}}/. .
mkdir ${{env.REPORT_DIR}} -p
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
- name: Install lmdeploy - dependency
run: |
python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
- name: Install lmdeploy
run: |
python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
python3 -m pip install -r requirements/test.txt
- name: Install vlmeval
run: |
python3 -m pip install pandas datasets scikit-learn pylatexenc math_verify
apt update && apt install -y libgl1 libglib2.0-0
cp -r /nvme/qa_test_models/offline_pkg/VLMEvalKit .
cd VLMEvalKit && pip install .
- name: Check env
run: |
python3 -m pip list
lmdeploy check_env
mkdir ${{env.REPORT_DIR}} -p
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
- name: Setup paths for evaluation
if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
run: |
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
cd VLMEvalKit && cp -r ../autotest .
execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and infer" -n ${{matrix.n}} --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
exit $overall_exit
- name: Clear workspace
if: always()
run: |
echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
export workdir=$(pwd)
rm -rf $workdir/*

+ 1
- 0
autotest/config-ascend.yaml View File

@@ -2,6 +2,7 @@ model_path: /mnt/vc-intern-delivery/qa-llm-cicd/qa_test_models
resource_path: /mnt/vc-intern-delivery/qa-llm-cicd/resource
log_path: /mnt/vc-intern-delivery/qa-llm-cicd/log
eval_log_path: /mnt/vc-intern-delivery/qa-llm-cicd/evaluation_report
mllm_eval_log_path: /mnt/vc-intern-delivery/qa-llm-cicd/mllm_evaluation_report
benchmark_path: /mnt/vc-intern-delivery/qa-llm-cicd/benchmark-reports
dataset_path: /mnt/vc-intern-delivery/qa-llm-cicd/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
prefix_dataset_path: /mnt/vc-intern-delivery/qa-llm-cicd/datasets/prefix_cache_test.json


+ 5
- 0
autotest/config-h.yaml View File

@@ -2,6 +2,7 @@ model_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/model
resource_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/resource
log_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/log
eval_log_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/evaluation_report
mllm_eval_log_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/mllm_evaluation_report
benchmark_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/benchmark-reports
dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
prefix_dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/prefix_cache_test.json
@@ -204,3 +205,7 @@ evaluate_model:
- deepseek/DeepSeek-V3.1
- moonshotai/Kimi-K2-Instruct-0905
- Qwen/Qwen3-235B-A22B-Thinking-2507

mllm_evaluate_model:
- internlm/Intern-S1
- internlm/Intern-S1-mini

+ 40
- 7
autotest/config.yaml View File

@@ -2,6 +2,7 @@ model_path: /nvme/qa_test_models
resource_path: /nvme/qa_test_models/resource
log_path: /nvme/qa_test_models/autotest_model/log
eval_log_path: /nvme/qa_test_models/evaluation_report
mllm_eval_log_path: /nvme/qa_test_models/mllm_evaluation_report
benchmark_path: /nvme/qa_test_models/benchmark-reports
dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
prefix_dataset_path: /nvme/qa_test_models/datasets/prefix_cache_test.json
@@ -23,6 +24,8 @@ tp_config:
Qwen3-235B-A22B: 8
Qwen3-32B: 2
Qwen3-30B-A3B: 2
Qwen3-VL-32B-Instruct: 2
Qwen3-VL-30B-A3B-Instruct: 2
Qwen3-30B-A3B-Base: 2
Qwen2.5-32B-Instruct: 2
Qwen2.5-72B-Instruct: 4
@@ -38,6 +41,7 @@ tp_config:
InternVL2-Llama3-76B-AWQ: 4
gpt-oss-20b-BF16: 2
gpt-oss-120b-BF16: 4
InternVL3_5-30B-A3B: 2



@@ -56,6 +60,7 @@ turbomind_chat_model:
- internlm/internlm3-8b-instruct-awq
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -74,6 +79,9 @@ turbomind_chat_model:
- Qwen/Qwen3-32B
- Qwen/Qwen3-30B-A3B
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
@@ -119,6 +127,7 @@ pytorch_chat_model:
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -138,6 +147,9 @@ pytorch_chat_model:
- Qwen/Qwen3-32B
- Qwen/Qwen3-30B-A3B
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
@@ -181,6 +193,7 @@ turbomind_vl_model:
- internlm/Intern-S1-mini
- OpenGVLab/InternVL2_5-26B-MPO
- OpenGVLab/Mini-InternVL-Chat-2B-V1-5
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -191,6 +204,9 @@ turbomind_vl_model:
- OpenGVLab/InternVL2-2B
- OpenGVLab/InternVL2-40B
- OpenGVLab/InternVL2-Llama3-76B-AWQ
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct
- Qwen/Qwen2.5-VL-32B-Instruct
- Qwen/Qwen2-VL-2B-Instruct
@@ -208,6 +224,7 @@ pytorch_vl_model:
- internlm/Intern-S1-mini
- OpenGVLab/InternVL2_5-26B-MPO
- OpenGVLab/Mini-InternVL-Chat-2B-V1-5
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -219,6 +236,9 @@ pytorch_vl_model:
- OpenGVLab/InternVL2-4B
- OpenGVLab/InternVL2-40B
- OpenGVLab/Mono-InternVL-2B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2-VL-2B-Instruct
- Qwen/Qwen2-VL-7B-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct
@@ -283,6 +303,9 @@ turbomind_quantization:
- Qwen/Qwen3-30B-A3B
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-30B-A3B-Base
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
@@ -388,10 +411,20 @@ benchmark_model:


evaluate_model:
- google/gemma-2-9b-it
- google/gemma-2-27b-it
- meta-llama/Meta-Llama-3-1-8B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
- Qwen/Qwen1.5-MoE-A2.7B-Chat
- Qwen/Qwen3-30B-A3B
- google/gemma-2-9b-it
- google/gemma-2-27b-it
- meta-llama/Meta-Llama-3-1-8B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
- Qwen/Qwen1.5-MoE-A2.7B-Chat
- Qwen/Qwen3-30B-A3B


mllm_evaluate_model:
- internlm/Intern-S1-mini
- OpenGVLab/InternVL3-8B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- internlm/Intern-S1
- OpenGVLab/InternVL3_5-30B-A3B

+ 42
- 64
autotest/evaluate/test_api_evaluate.py View File

@@ -2,35 +2,13 @@ import os
import time

import pytest
import utils.constant as constant
from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid
from utils.evaluate_utils import restful_test
from utils.evaluate_utils import eval_test
from utils.proxy_distributed_utils import ApiServerPerTest, proxy_worker_node_wait
from utils.ray_distributed_utils import ray_worker_node_wait
from utils.run_restful_chat import start_proxy_server, start_restful_api, stop_restful_api

DEFAULT_PORT = 23333
PROXY_PORT = 8000

EVAL_CONFIGS = {
'default': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
},
'gpt': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
'openai_extra_kwargs': {
'reasoning_effort': 'high',
}
}
}


@pytest.fixture(scope='function')
def prepare_environment(request, config, worker_id):
@@ -46,9 +24,9 @@ def prepare_environment(request, config, worker_id):
@pytest.fixture(scope='function')
def prepare_environment_judge_evaluate(request, config, worker_id):
if get_workerid(worker_id) is None:
port = PROXY_PORT
port = constant.PROXY_PORT
else:
port = PROXY_PORT + get_workerid(worker_id)
port = constant.PROXY_PORT + get_workerid(worker_id)
judge_config = {
'model': 'Qwen/Qwen2.5-32B-Instruct',
'backend': 'turbomind',
@@ -92,25 +70,25 @@ def _run_ray_distributed_test(
assert manager is not None, 'Manager instance must be provided'
if 'gpt' in model_param.get('model', '').lower():
eval_config_name = 'gpt'
preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

if manager.is_master:
model_name = model_param['model']
model_path = os.path.join(config['model_path'], model_name)
preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

# Start API Server for current model (master node starts/stops, worker nodes verify)
manager.start_lmdeploy_api_server(model_path=model_path, model_param=model_param)

try:
print(f'🧪 Master node executing {test_type} test ({eval_config_name})...')
result, msg = restful_test(config,
run_id,
model_param,
worker_id=worker_id,
port=PROXY_PORT,
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
model_param,
worker_id=worker_id,
port=constant.PROXY_PORT,
test_type=test_type,
**preset_config)
assert result, f'❌ {test_type} test failed: {msg}'
print(f'✅ {test_type} test passed')

@@ -134,7 +112,7 @@ def _run_proxy_distributed_test(config,
if 'gpt' in model_param.get('model', '').lower():
eval_config_name = 'gpt'

preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})
model_name = model_param['model']
model_path = os.path.join(config['model_path'], model_name)

@@ -146,13 +124,13 @@ def _run_proxy_distributed_test(config,
api_server.wait_until_ready()
print(f'🧪 Master node executing {test_type} test ({eval_config_name})...')

result, msg = restful_test(config,
run_id,
model_param,
worker_id=worker_id,
port=PROXY_PORT,
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
model_param,
worker_id=worker_id,
port=constant.PROXY_PORT,
test_type=test_type,
**preset_config)
assert result, f'❌ {test_type} test failed: {msg}'
print(f'✅ {test_type} test passed')

@@ -171,9 +149,9 @@ def get_turbomind_model_list(tp_num):
new_model_list = []
for model in model_list:
if 'Qwen3-235B-A22B-Thinking-2507' in model['model']:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
else:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 '
model['cuda_prefix'] = None
new_model_list.append(model)
return new_model_list
@@ -184,9 +162,9 @@ def get_pytorch_model_list(tp_num):
new_model_list = []
for model in model_list:
if 'Qwen3-235B-A22B-Thinking-2507' in model['model']:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
else:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 '
model['cuda_prefix'] = None
new_model_list.append(model)
return new_model_list
@@ -196,29 +174,29 @@ def run_test(config, run_id, prepare_environment, worker_id, test_type='infer',
"""Run test with specified evaluation configuration."""
if 'gpt' in prepare_environment.get('model', '').lower():
eval_config_name = 'gpt'
preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

if test_type == 'infer':
port = DEFAULT_PORT
port = constant.DEFAULT_PORT
else: # eval
port = PROXY_PORT
port = constant.PROXY_PORT

if get_workerid(worker_id) is None:
result, msg = restful_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port,
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port,
test_type=test_type,
**preset_config)
else:
result, msg = restful_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port + get_workerid(worker_id),
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port + get_workerid(worker_id),
test_type=test_type,
**preset_config)
return result, msg




+ 290
- 0
autotest/evaluate/test_mllm_api_evaluate.py View File

@@ -0,0 +1,290 @@
import pytest
import utils.constant as constant
from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid
from utils.evaluate_utils import mllm_eval_test
from utils.run_restful_chat import start_proxy_server, start_restful_api, stop_restful_api


@pytest.fixture(scope='function')
def prepare_environment(request, config, worker_id):
    """Start a restful API server for the parametrized model and tear it down after the test.

    Yields the (mutated) model parameter dict so the test can inspect model/backend settings.
    """
    model_param = request.param
    model_name = model_param['model']
    backend_name = model_param['backend']
    # Append server flags; model name on disk is the last path component of the repo id.
    extra_parts = [
        model_param.get('extra', ''),
        '--model-name',
        model_name.split('/')[-1],
        '--cache-max-entry-count 0.6',
    ]
    model_param['extra'] = ' '.join(extra_parts)  # noqa
    model_dir = config.get('model_path') + '/' + model_name
    server_pid, server_handle = start_restful_api(config, model_param, model_name, model_dir, backend_name, worker_id)
    yield model_param
    stop_restful_api(server_pid, server_handle, model_param)


@pytest.fixture(scope='function')
def prepare_environment_judge_evaluate(request, config, worker_id):
    """Bring up the proxy server plus the Qwen2.5-32B judge model for evaluation runs.

    Teardown stops the judge API server first, then the proxy, mirroring startup order.
    """
    offset = get_workerid(worker_id)
    port = constant.PROXY_PORT if offset is None else constant.PROXY_PORT + offset

    # Fixed judge-model deployment (turbomind backend, tp=2), registered at the proxy.
    judge_model = 'Qwen/Qwen2.5-32B-Instruct'
    judge_backend = 'turbomind'
    judge_param = {
        'tp_num': 2,
        'extra': ('--server-name 127.0.0.1 --proxy-url http://127.0.0.1:{} --session-len 46000 '
                  '--model-name Qwen2.5-32B-Instruct '
                  '--cache-max-entry-count 0.7 '.format(port)),
        'cuda_prefix': None,
    }
    judge_model_dir = config.get('model_path') + '/' + judge_model

    proxy_pid, proxy_process = start_proxy_server(config, worker_id)
    judge_pid, judge_start_res = start_restful_api(config, judge_param, judge_model, judge_model_dir, judge_backend,
                                                   worker_id)
    try:
        yield request.param
    finally:
        stop_restful_api(judge_pid, judge_start_res, request.param)
        stop_restful_api(proxy_pid, proxy_process, request.param)


def get_turbomind_vl_model_list(tp_num):
    """Return the turbomind multimodal evaluate-model entries for *tp_num*, with cuda_prefix cleared."""
    models = get_evaluate_turbomind_model_list(tp_num, is_mllm=True, kvint_list=[4, 8])
    for entry in models:
        entry['cuda_prefix'] = None
    return list(models)


def get_pytorch_vl_model_list(tp_num):
    """Return the pytorch multimodal evaluate-model entries for *tp_num*, with cuda_prefix cleared."""
    models = get_evaluate_pytorch_model_list(tp_num, is_mllm=True)
    for entry in models:
        entry['cuda_prefix'] = None
    return list(models)


def run_test(config, run_id, prepare_environment, worker_id, test_type='infer', eval_config_name='default'):
    """Run an mllm evaluation against the deployed model and return ``(result, msg)``.

    GPT-family models are forced onto the 'gpt' preset. Inference goes to the API
    server port, evaluation to the proxy port; both are shifted by the pytest-xdist
    worker index when one is present.
    """
    if 'gpt' in prepare_environment.get('model', '').lower():
        eval_config_name = 'gpt'
    preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

    base_port = constant.DEFAULT_PORT if test_type == 'infer' else constant.PROXY_PORT
    offset = get_workerid(worker_id)
    target_port = base_port if offset is None else base_port + offset

    result, msg = mllm_eval_test(config,
                                 run_id,
                                 prepare_environment,
                                 worker_id=worker_id,
                                 port=target_port,
                                 test_type=test_type,
                                 **preset_config)
    return result, msg


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=1), indirect=True)
def test_turbomind_vl_eval_tp1(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=1 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=2), indirect=True)
def test_turbomind_vl_eval_tp2(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=2 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_4
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=4), indirect=True)
def test_turbomind_vl_eval_tp4(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=4 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_8
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=8), indirect=True)
def test_turbomind_vl_eval_tp8(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=8 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=1), indirect=True)
def test_pytorch_vl_eval_tp1(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=1 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=2), indirect=True)
def test_pytorch_vl_eval_tp2(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=2 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_4
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=4), indirect=True)
def test_pytorch_vl_eval_tp4(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=4 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_8
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=8), indirect=True)
def test_pytorch_vl_eval_tp8(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=8 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_16
@pytest.mark.test_ascend
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=16), indirect=True)
def test_pytorch_vl_eval_tp16(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=16 pytorch VL models (Ascend target)."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.gpu_num_1
@pytest.mark.test_ascend
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=1), indirect=True)
def test_pytorch_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id):
result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
assert result, msg


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.gpu_num_2
@pytest.mark.test_ascend
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=2), indirect=True)
def test_pytorch_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id):
result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
assert result, msg


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_4
@pytest.mark.test_ascend
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=4), indirect=True)
def test_pytorch_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id):
result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
assert result, msg


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_8
@pytest.mark.test_ascend
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=8), indirect=True)
def test_pytorch_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of pytorch-backend VL models served with tp=8."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_16
@pytest.mark.test_ascend
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=16), indirect=True)
def test_pytorch_judgeeval_tp16(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of pytorch-backend VL models served with tp=16."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=1), indirect=True)
def test_turbomind_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=1."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=2), indirect=True)
def test_turbomind_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=2."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_4
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=4), indirect=True)
def test_turbomind_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=4."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_8
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=8), indirect=True)
def test_turbomind_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=8."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail

+ 1169
- 0
autotest/interface/restful/test_restful_generate.py View File

@@ -0,0 +1,1169 @@
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Any, Dict, List

import pytest
import requests
from transformers import AutoTokenizer
from utils.toolkit import encode_text, parse_sse_stream

# Endpoint configuration: the api_server under test is expected to be
# reachable locally on this host/port before the suite runs.
BASE_HTTP_URL = 'http://127.0.0.1'
DEFAULT_PORT = 23333
# Models exercised by the parametrized TestGenerateComprehensive class below.
MODEL_LIST = ['Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B']
BASE_URL = ':'.join([BASE_HTTP_URL, str(DEFAULT_PORT)])  # e.g. 'http://127.0.0.1:23333'


@pytest.mark.parametrize('model_name', MODEL_LIST)
class TestGenerateComprehensive:

@pytest.fixture(autouse=True)
def setup_api(self, request, config, model_name):
self.api_url = f'{BASE_URL}/generate'
self.headers = {'Content-Type': 'application/json'}
self.model_name = model_name

test_name = request.node.name
safe_test_name = re.sub(r'[^\w\.-]', '_', test_name)
safe_model_name = self.model_name.replace('/', '_')
log_base = config.get('log_path', './logs')
self.log_dir = os.path.join(log_base, safe_model_name)
os.makedirs(self.log_dir, exist_ok=True)
self.log_file = os.path.join(self.log_dir, f'{safe_test_name}.log')

def _log_request_response(self, payload, response_data, stream_raw=None):
log_entry = {
'timestamp': datetime.now().isoformat(),
'model': self.model_name,
'request': payload,
'response': response_data,
}
if stream_raw is not None:
log_entry['stream_raw'] = stream_raw

try:
with open(self.log_file, 'a', encoding='utf-8') as f:
json.dump(log_entry, f, indent=2, ensure_ascii=False)
f.write('\n')
except Exception as e:
print(f'[LOG WARN] Failed to write {self.log_file}: {e}')

    def _post(self, payload, stream=False):
        """POST *payload* to the /generate endpoint and return a response-like object.

        Non-streaming: returns the real ``requests.Response`` after logging.
        Streaming: drains the SSE body, re-assembles the deltas into a dict of
        the same shape as a non-streaming response, and returns a minimal mock
        exposing ``.json()`` and ``.status_code`` so callers can treat both
        paths uniformly. Raises ``requests.HTTPError`` on non-2xx status.
        """
        # Default the model field so individual tests do not have to set it.
        if 'model' not in payload:
            payload['model'] = self.model_name

        resp = requests.post(self.api_url, json=payload, headers=self.headers, stream=stream, timeout=60)
        resp.raise_for_status()

        if stream:
            # Drain the entire SSE stream first; parse afterwards.
            raw_content = ''
            for chunk in resp.iter_content(chunk_size=None):
                if chunk:
                    raw_content += chunk.decode('utf-8')

            events = parse_sse_stream(raw_content)
            accumulated_text = ''
            output_ids = []
            stream_events_count = 0

            for event in events:
                if event == '[DONE]':
                    break
                try:
                    # NOTE(review): str.replace strips every 'data: ' occurrence,
                    # not only the SSE prefix — assumes payload text never
                    # contains that substring; confirm against parse_sse_stream.
                    data_str = event.replace('data: ', '').strip()
                    if not data_str:
                        continue
                    data = json.loads(data_str)
                    delta = data.get('text', '')
                    if isinstance(delta, str):
                        accumulated_text += delta
                    ids = data.get('output_ids')
                    if isinstance(ids, list):
                        output_ids.extend(ids)
                    stream_events_count += 1
                except Exception as e:
                    # Malformed events are reported but do not abort the drain.
                    print(f'Error parsing stream event: {e}')
                    continue

            # Mirror the non-streaming response shape so validators work on both.
            fake_resp = {
                'text': accumulated_text,
                'output_ids': output_ids,
                'meta_info': {
                    'stream_events': stream_events_count
                }
            }
            self._log_request_response(payload, fake_resp, raw_content)

            class MockResp:
                # Minimal stand-in for requests.Response used by callers.

                def json(self):
                    return fake_resp

                @property
                def status_code(self):
                    return 200

            return MockResp()

        else:
            data = resp.json()
            self._log_request_response(payload, data)
            return resp

def _validate_generation_response(self,
data: Dict[str, Any],
expected_fields: List[str] = None,
validate_tokens: bool = True,
expect_logprobs: bool = False,
validate_experts: bool = False) -> None:
assert isinstance(data, dict), f'Response should be a dict, got {type(data)}'

required_fields = ['text']
for field in required_fields:
assert field in data, f'Missing required field: {field}'
assert data[field] is not None, f'Field {field} should not be None'

assert isinstance(data['text'], str), \
f"text should be string, got {type(data['text'])}"

if validate_experts:
assert 'routed_experts' in data[
'meta_info'], "Response should contain 'routed_experts' when validate_experts=True"

experts_data = data['meta_info']['routed_experts']

assert isinstance(experts_data, list)
assert len(experts_data) > 0

total_steps = len(experts_data)

for step_idx in range(total_steps):
token_experts = experts_data[step_idx]

assert isinstance(token_experts, list)
assert len(token_experts) > 0

for layer_idx in range(len(token_experts)):
layer_experts = token_experts[layer_idx]

assert isinstance(layer_experts, list)
assert len(layer_experts) == 8

for expert_idx, expert_id in enumerate(layer_experts):
assert isinstance(expert_id, int)
assert 0 <= expert_id < 256, f'Invalid expert_id: {expert_id}. Must be in [0, 256)'

if validate_tokens:
assert 'output_ids' in data, "Response should contain 'output_ids'"
output_ids = data['output_ids']

assert isinstance(output_ids, list), \
f'output_ids should be list, got {type(output_ids)}'
assert len(output_ids) >= 0, 'output_ids should not be empty'

for i, token_id in enumerate(output_ids):
assert isinstance(token_id, int), \
f'output_ids[{i}] should be int, got {type(token_id)}'

if 'meta_info' in data:
meta = data['meta_info']
assert isinstance(meta, dict), 'meta_info should be dict'

if 'completion_tokens' in meta:
assert meta['completion_tokens'] == len(output_ids), \
f"meta.completion_tokens ({meta['completion_tokens']}) " \
f'should equal len(output_ids) ({len(output_ids)})'

if expect_logprobs:
assert 'meta_info' in data, \
"Response should contain 'meta_info' when expecting logprobs"
meta = data['meta_info']
assert isinstance(meta, dict)

assert 'output_token_logprobs' in meta, \
"meta_info missing 'output_token_logprobs'"
logprobs_data = meta['output_token_logprobs']

assert isinstance(logprobs_data, list), \
'output_token_logprobs should be a list'
assert len(logprobs_data) > 0, \
'output_token_logprobs should not be empty'

if 'output_ids' in data:
assert len(logprobs_data) == len(data['output_ids']), \
f'Logprobs outer list length ({len(logprobs_data)}) != ' \
f"Output IDs length ({len(data['output_ids'])})"

for idx, item in enumerate(logprobs_data):
assert isinstance(item, list), \
f'Logprobs item at index {idx} should be a list, got {type(item)}'
assert len(item) == 2, \
f'Logprobs item at index {idx} should have 2 elements ' \
f'[logprob, token_id], got {len(item)}'

logprob_val = item[0]
assert isinstance(logprob_val, (float, int)), \
f'Logprob value at [{idx}][0] should be number, ' \
f'got {type(logprob_val)}'
assert logprob_val <= 0, \
f'Logprob value should be <= 0, got {logprob_val}'

token_id_in_logprob = item[1]
assert isinstance(token_id_in_logprob, int), \
f'Token ID in logprobs at [{idx}][1] should be int, ' \
f'got {type(token_id_in_logprob)}'

if 'output_ids' in data and idx < len(data['output_ids']):
assert token_id_in_logprob == data['output_ids'][idx], \
f'Token ID mismatch at index {idx}: output_ids has ' \
f"{data['output_ids'][idx]}, but logprobs has " \
f'{token_id_in_logprob}'

if expected_fields:
for field in expected_fields:
assert field in data, f'Missing expected field: {field}'

if 'error' in data:
assert not data['error'], f"Response contains error: {data['error']}"
if 'code' in data and data['code'] != 0:
assert False, f"Response contains error code: {data['code']}"

def test_basic_generation(self):
print(f'\n[Model: {self.model_name}] Running basic generation test')
test_cases = [{
'name': 'simple prompt',
'payload': {
'prompt': 'The sky is',
'max_tokens': 5
},
}, {
'name': 'prompt with spaces',
'payload': {
'prompt': ' Hello world ',
'max_tokens': 3
},
}, {
'name': 'unicode prompt',
'payload': {
'prompt': 'Hello, world',
'max_tokens': 3
},
}, {
'name': 'longer generation',
'payload': {
'prompt': 'Once upon a time',
'max_tokens': 10
},
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: {test_name}]')

resp = self._post(test_case['payload'])
data = resp.json()

self._validate_generation_response(data=data, validate_tokens=True)

prompt = test_case['payload']['prompt']
generated_text = data['text']
assert generated_text != prompt.strip(), \
f"Generated text should be different from prompt: '{generated_text}'"

if 'output_ids' in data:
output_ids = data['output_ids']
max_tokens = test_case['payload']['max_tokens']
max_allowed = max_tokens + 1

assert len(output_ids) <= max_allowed, \
f'Too many tokens generated: {len(output_ids)} > {max_allowed}'

meta = data.get('meta_info', {})
finish_type = meta.get('finish_reason', {}).get('type')
if len(output_ids) >= max_tokens and finish_type != 'length':
print(f'[WARN] Generated {len(output_ids)} tokens but '
f"finish_reason is not 'length': {finish_type}")

print(f" Generated text: '{generated_text[:50]}...'")
print(f" Generated tokens: {len(data.get('output_ids', []))}")

def test_input_ids_mode(self, config):
print(f'\n[Model: {self.model_name}] Running input_ids mode test')
model_path = os.path.join(config.get('model_path'), self.model_name)

test_cases = [{
'name': 'simple text',
'text': 'Hello world',
'max_tokens': 5,
'expected_min_text': 3
}, {
'name': 'question',
'text': 'What is the meaning of life?',
'max_tokens': 8,
'expected_min_text': 5
}, {
'name': 'short input',
'text': 'Yes',
'max_tokens': 3,
'expected_min_text': 1
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: input_ids - {test_name}]')

try:
input_ids = encode_text(model_path, test_case['text'])
except Exception as e:
pytest.skip(f'Tokenizer failed for {test_case["name"]}: {e}')

assert isinstance(input_ids, list), \
f'input_ids should be list, got {type(input_ids)}'
assert len(input_ids) > 0, 'input_ids should not be empty'
for i, token_id in enumerate(input_ids):
assert isinstance(token_id, int), \
f'input_ids[{i}] should be int, got {type(token_id)}'
assert token_id >= 0, \
f'input_ids[{i}] should be >= 0, got {token_id}'

resp = self._post({'input_ids': input_ids, 'max_tokens': test_case['max_tokens']})
data = resp.json()

self._validate_generation_response(data=data, validate_tokens=True)

generated_text = data['text']
try:
generated_text.encode('utf-8')
except UnicodeEncodeError:
pytest.fail(f'Generated text contains invalid UTF-8 characters: '
f'{generated_text[:100]}')

print(f' Input tokens: {len(input_ids)}')
print(f" Output tokens: {len(data.get('output_ids', []))}")
print(f" Generated text: '{generated_text[:50]}...'")

    def test_conflict_prompt_and_input_ids(self):
        """Sending both (or neither usable one of) ``prompt`` and ``input_ids``
        must be rejected with HTTP 400 and an error message that names the
        conflict between the two fields."""
        print(f'\n[Model: {self.model_name}] Running conflict test')
        test_cases = [{
            'name':
            'both provided',
            'payload': {
                'prompt': 'Hello world',
                'input_ids': [1, 2, 3, 4, 5],
                'max_tokens': 5
            },
            'expected_status':
            400,
            'expected_error_keywords': [
                'conflict', 'both', 'either', 'cannot', 'mutually exclusive', 'specify exactly one', 'prompt',
                'input_ids'
            ]
        }, {
            'name':
            'prompt with empty input_ids',
            'payload': {
                'prompt': 'Test',
                'input_ids': [],
                'max_tokens': 3
            },
            'expected_status':
            400,
            'expected_error_keywords': ['conflict', 'invalid', 'empty', 'specify exactly one', 'prompt', 'input_ids']
        }, {
            'name':
            'empty prompt with input_ids',
            'payload': {
                'prompt': '',
                'input_ids': [100, 200, 300],
                'max_tokens': 3
            },
            'expected_status':
            400,
            'expected_error_keywords': ['conflict', 'empty', 'invalid', 'specify exactly one', 'prompt', 'input_ids']
        }]

        for test_case in test_cases:
            test_name = test_case['name']
            print(f'\n[Test: conflict - {test_name}]')

            try:
                # Raw requests.post (not self._post) because raise_for_status
                # would turn the expected 4xx into an exception.
                resp = requests.post(self.api_url, json=test_case['payload'], headers=self.headers, timeout=30)

                assert resp.status_code == test_case['expected_status'], \
                    f"Expected status {test_case['expected_status']}, " \
                    f'got {resp.status_code}'

                error_data = resp.json()
                assert 'error' in error_data or 'message' in error_data, \
                    "Error response should contain 'error' or 'message' field"

                error_msg = ''
                if 'error' in error_data:
                    error_msg = str(error_data['error']).lower()
                elif 'message' in error_data:
                    error_msg = str(error_data['message']).lower()

                # Accept any single expected keyword in the (lowercased) message.
                keywords_found = any(keyword in error_msg for keyword in test_case['expected_error_keywords'])

                if not keywords_found:
                    # Fallback: message mentions both field names AND uses some
                    # exclusivity phrasing — treat that as an adequate error too.
                    has_both_fields = ('prompt' in error_msg and 'input_ids' in error_msg)
                    has_exclusivity = any(phrase in error_msg for phrase in [
                        'only one', 'specify exactly', 'cannot both', 'mutually exclusive', 'exactly one',
                        'must specify'
                    ])
                    if has_both_fields and has_exclusivity:
                        keywords_found = True

                assert keywords_found, \
                    f'Error message should indicate conflict between prompt and ' \
                    f'input_ids, got: {error_msg}'

                # A rejected request must not carry generation output.
                assert 'text' not in error_data, \
                    "Error response should not contain 'text' field"
                assert 'output_ids' not in error_data, \
                    "Error response should not contain 'output_ids' field"

                print(f' Got expected error: {error_msg[:100]}...')

            except Exception as e:
                print(f' Unexpected error: {e}')
                raise

@pytest.mark.logprob
def test_input_ids_with_logprob(self, config):
print(f'\n[Model: {self.model_name}] Running input_ids with logprob test')
model_path = os.path.join(config.get('model_path'), self.model_name)

test_cases = [{
'name': 'basic logprob',
'text': 'The weather is',
'max_tokens': 3,
'expected_min_text': 3
}, {
'name': 'single token generation',
'text': 'Hello',
'max_tokens': 1,
'expected_min_text': 1
}, {
'name': 'multiple tokens with logprob',
'text': 'Artificial intelligence is',
'max_tokens': 5,
'expected_min_text': 5
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: logprob - {test_name}]')

try:
input_ids = encode_text(model_path, test_case['text'])
except Exception as e:
pytest.skip(f'Tokenizer failed for {test_case["name"]}: {e}')

request_payload = {'input_ids': input_ids, 'max_tokens': test_case['max_tokens'], 'return_logprob': True}

resp = self._post(request_payload)
data = resp.json()

self._validate_generation_response(data=data, validate_tokens=True, expect_logprobs=True)

assert 'meta_info' in data, \
"Response should contain 'meta_info' when return_logprob=True"
meta = data['meta_info']

assert 'output_token_logprobs' in meta, \
"meta_info should contain 'output_token_logprobs'"
logprobs = meta['output_token_logprobs']

logprob_values = []

for i, item in enumerate(logprobs):
logprob_values.append(item[0])

avg_logprob = sum(logprob_values) / len(logprob_values)
if avg_logprob < -10.0:
pytest.fail(f'Generation confidence critically low '
f'(Avg: {avg_logprob:.2f})')

generated_text = data.get('text', '')
print(f' Generated tokens: {len(logprob_values)}')
print(f' Avg Logprob: {avg_logprob:.3f}')
print(f" Generated text: '{generated_text[:50]}...'")

def test_stop_str_with_include_flag(self):
print(f'\n[Model: {self.model_name}] Running stop_str with include flag test')
test_cases = [{
'name': 'simple stop word',
'prompt': 'Count: 1, 2, 3, ',
'stop_word': '6',
'max_tokens': 10,
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: stop_str - {test_name}]')

prompt = test_case['prompt']
stop_word = test_case['stop_word']
max_tokens = test_case['max_tokens']

print(' Testing EXCLUDE mode (include_stop=False)...')
resp1 = self._post({
'prompt': prompt,
'max_tokens': max_tokens,
'stop': [stop_word],
'include_stop_str_in_output': False,
'return_logprob': True
})

self._validate_generation_response(resp1.json())
text_exclude = resp1.json()['text']
assert stop_word not in text_exclude, \
f"Stop word '{stop_word}' should NOT be in output when include_stop=False"

print(' Testing INCLUDE mode (include_stop=True)...')
resp2 = self._post({
'prompt': prompt,
'max_tokens': max_tokens,
'stop': [stop_word],
'include_stop_str_in_output': True,
'return_logprob': True
})

self._validate_generation_response(resp2.json())
text_include = resp2.json()['text']
assert stop_word in text_include, \
f"Stop word '{stop_word}' should be in output when include_stop=True"

def test_streaming_mode(self):
print(f'\n[Model: {self.model_name}] Running streaming mode test')
prompt = 'Count: 1, 2,'

resp = self._post({'prompt': prompt, 'max_tokens': 8, 'stream': True}, stream=True)
assert resp.status_code == 200
data = resp.json()

text = data['text']
output_ids = data['output_ids']
meta = data['meta_info']

assert isinstance(text, str) and len(text.strip()) > 0, \
'Generated text cannot be empty'
assert len(output_ids) >= 3, 'Output token count should be reasonable'

import re
count_matches = len(re.findall(r'\b[3-9]\b', text))
assert count_matches >= 2, \
f'Expected continuation of counting, but not enough numbers found ' \
f'(found {count_matches})'

stream_events = meta.get('stream_events', [])
assert stream_events >= len(output_ids), \
'Streaming event count should not be less than output token count'

print(f" Generated text: '{text}'")
print(f' Output tokens: {len(output_ids)}, '
f'Stream events: {stream_events}')

def test_streaming_incremental_correctness(self):
print(f'\n[Model: {self.model_name}] Running streaming incremental correctness test')
prompt = 'The sky is '

raw_resp = requests.post(self.api_url,
json={
'prompt': prompt,
'max_tokens': 10,
'stream': True
},
headers=self.headers,
stream=True,
timeout=30)
raw_resp.raise_for_status()

full_text_from_delta = ''
tokens_from_delta = []
event_count = 0

print(' Streaming chunks:')
for line in raw_resp.iter_lines():
if line:
line_str = line.decode('utf-8').strip()
if line_str.startswith('data: ') and '[DONE]' not in line_str:
try:
json_str = line_str[6:]
payload = json.loads(json_str)

delta_text = payload.get('text', '')
token_id = payload.get('token_id')

full_text_from_delta += delta_text
if token_id is not None:
tokens_from_delta.append(token_id)

event_count += 1
if delta_text.strip():
print(f" +'{delta_text}'")

except Exception as e:
print(f' [Parse warning]: {e}')
continue

assert len(full_text_from_delta.strip()) > 0, \
'Assembled text from streaming deltas is empty'
assert event_count >= 3, \
f'Too few streaming events received ({event_count}), ' \
f'connection might be interrupted'

print(f" Final assembled text: '{full_text_from_delta}'")
print(f' Total events received: {event_count}')

@pytest.mark.logprob
def test_return_logprob(self):
print(f'\n[Model: {self.model_name}] Running return_logprob test')

resp = self._post({'prompt': 'Paris is the capital of', 'max_tokens': 2, 'return_logprob': True})
data = resp.json()

self._validate_generation_response(data, validate_tokens=True, expect_logprobs=True)

print(f" Generated text: '{data['text']}'")

def test_same_session_id_allowed(self):
print(f'\n[Model: {self.model_name}] Running same session_id test')
sid = 9999

resp1 = self._post({'prompt': 'First message:', 'session_id': sid, 'max_tokens': 2})
resp2 = self._post({'prompt': 'Second message:', 'session_id': sid, 'max_tokens': 2})

assert resp1.status_code == 200
assert resp2.status_code == 200

data1 = resp1.json()
data2 = resp2.json()

self._validate_generation_response(data1)
self._validate_generation_response(data2)

text1 = data1['text'].strip()
text2 = data2['text'].strip()
assert text1 != text2

print(f" First response: '{data1['text']}'")
print(f" Second response: '{data2['text']}'")

def test_empty_prompt_rejected(self):
print(f'\n[Model: {self.model_name}] Running empty prompt test')

with pytest.raises(requests.HTTPError) as exc:
self._post({'prompt': '', 'max_tokens': 5})

assert exc.value.response.status_code == 400

try:
error_response = exc.value.response.json()
print(f' Error response: {error_response}')
assert 'error' in error_response or 'message' in error_response
except json.JSONDecodeError:
print(f' Non-JSON error: {exc.value.response.text[:100]}')

def test_input_ids_rejected(self):
print(f'\n[Model: {self.model_name}] Running input_ids invalid cases test')

invalid_cases = [{
'case': {
'input_ids': [],
'max_tokens': 5
},
'desc': 'Empty input_ids list'
}, {
'case': {
'input_ids': 'not_a_list',
'max_tokens': 5
},
'desc': 'input_ids is a string, not list'
}, {
'case': {
'max_tokens': 5
},
'desc': 'Missing input_ids field'
}]

for invalid_case in invalid_cases:
test_desc = invalid_case['desc']
payload = invalid_case['case']

with pytest.raises(requests.HTTPError) as exc_info:
self._post(payload)

response = exc_info.value.response
assert response.status_code in [400, 422], (f"Bad Request for case '{test_desc}', "
f'but got {response.status_code}')

def test_stress_concurrent_requests(self):
print(f'\n[Model: {self.model_name}] Running stress concurrent requests test')

def single_request(idx):
start_time = time.time()
try:
resp = requests.post(self.api_url,
json={
'prompt': f'Hello, task {idx}',
'max_tokens': 5,
'stream': False
},
headers=self.headers,
timeout=10)
resp.raise_for_status()
data = resp.json()

if 'text' in data and len(data['text'].strip()) > 0:
latency = time.time() - start_time
return {'success': True, 'latency': latency}
else:
return {'success': False, 'error': 'Empty response'}

except Exception as e:
return {'success': False, 'error': str(e)}

success_count = 0
total_latency = 0
failures = []

with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(single_request, i) for i in range(20)]

for i, future in enumerate(as_completed(futures)):
result = future.result()
if result['success']:
success_count += 1
total_latency += result['latency']
print(f" Req {i}: ✓ (Latency: {result['latency']:.2f}s)")
else:
failures.append(result['error'])
print(f' Req {i}: ✗')

success_rate = success_count / 20
assert success_rate == 1.0, \
f'Stress test failed: success rate {success_rate*100}% < 80%'

if success_count > 0:
avg_latency = total_latency / success_count
assert avg_latency < 5.0, \
f'Average latency too high: {avg_latency:.2f}s'
print(f' Performance: Avg Latency={avg_latency:.2f}s')

print(f' Summary: {success_count}/20 succeeded')

def test_stress_long_prompt_and_generation(self):
print(f'\n[Model: {self.model_name}] Running stress long prompt test')

long_prompt = 'Summarize: The quick brown fox jumps over the lazy dog. ' * 100

resp = self._post({'prompt': long_prompt, 'max_tokens': 512, 'temperature': 0.7})

data = resp.json()
self._validate_generation_response(data=data, validate_tokens=True)

def test_stress_streaming_under_load(self):
print(f'\n[Model: {self.model_name}] Running stress streaming under load test')

def stream_request(idx):
try:
resp = requests.post(self.api_url,
json={
'prompt': f'Stream load test {idx}',
'max_tokens': 10,
'stream': True
},
headers=self.headers,
stream=True,
timeout=30)

assert resp.status_code == 200
content_type = resp.headers.get('Content-Type', '')
assert 'text/event-stream' in content_type or \
'application/x-ndjson' in content_type

full_text = ''
event_count = 0
for line in resp.iter_lines():
if line and line.startswith(b'data:'):
event_count += 1
if b'[DONE]' in line:
break
try:
payload = json.loads(line.decode().replace('data: ', '', 1))
full_text += payload.get('text', '')
except Exception:
pass

assert len(full_text) > 0
assert event_count >= 3

return True

except Exception as e:
print(f' Stream {idx} error: {e}')
return False

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(stream_request, i) for i in range(10)]
results = [f.result() for f in futures]

success_count = sum(results)

assert success_count == 10, \
f'Concurrent streaming test failure rate too high: {success_count}/10'

print(f' Streaming under load: {success_count}/10 succeeded')

def test_temperature_parameter(self):
print(f'\n[Model: {self.model_name}] Running temperature parameter test')
prompt = 'The capital of France is'

resp_low = self._post({'prompt': prompt, 'max_tokens': 10, 'temperature': 0.1, 'stream': False})
resp_high = self._post({'prompt': prompt, 'max_tokens': 10, 'temperature': 0.9, 'stream': False})

data_low = resp_low.json()
data_high = resp_high.json()

self._validate_generation_response(data=data_low, validate_tokens=True)
self._validate_generation_response(data=data_high, validate_tokens=True)

assert 'Paris' in data_low['text'] or \
'paris' in data_low['text'].lower(), \
"Low temperature didn't answer correct capital"
assert data_low['text'] != data_high['text'], \
'High and low temperature outputs identical, ' \
'temperature may not be effective'

def test_top_p_parameter(self):
print(f'\n[Model: {self.model_name}] Running top_p parameter test')
prompt = 'The weather today is'

resp_strict = self._post({'prompt': prompt, 'max_tokens': 20, 'top_p': 0.01, 'stream': False})
resp_loose = self._post({'prompt': prompt, 'max_tokens': 20, 'top_p': 0.99, 'stream': False})

text_strict = resp_strict.json()
text_loose = resp_loose.json()

self._validate_generation_response(data=text_strict, validate_tokens=True)
self._validate_generation_response(data=text_loose, validate_tokens=True)

def test_top_k_parameter(self):
print(f'\n[Model: {self.model_name}] Running top_k parameter test')
prompt = 'Artificial intelligence'

resp_k10 = self._post({'prompt': prompt, 'max_tokens': 10, 'top_k': 10, 'stream': False})
resp_k50 = self._post({'prompt': prompt, 'max_tokens': 10, 'top_k': 50, 'stream': False})

text_k10 = resp_k10.json()
text_k50 = resp_k50.json()

self._validate_generation_response(data=text_k10, validate_tokens=True)
self._validate_generation_response(data=text_k50, validate_tokens=True)

def test_min_p_parameter(self):
print(f'\n[Model: {self.model_name}] Running min_p parameter test')
prompt = 'Machine learning is'

resp = self._post({'prompt': prompt, 'max_tokens': 10, 'min_p': 0.05, 'stream': False})
data = resp.json()
self._validate_generation_response(data)

def test_repetition_penalty(self):
print(f'\n[Model: {self.model_name}] Running repetition penalty test')
prompt = 'Repeat repeat repeat repeat'

resp_no_penalty = self._post({'prompt': prompt, 'max_tokens': 10, 'repetition_penalty': 1.0, 'stream': False})
resp_penalty = self._post({'prompt': prompt, 'max_tokens': 10, 'repetition_penalty': 1.5, 'stream': False})

text_no_penalty = resp_no_penalty.json()['text']
text_penalty = resp_penalty.json()['text']

def count_repeats(text):
words = text.lower().split()
return sum(1 for i in range(1, len(words)) if words[i] == words[i - 1])

repeats_no_penalty = count_repeats(text_no_penalty)
repeats_penalty = count_repeats(text_penalty)

assert repeats_penalty <= repeats_no_penalty, (
f'High penalty coefficient ({1.5}) repetition count ({repeats_penalty}) '
f'not less than low penalty ({1.0}) count ({repeats_no_penalty}), '
f'repetition_penalty ineffective')

def test_ignore_eos_parameter(self):
print(f'\n[Model: {self.model_name}] Running ignore_eos parameter test')
prompt = 'The sky is blue.'

resp_normal = self._post({'prompt': prompt, 'ignore_eos': False, 'stream': False})
data_normal = resp_normal.json()
self._validate_generation_response(data_normal)

resp_ignore = self._post({'prompt': prompt, 'ignore_eos': True, 'stream': False})
data_ignore = resp_ignore.json()
self._validate_generation_response(data_ignore)

reason_ignore = data_ignore.get('meta_info', {}).get('finish_reason', {}).get('type', 'unknown')

assert reason_ignore == 'length', \
f'ignore_eos=True must end due to length, actual: {reason_ignore}'

def test_skip_special_tokens(self, config):
print(f'[Model: {self.model_name}] Running skip_special_tokens test')
model_path = os.path.join(config.get('model_path'), self.model_name)
user_content = 'Hello [world]! This is a [test].'

tokenizer = AutoTokenizer.from_pretrained(model_path)
special_tokens_map = tokenizer.special_tokens_map

special_patterns = list(special_tokens_map.values())
special_patterns = [
item for sublist in special_patterns for item in (sublist if isinstance(sublist, list) else [sublist])
]

print('Special patterns:', special_patterns)

print(' Executing skip_special_tokens=True')
payload_true = {'prompt': user_content, 'max_tokens': 100, 'skip_special_tokens': True, 'stream': False}
resp_true = self._post(payload_true)
data_true = resp_true.json()
self._validate_generation_response(data=data_true, validate_tokens=True)
generated_text = data_true['text']
assert not any(pattern in generated_text for pattern in special_patterns), \
'Expected no special pattern in the generated text but found one.'

def test_stop_token_ids(self):
print(f'\n[Model: {self.model_name}] Running stop_token_ids test')
payload = {'prompt': 'Once upon a time', 'max_tokens': 50, 'stop_token_ids': [11], 'stream': False}

resp = self._post(payload)
assert resp.status_code == 200, \
f'HTTP request failed, status code: {resp.status_code}'

try:
data = resp.json()
except Exception as e:
pytest.fail(f'Response JSON parsing failed: {e}')

self._validate_generation_response(data)

generated_text = data.get('text', '')
finish_reason = data.get('meta_info', {}).get('finish_reason', {}).get('type', 'unknown')
actual_length = len(generated_text)

assert finish_reason in ['stop', 'eos'], \
f'Expected generation to end due to stop token, ' \
f'actual reason: {finish_reason}. This may mean stop_token_ids [11] ' \
f"didn't take effect, or generation was truncated."

print(f'\n stop_token_ids=[11] generation result: length={actual_length}, '
f"end reason='{finish_reason}', text='{generated_text[:20]}...'")

def test_combined_parameters(self):
print(f'\n[Model: {self.model_name}] Running combined parameters test')
resp = self._post({
'prompt': 'The future of AI',
'max_tokens': 15,
'temperature': 0.7,
'top_p': 0.9,
'top_k': 40,
'repetition_penalty': 1.1,
'stream': False
})

assert resp.status_code == 200
data = resp.json()
self._validate_generation_response(data)

def test_streaming_with_all_parameters(self):
print(f'\n[Model: {self.model_name}] Running streaming with all parameters test')
resp = self._post(
{
'prompt': 'Streaming test with parameters',
'max_tokens': 10,
'temperature': 0.8,
'top_p': 0.85,
'top_k': 30,
'repetition_penalty': 1.2,
'stop': ['test'],
'stream': True
},
stream=True)

assert resp.status_code == 200
data = resp.json()
self._validate_generation_response(data)

stream_events = data['meta_info'].get('stream_events', [])

assert stream_events == len(data['output_ids']) + 1, \
'Streaming event count should not be less than generated token count'

def test_invalid_temperature_values(self):
print(f'\n[Model: {self.model_name}] Running invalid temperature values test')
resp1 = self._post({'prompt': 'Test', 'max_tokens': 3, 'temperature': 0.0, 'stream': False})
assert resp1.status_code == 200, 'temperature=0.0 should be valid'

with pytest.raises(requests.HTTPError) as exc_info:
self._post({'prompt': 'Test', 'max_tokens': 3, 'temperature': -0.5, 'stream': False})
assert exc_info.value.response.status_code in [400, 422]

print(' Invalid temperature values test passed')

def test_invalid_top_p_values(self):
print(f'\n[Model: {self.model_name}] Running invalid top_p values test')
with pytest.raises(requests.HTTPError) as exc_info:
self._post({'prompt': 'Test', 'max_tokens': 3, 'top_p': 1.5, 'stream': False})
assert exc_info.value.response.status_code in [400, 422]

print(' Invalid top_p values test passed')

def test_invalid_top_k_values(self):
print(f'\n[Model: {self.model_name}] Running invalid top_k values test')
with pytest.raises(requests.HTTPError) as exc_info:
self._post({'prompt': 'Test', 'max_tokens': 3, 'top_k': -5, 'stream': False})
assert exc_info.value.response.status_code in [400, 422]

print(' Invalid top_k values test passed')

def test_boundary_max_tokens(self):
print(f'\n[Model: {self.model_name}] Running boundary max_tokens test')
resp1 = self._post({'prompt': 'Min tokens', 'max_tokens': 1, 'stream': False})
assert resp1.status_code == 200
data1 = resp1.json()
assert data1['meta_info']['completion_tokens'] >= 1

resp2 = self._post({'prompt': 'Max tokens test', 'max_tokens': 2048, 'stream': False})
assert resp2.status_code == 200

with pytest.raises(requests.HTTPError) as exc:
self._post({'prompt': 'Test', 'max_tokens': -2, 'stream': False})

assert exc.value.response.status_code == 400

with pytest.raises(requests.HTTPError) as exc:
self._post({'prompt': 'Test', 'max_tokens': 0, 'stream': False})

assert exc.value.response.status_code == 400

print(' Max tokens boundary test passed')

def test_parameter_interactions(self):
print(f'\n[Model: {self.model_name}] Running parameter interactions test')
resp1 = self._post({
'prompt': 'Deterministic generation',
'max_tokens': 10,
'temperature': 0.0,
'top_p': 0.5,
'top_k': 10,
'stream': False
})
assert resp1.status_code == 200
data1 = resp1.json()

self._validate_generation_response(data1)

print(' Parameter interaction (temp=0 with top_p/k) passed')

def test_session_id_with_all_parameters(self):
print(f'\n[Model: {self.model_name}] Running session_id with all parameters test')
session_id = int(time.time()) % 100000

resp1 = self._post({
'session_id': session_id,
'prompt': 'Hello, introduce yourself briefly.',
'max_tokens': 20,
'temperature': 0.7,
'stream': False
})
assert resp1.status_code == 200
data1 = resp1.json()
self._validate_generation_response(data1)

resp2 = self._post({
'session_id': session_id,
'prompt': 'What was I just talking about?',
'max_tokens': 20,
'temperature': 0.7,
'stream': False
})
assert resp2.status_code == 200
data2 = resp2.json()
self._validate_generation_response(data2)

assert 'What' in data2['text'] or 'hello' in data2['text'].lower() or \
len(data2['text']) > 0

print(f' Session {session_id} test passed')

def test_edge_cases_stop_conditions(self):
print(f'\n[Model: {self.model_name}] Running edge cases stop conditions test')
resp1 = self._post({'prompt': 'Test with empty stop list', 'max_tokens': 10, 'stop': [], 'stream': False})
assert resp1.status_code == 200
data1 = resp1.json()
assert len(data1['text']) > 0

resp2 = self._post({
'prompt': 'Write a sentence ending with a period. Stop here test.',
'max_tokens': 50,
'stop': ['.'],
'stream': False
})
assert resp2.status_code == 200
data2 = resp2.json()

text2 = data2['text']
finish_reason = data2['meta_info']['finish_reason']['type']

if '.' in text2:
assert text2.strip().endswith('.'), \
"Stop token '.' should cause generation to end at period"

assert finish_reason in ['stop', 'eos'], \
f'Expected to end due to stop token, actual: {finish_reason}'

print(f" Stop at '.': generated '{text2}' (Reason: {finish_reason})")

def test_spaces_between_special_tokens(self, config):
print(f'[Model: {self.model_name}] Running spaces_between_special_tokens test')
model_path = os.path.join(config.get('model_path'), self.model_name)
user_content = 'Hello [world]! This is a [test].'

tokenizer = AutoTokenizer.from_pretrained(model_path)
special_tokens_map = tokenizer.special_tokens_map

special_patterns = list(special_tokens_map.values())
special_patterns = [
item for sublist in special_patterns for item in (sublist if isinstance(sublist, list) else [sublist])
]

print(' Executing skip_special_tokens=False and checking spaces between special tokens')
payload_false = {'prompt': user_content, 'max_tokens': 100, 'skip_special_tokens': False, 'stream': False}
resp_false = self._post(payload_false)
data_false = resp_false.json()
self._validate_generation_response(data=data_false, validate_tokens=True)
generated_text = data_false['text']

for i in range(len(generated_text) - 1):
if generated_text[i] in special_patterns and generated_text[i + 1] not in [' ', '\n']:
assert False, f'Expected space after special token {generated_text[i]} but found none.'

@pytest.mark.experts
@pytest.mark.pytorch
def test_request_returns_experts(self):
print(f'\n[Model: {self.model_name}] Running request with experts test')
resp1 = self._post({
'prompt': 'Deterministic generation',
'max_tokens': 50,
'temperature': 0.8,
'return_routed_experts': True
})
assert resp1.status_code == 200
data1 = resp1.json()

self._validate_generation_response(data1, validate_experts=True)

+ 7
- 35
autotest/utils/benchmark_utils.py View File

@@ -1,9 +1,9 @@
import os
import subprocess
from subprocess import PIPE, Popen
from subprocess import PIPE

import allure
import psutil
from utils.common_utils import execute_command_with_logging
from utils.config_utils import _is_bf16_supported_by_device, get_workerid
from utils.run_restful_chat import health_check

@@ -55,12 +55,12 @@ def throughput_test(config, run_id, run_config, cuda_prefix: str = None, worker_
get_max_cache_entry(model, backend), '--csv ', csv_path
])

returncode, stderr = run_testcase(cmd, benchmark_log)
result, stderr = execute_command_with_logging(cmd, benchmark_log)
allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)

if returncode == 0 and not os.path.isfile(csv_path):
if result and not os.path.isfile(csv_path):
return False, 'result is empty'
if returncode != 0:
if not result:
return False, stderr

return True, 'success'
@@ -118,7 +118,7 @@ def longtext_throughput_test(config,
if concurrency:
cmd += f' --concurrency {concurrency}'

returncode, stderr = run_testcase(cmd, benchmark_log)
returncode, stderr = execute_command_with_logging(cmd, benchmark_log)
allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)

if returncode == 0 and not os.path.isfile(csv_path):
@@ -296,7 +296,7 @@ def prefixcache_throughput_test(config,
if concurrency:
command += f' --concurrency {concurrency}'

returncode, stderr = run_testcase(command, benchmark_log)
returncode, stderr = execute_command_with_logging(command, benchmark_log)
allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)

if returncode == 0 and not os.path.isfile(csv_path):
@@ -307,34 +307,6 @@ def prefixcache_throughput_test(config,
return True, ''


def run_testcase(cmd, benchmark_log):
if os.path.isfile(benchmark_log):
write_type = 'a'
else:
write_type = 'w'
with open(benchmark_log, write_type) as f:
f.writelines('reproduce command: ' + cmd + '\n')
print('reproduce command: ' + cmd)
with Popen([cmd], stdin=PIPE, stdout=f, stderr=PIPE, shell=True, text=True, encoding='utf-8') as process:
try:
stdout, stderr = process.communicate(None)
except Exception:
kill_process(process.pid)
raise
except: # noqa: E722
kill_process(process.pid)
raise
retcode = process.poll()
return retcode, stderr


def kill_process(pid):
parent = psutil.Process(pid)
for child in parent.children(recursive=True):
child.kill()
parent.kill()


def get_command_with_extra(cmd, cuda_prefix: str = None):
if cuda_prefix is not None and len(cuda_prefix) > 0:
cmd = ' '.join([cuda_prefix, cmd])


+ 60
- 0
autotest/utils/common_utils.py View File

@@ -0,0 +1,60 @@
import os
import subprocess
import sys
from typing import Tuple

import psutil


def execute_command_with_logging(cmd, log_file_path: str) -> Tuple[bool, str]:
if os.path.isfile(log_file_path):
write_type = 'a'
else:
write_type = 'w'
try:
with open(log_file_path, write_type, encoding='utf-8') as log_file:
start_msg = f'execute command: {cmd}\n'
print(start_msg, end='')
log_file.write(start_msg)
log_file.flush()

process = subprocess.run(cmd,
shell=True,
text=True,
encoding='utf-8',
errors='replace',
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=1)

if process.stdout:
print(process.stdout, end='')
log_file.write(process.stdout)

if process.returncode == 0:
result = True
result_msg = f'success: {process.returncode}\n'
else:
result = False
result_msg = f'fail: {process.returncode}\n'

print(result_msg, end='')
log_file.write(result_msg)

return result, result_msg.strip()

except Exception as e:
error_msg = f'exec fail: {str(e)}\n'
print(error_msg, file=sys.stderr, end='')

with open(log_file_path, 'a', encoding='utf-8') as log_file:
log_file.write(error_msg)

return False, error_msg.strip()


def kill_process(pid):
parent = psutil.Process(pid)
for child in parent.children(recursive=True):
child.kill()
parent.kill()

+ 34
- 109
autotest/utils/config_utils.py View File

@@ -61,30 +61,8 @@ def get_turbomind_model_list(parallel_config: Optional[Union[int, Dict[str, int]
if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in all_models:

model_config = get_parallel_config(config, model)

if not model_config:

if not target_config or (len(target_config) == 1 and 'tp' in target_config
and target_config['tp'] == 1):
filtered_models.append(model)
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

all_models = filtered_models
@@ -137,21 +115,7 @@ def get_torch_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] =
for model in all_models:

model_config = get_parallel_config(config, model)

if not model_config:

if not target_config or (len(target_config) == 1 and 'tp' in target_config
and target_config['tp'] == 1):
filtered_models.append(model)
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

all_models = filtered_models
@@ -275,25 +239,8 @@ def get_vl_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = No
if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in vl_models:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

vl_models = filtered_models
@@ -354,37 +301,23 @@ def get_vl_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = No

def get_evaluate_turbomind_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = None,
is_longtext: bool = False,
is_mllm: bool = False,
kvint_list: list = []):

config = get_config()

if is_longtext:
case_list_base = [item for item in config.get('longtext_model', [])]
elif is_mllm:
case_list_base = config.get('mllm_evaluate_model', [])
else:
case_list_base = config.get('evaluate_model', [])

if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in case_list_base:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

case_list_base = filtered_models
@@ -474,37 +407,23 @@ def get_evaluate_turbomind_model_list(parallel_config: Optional[Union[int, Dict[

def get_evaluate_pytorch_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = None,
is_longtext: bool = False,
is_mllm: bool = False,
kvint_list: list = []):

config = get_config()

if is_longtext:
case_list_base = [item for item in config.get('longtext_model', [])]
elif is_mllm:
case_list_base = config.get('mllm_evaluate_model', [])
else:
case_list_base = config.get('evaluate_model', [])

if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in case_list_base:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

case_list_base = filtered_models
@@ -576,25 +495,9 @@ def get_benchmark_model_list(parallel_config: Optional[Union[int, Dict[str, int]
if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in case_list_base:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

case_list_base = filtered_models
@@ -801,3 +704,25 @@ def unset_device_env_variable():
else:
if 'CUDA_VISIBLE_DEVICES' in os.environ:
del os.environ['CUDA_VISIBLE_DEVICES']


def is_model_in_list(config, parallel_config, model):
model_config = get_parallel_config(config, model)

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

if not model_config:
if not target_config or (len(target_config) == 1 and 'tp' in target_config and target_config['tp'] == 1):
return True

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

return match

+ 22
- 0
autotest/utils/constant.py View File

@@ -0,0 +1,22 @@
DEFAULT_PORT = 23333
PROXY_PORT = 8000

EVAL_CONFIGS = {
'default': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
},
'gpt': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
'openai_extra_kwargs': {
'reasoning_effort': 'high',
}
}
}

+ 149
- 33
autotest/utils/evaluate_utils.py View File

@@ -1,16 +1,57 @@
import csv
import glob
import json
import os
import subprocess

import allure
import pandas as pd
from mmengine.config import Config
from utils.common_utils import execute_command_with_logging

DEFAULT_PORT = 23333


def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, communicator, work_dir=None):
def write_to_summary(model_name, tp_num, result, backend_type, communicator, metrics, work_dir=None):
status = '✅ PASS' if result else '❌ FAIL'

dataset_name = []
dataset_metrics = []
for key in sorted(metrics.keys()):
dataset_name.append(key)
dataset_metrics.append(metrics.get(key, ''))

summary_dataset_name = ' | '.join(dataset_name)
summary_dataset_metrics = ' | '.join(dataset_metrics)

summary_file = os.environ.get('GITHUB_STEP_SUMMARY', '')
md_summary_file = f'{work_dir}/summary.md'
summary_line = f'| {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' # noqa: E501

write_header = not os.path.exists(md_summary_file) or os.path.getsize(md_summary_file) == 0
with open(md_summary_file, 'a') as f:
if write_header:
dash_line = '-----|' * (len(metrics.keys()))
f.write('## Model Evaluation Results\n')
f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n')
f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n')
f.write(summary_line)
if summary_file:
write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0
with open(summary_file, 'a') as f:
if write_header:
dash_line = '-----|' * (len(metrics.keys()))
f.write('## Model Evaluation Results\n')
f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n')
f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n')
f.write(summary_line)
else:
print(
f'Summary: {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics}' # noqa: E501
)


def llm_summary(model_name, tp_num, result, backend_type, communicator, work_dir=None):
metrics = {}

if work_dir and os.path.exists(work_dir):
@@ -45,39 +86,54 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, c

except Exception as e:
print(f'Error reading metrics: {str(e)}')
write_to_summary(model_name, tp_num, result, backend_type, communicator, metrics, work_dir)

dataset_name = []
dataset_metrics = []
for key in sorted(metrics.keys()):
dataset_name.append(key)
dataset_metrics.append(metrics.get(key, ''))

summary_dataset_name = ' | '.join(dataset_name)
summary_dataset_metrics = ' | '.join(dataset_metrics)
def mllm_summary(model_name,
summary_model_name,
tp_num,
result,
backend_type,
communicator,
work_dir=None,
dataset_list=['MMBench_V11_MINI', 'MMStar_MINI', 'AI2D_MINI', 'OCRBench_MINI']):
metrics = {}
pattern = os.path.join(work_dir, model_name, 'T*')
t_dirs = [d for d in glob.glob(pattern) if os.path.isdir(d)]

if not t_dirs:
return

    # Sort by modification time (newest first)
t_dirs.sort(key=os.path.getmtime, reverse=True)
latest_dir = t_dirs[0]

for dataset in dataset_list:
if dataset == 'OCRBench_MINI':
score_file = f'{latest_dir}/{model_name}_{dataset}_score.json'
cur_score = 0
with open(score_file, 'r') as f:
total_score = json.load(f)
cur_score = total_score['Final Score Norm']
metrics[dataset] = f'{cur_score:.2f}' # noqa: E231
else:
score_file = f'{latest_dir}/{model_name}_{dataset}_acc.csv'
df = pd.read_csv(score_file)
cur_score = df['Overall'].iloc[0]
if dataset == 'MMBench_V11_MINI':
cur_score = df.loc[df['split'] == 'dev', 'Overall'].values
cur_score = cur_score * 100
metrics[dataset] = f'{cur_score.item():.2f}' # noqa: E231

summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None)
summary_line = f'| {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' # noqa: E501
if summary_file:
write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0
with open(summary_file, 'a') as f:
if write_header:
dash_line = '-----|' * (len(metrics.keys()))
f.write('## Model Evaluation Results\n')
f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n')
f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n')
f.write(summary_line)
else:
print(
f'Summary: {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics}' # noqa: E501
)
write_to_summary(summary_model_name, tp_num, result, backend_type, communicator, metrics, work_dir)


def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT, test_type='infer', **kwargs):
def eval_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT, test_type='infer', **kwargs):
work_dir = None
try:
model_name = prepare_environment['model']
backend_type = prepare_environment['backend']
communicator = prepare_environment.get('communicator', 'cuda-ipc')
communicator = prepare_environment.get('communicator', 'nccl')
quant_policy = prepare_environment.get('quant_policy', 0)

parallel_config = prepare_environment.get('parallel_config', 1)
@@ -151,8 +207,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
elif test_type == 'eval':
if not os.path.exists(temp_config_path):
error_msg = f'Temp config file {temp_config_path} not found for eval stage'
write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type,
communicator, work_dir)
                llm_summary(summary_model_name, tp_num, False, backend_type, communicator, work_dir)
return False, error_msg

cfg = Config.fromfile(temp_config_path)
@@ -187,6 +243,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
cmd = [
'opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir, '-m', test_type
]

print(f"Running command: {' '.join(cmd)}")
print(f'Work directory: {work_dir}')

@@ -249,9 +306,11 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
error_lines = ' | '.join(error_lines[:3])
final_msg += f'\nLog errors: {error_lines}'

allure.attach.file(log_file, attachment_type=allure.attachment_type.TEXT)

if test_type == 'eval':
write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type,
communicator, work_dir)
            llm_summary(summary_model_name, tp_num, final_result, backend_type, communicator, work_dir)

return final_result, final_msg

@@ -263,12 +322,69 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
timeout_msg = (f'Evaluation timed out for {model_name} '
f'after 259200 seconds')
if work_dir and test_type == 'eval':
write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, communicator,
work_dir)
            llm_summary(summary_model_name, tp_num, False, backend_type, communicator, work_dir)
return False, timeout_msg
except Exception as e:
error_msg = f'Error during evaluation for {model_name}: {str(e)}'
if work_dir and test_type == 'eval':
write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, communicator,
work_dir)
            llm_summary(summary_model_name, tp_num, False, backend_type, communicator, work_dir)
return False, error_msg


def mllm_eval_test(config,
run_id,
prepare_environment,
worker_id='gw0',
port=DEFAULT_PORT,
test_type='infer',
**kwargs):
work_dir = None
model_name = prepare_environment['model']
backend_type = prepare_environment['backend']
tp_num = prepare_environment.get('tp_num', 1)
communicator = prepare_environment.get('communicator', 'nccl')
quant_policy = prepare_environment.get('quant_policy', 0)

summary_model_name = model_name
if quant_policy in [4, 8]:
summary_model_name = f'{model_name}-kvint{quant_policy}'

model_base_path = config.get('model_path', '/nvme/qa_test_models')
model_path = os.path.join(model_base_path, model_name)

print(f'Starting VLMEvalKit evaluation for model: {model_name}')
print(f'Model path: {model_path}')
print(f'Backend: {backend_type}')

log_path = config.get('mllm_eval_log_path', '/nvme/qa_test_models/mllm_evaluation_report') + f'/{run_id}'
os.makedirs(log_path, exist_ok=True)

work_dir = os.path.join(log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{quant_policy}")
simple_model_name = model_name.split('/')[-1]
os.makedirs(work_dir, exist_ok=True)
if test_type == 'infer':
cmd = f'python run.py --data MMBench_V11_MINI MMStar_MINI AI2D_MINI OCRBench_MINI --model {simple_model_name} --base-url http://127.0.0.1:{port}/v1 --reuse --work-dir {work_dir} --api-nproc 32 --mode infer' # noqa

elif test_type == 'eval':
cmd = f'python run.py --data MMBench_V11_MINI MMStar_MINI AI2D_MINI OCRBench_MINI --model {simple_model_name} --base-url http://127.0.0.1:empty/v1 --reuse --work-dir {work_dir} --api-nproc 32 --mode eval --judge Qwen2.5-32B-Instruct --judge-base-url http://127.0.0.1:{port}/v1' # noqa

print(f'Work directory: {work_dir}')

log_filename = (f'{backend_type}_'
f"{model_name.replace('/', '_')}_"
f'{communicator}_'
f'{worker_id}_'
f'{quant_policy}.log')
log_file = os.path.join(log_path, log_filename)
result, msg = execute_command_with_logging(cmd, log_file)

if test_type == 'eval':
mllm_summary(simple_model_name,
summary_model_name,
tp_num,
result,
backend_type,
communicator,
work_dir,
dataset_list=['MMBench_V11_MINI', 'MMStar_MINI', 'AI2D_MINI', 'OCRBench_MINI'])
return result, msg

+ 39
- 0
autotest/utils/toolkit.py View File

@@ -0,0 +1,39 @@
from functools import lru_cache
from typing import List

from transformers import AutoTokenizer


def parse_sse_stream(content: str) -> list:
"""Parse SSE (Server-Sent Events) stream content into a list of events.

Each event is either a JSON string or "[DONE]".
"""
lines = content.strip().split('\n')
events = []
for line in lines:
line = line.strip()
if line.startswith('data: '):
data = line[6:] # remove "data: "
if data.strip() == '[DONE]':
events.append('[DONE]')
else:
events.append(data)
return events


@lru_cache(maxsize=4)
def _load_tokenizer_cached(model_path: str):
try:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
return tokenizer
except Exception as e:
raise RuntimeError(f"Failed to load tokenizer from '{model_path}': {e}")


def encode_text(model_path: str, text: str) -> List[int]:
tokenizer = _load_tokenizer_cached(model_path)

encoded = tokenizer.encode(text)

return encoded

+ 1
- 1
lmdeploy/serve/openai/api_server.py View File

@@ -925,7 +925,7 @@ async def generate(request: GenerateReqInput, raw_request: Request = None):
if error_check_ret is not None:
return error_check_ret
if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0:
return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.')
return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id!r} is occupied.')

prompt = request.prompt
input_ids = request.input_ids


+ 9
- 0
lmdeploy/serve/openai/serving_generate.py View File

@@ -23,6 +23,15 @@ def check_request(request: GenerateReqInput, engine_config: 'TurbomindEngineConf
if (request.prompt is not None) ^ (request.input_ids is None):
return 'You must specify exactly one of prompt or input_ids'

if request.prompt is not None and request.prompt == '':
return 'The prompt must not be an empty string'

if request.input_ids is not None and len(request.input_ids) == 0:
return 'The input_ids must not be an empty list'

if request.max_tokens is not None and request.max_tokens <= 0:
return f'The max_tokens {request.max_tokens!r} must be a positive integer.'

# check sampling settings
if not (0 < request.top_p <= 1):
return f'The top_p {request.top_p!r} must be in (0, 1].'


Loading…
Cancel
Save
Baidu
map