2 Commits

Author SHA1 Message Date
  zhulinJulia24 c45970c3d1
[ci] add mllm eval (#4194) 10 hours ago
  littlegy 9756373638
Add test for "generate" endpoint (#4181) 15 hours ago
22 changed files with 2102 additions and 260 deletions
Split View
  1. +1
    -1
      .github/workflows/api_eval.yml
  2. +1
    -1
      .github/workflows/api_eval_h800.yml
  3. +2
    -2
      .github/workflows/daily_ete_test.yml
  4. +2
    -2
      .github/workflows/daily_ete_test_3090.yml
  5. +2
    -2
      .github/workflows/daily_ete_test_5080.yml
  6. +2
    -2
      .github/workflows/daily_ete_test_h800.yml
  7. +1
    -1
      .github/workflows/evaluate_remote.yml
  8. +223
    -0
      .github/workflows/mllm_api_eval.yml
  9. +1
    -0
      autotest/config-ascend.yaml
  10. +5
    -0
      autotest/config-h.yaml
  11. +40
    -7
      autotest/config.yaml
  12. +42
    -64
      autotest/evaluate/test_api_evaluate.py
  13. +290
    -0
      autotest/evaluate/test_mllm_api_evaluate.py
  14. +1169
    -0
      autotest/interface/restful/test_restful_generate.py
  15. +7
    -35
      autotest/utils/benchmark_utils.py
  16. +60
    -0
      autotest/utils/common_utils.py
  17. +34
    -109
      autotest/utils/config_utils.py
  18. +22
    -0
      autotest/utils/constant.py
  19. +149
    -33
      autotest/utils/evaluate_utils.py
  20. +39
    -0
      autotest/utils/toolkit.py
  21. +1
    -1
      lmdeploy/serve/openai/api_server.py
  22. +9
    -0
      lmdeploy/serve/openai/serving_generate.py

+ 1
- 1
.github/workflows/api_eval.yml View File

@@ -15,7 +15,7 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:


+ 1
- 1
.github/workflows/api_eval_h800.yml View File

@@ -15,7 +15,7 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:


+ 2
- 2
.github/workflows/daily_ete_test.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 2
- 2
.github/workflows/daily_ete_test_3090.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 2
- 2
.github/workflows/daily_ete_test_5080.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 2
- 2
.github/workflows/daily_ete_test_h800.yml View File

@@ -15,12 +15,12 @@ on:
default: 'main'
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
model:
required: true
description: 'Set testcase module filter: llm, vllm. Default contains all models'
description: 'Set testcase module filter: llm, mllm. Default contains all models'
type: string
default: "['llm','mllm']"
function:


+ 1
- 1
.github/workflows/evaluate_remote.yml View File

@@ -35,7 +35,7 @@ on:
default: "['chat_models','base_models']"
backend:
required: true
description: 'Set backend testcase filter: turbomind or pytorch or turbomind, pytorch. Default is "["turbomind", "pytorch"]"'
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"



+ 223
- 0
.github/workflows/mllm_api_eval.yml View File

@@ -0,0 +1,223 @@
name: mllm_api_eval

on:
workflow_dispatch:
inputs:
repo_org:
required: false
description: 'Tested repository organization name. Default is InternLM/lmdeploy'
type: string
default: 'InternLM/lmdeploy'
repo_ref:
required: false
description: 'Set branch or tag or commit id. Default is "main"'
type: string
default: 'main'
backend:
required: true
description: 'Set backend filter. Default is "["turbomind", "pytorch"]"'
type: string
default: "['turbomind', 'pytorch']"
execution_mode:
required: false
description: 'Select execution mode: infer, eval, or both. Default is "both"'
type: choice
options:
- both
- infer
- eval
default: 'both'
run_id:
required: false
description: 'Set custom run ID. If not provided, github.run_id will be used'
type: string
default: ''


env:
HOST_PIP_CACHE_DIR: /nvme/github-actions/pip-cache
HOST_LOCALTIME: /usr/share/zoneinfo/Asia/Shanghai
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
REPORT_DIR: /nvme/qa_test_models/mllm_evaluation_report/allure_report/${{ github.run_id }}
COV_PARAM: --cov /opt/py3/lib/python3.10/site-packages/lmdeploy
FAIL_CONFIG: '--lf'
TEST_CODE_PATH: /nvme/qa_test_models/test_pkg/lmdeploy/${{ github.run_id }}
OFFLINE_CODE_PATH: /nvme/qa_test_models/offline_pkg/lmdeploy
OFFLINE_REQUIREMENTS: /nvme/qa_test_models/offline_pkg/requirements.txt
DEEPSEEK_VL: /nvme/qa_test_models/offline_pkg/DeepSeek-VL
LMUData: /nvme/qa_test_models/LMUData
LOCAL_LLM: Qwen2.5-32B-Instruct
OPENAI_API_KEY: sk-empty
HF_DATASETS_OFFLINE: 1
HF_DATASETS_CACHE: /nvme/qa_test_models/hf_datasets
HF_HUB_OFFLINE: 1
HF_EVALUATE_OFFLINE: 1

jobs:
linux-build:
if: ${{ !cancelled() }}
strategy:
matrix:
pyver: [py310]
runs-on: ubuntu-latest
env:
PYTHON_VERSION: ${{ matrix.pyver }}
PLAT_NAME: manylinux2014_x86_64
DOCKER_TAG: cuda12.8
OUTPUT_FOLDER: cuda12.8_dist_${{ github.run_id }}
steps:
- name: Free disk space
uses: jlumbroso/free-disk-space@main
with:
# This might remove tools that are actually needed, if set to "true" but frees about 6 GB
tool-cache: false
docker-images: false
# All of these default to true, but feel free to set to "false" if necessary for your workflow
android: true
dotnet: true
haskell: true
large-packages: true
swap-storage: false
- name: Checkout repository
uses: actions/checkout@v3
with:
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Build
run: |
echo ${PYTHON_VERSION}
echo ${PLAT_NAME}
echo ${DOCKER_TAG}
echo ${OUTPUT_FOLDER}
echo ${GITHUB_RUN_ID}
# remove -it
sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
- name: Upload Artifacts
uses: actions/upload-artifact@v4
with:
if-no-files-found: error
path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
retention-days: 1
name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}

download_pkgs:
needs: linux-build
if: ${{!cancelled()}}
runs-on: [self-hosted, linux-a100]
timeout-minutes: 50
container:
image: openmmlab/lmdeploy:latest-cu12.8
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
volumes:
- /nvme/qa_test_models:/nvme/qa_test_models
- /mnt/121:/mnt/121
- /mnt/104:/mnt/104
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Clone repository
uses: actions/checkout@v2
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
with:
repository: ${{ github.event.inputs.repo_org || 'InternLM/lmdeploy' }}
ref: ${{github.event.inputs.repo_ref || 'main'}}
- name: Copy repository
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r . ${{env.TEST_CODE_PATH}}
- name: Copy repository - offline
if: ${{inputs.offline_mode}}
run: rm -rf ${{env.TEST_CODE_PATH}} && mkdir ${{env.TEST_CODE_PATH}} && chmod 777 ${{env.TEST_CODE_PATH}} && cp -r ${{env.OFFLINE_CODE_PATH}}/. ${{env.TEST_CODE_PATH}}
- name: Download Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Copy Artifacts
if: ${{github.event_name == 'schedule' || !inputs.offline_mode}}
run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
- name: Copy Artifacts - offline
if: ${{inputs.offline_mode}}
run: rm ${{env.TEST_CODE_PATH}}/lmdeploy-*.whl -f && cp ${{env.OFFLINE_CODE_PATH}}/lmdeploy-*.whl ${{env.TEST_CODE_PATH}}
- name: Mark as start
run: |
chmod -R 777 ${{env.TEST_CODE_PATH}}
mkdir ${{env.REPORT_DIR}} -p
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt

test_evaluation:
needs: download_pkgs
if: ${{ !cancelled() }}
runs-on: [self-hosted, test-140]
timeout-minutes: 2400
strategy:
fail-fast: false
matrix:
backend: ${{ fromJSON(inputs.backend || '["turbomind", "pytorch"]')}}
gpu_num: ['gpu_num_1', 'gpu_num_2', 'gpu_num_4', 'gpu_num_8']
include:
- n: 8
gpu_num: gpu_num_1
- n: 4
gpu_num: gpu_num_2
- n: 2
gpu_num: gpu_num_4
- n: 1
gpu_num: gpu_num_8
container:
image: openmmlab/lmdeploy:latest-cu12.8
options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e NVIDIA_DISABLE_REQUIRE=1 --pull never"
volumes:
- /nvme/github-actions/pip-cache:/root/.cache/pip
- /nvme/github-actions/packages:/root/packages
- /nvme/github-actions/resources:/root/resources
- /nvme/qa_test_models/evaluation-reports:/root/evaluation-reports
- /nvme/qa_test_models:/nvme/qa_test_models
- /nvme/huggingface_hub:/nvme/huggingface_hub
- /mnt/121:/mnt/121
- /mnt/104:/mnt/104
- /mnt/bigdisk:/mnt/bigdisk
- /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
steps:
- name: Copy repository and Artifacts
run: |
cp -r ${{env.TEST_CODE_PATH}}/. .
mkdir ${{env.REPORT_DIR}} -p
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
- name: Install lmdeploy - dependency
run: |
python3 -m pip install -r /nvme/qa_test_models/offline_pkg/requirements.txt
- name: Install lmdeploy
run: |
python3 -m pip uninstall lmdeploy -y && python3 -m pip install lmdeploy-*.whl --no-deps
python3 -m pip install -r requirements/test.txt
- name: Install vlmeval
run: |
python3 -m pip install pandas datasets scikit-learn pylatexenc math_verify
apt update && apt install -y libgl1 libglib2.0-0
cp -r /nvme/qa_test_models/offline_pkg/VLMEvalKit .
cd VLMEvalKit && pip install .
- name: Check env
run: |
python3 -m pip list
lmdeploy check_env
mkdir ${{env.REPORT_DIR}} -p
echo "starttime=$(date +%s)" > ${{env.REPORT_DIR}}/status.txt
- name: Setup paths for evaluation
if: (matrix.backend == 'pytorch' || matrix.backend == 'turbomind')
run: |
unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
cd VLMEvalKit && cp -r ../autotest .
execution_mode="${{ github.event.inputs.execution_mode || 'both' }}"
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "infer" ]; then
pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and infer" -n ${{matrix.n}} --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
if [ "$execution_mode" = "both" ] || [ "$execution_mode" = "eval" ]; then
pytest autotest/evaluate/test_mllm_api_evaluate.py -m "${{matrix.gpu_num}} and ${{matrix.backend}} and eval" -n 4 --run_id ${{ github.event.inputs.run_id || github.run_id }} --alluredir=${{env.REPORT_DIR}} || overall_exit=$?
fi
exit $overall_exit
- name: Clear workspace
if: always()
run: |
echo "status=done" >> ${{env.REPORT_DIR}}/status.txt
export workdir=$(pwd)
rm -rf $workdir/*

+ 1
- 0
autotest/config-ascend.yaml View File

@@ -2,6 +2,7 @@ model_path: /mnt/vc-intern-delivery/qa-llm-cicd/qa_test_models
resource_path: /mnt/vc-intern-delivery/qa-llm-cicd/resource
log_path: /mnt/vc-intern-delivery/qa-llm-cicd/log
eval_log_path: /mnt/vc-intern-delivery/qa-llm-cicd/evaluation_report
mllm_eval_log_path: /mnt/vc-intern-delivery/qa-llm-cicd/mllm_evaluation_report
benchmark_path: /mnt/vc-intern-delivery/qa-llm-cicd/benchmark-reports
dataset_path: /mnt/vc-intern-delivery/qa-llm-cicd/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
prefix_dataset_path: /mnt/vc-intern-delivery/qa-llm-cicd/datasets/prefix_cache_test.json


+ 5
- 0
autotest/config-h.yaml View File

@@ -2,6 +2,7 @@ model_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/model
resource_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/resource
log_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/log
eval_log_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/evaluation_report
mllm_eval_log_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/mllm_evaluation_report
benchmark_path: /mnt/shared-storage-user/auto-eval-pipeline/qa-llm-cicd/benchmark-reports
dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
prefix_dataset_path: /mnt/shared-storage-user/auto-eval-pipeline/datasets/prefix_cache_test.json
@@ -204,3 +205,7 @@ evaluate_model:
- deepseek/DeepSeek-V3.1
- moonshotai/Kimi-K2-Instruct-0905
- Qwen/Qwen3-235B-A22B-Thinking-2507

mllm_evaluate_model:
- internlm/Intern-S1
- internlm/Intern-S1-mini

+ 40
- 7
autotest/config.yaml View File

@@ -2,6 +2,7 @@ model_path: /nvme/qa_test_models
resource_path: /nvme/qa_test_models/resource
log_path: /nvme/qa_test_models/autotest_model/log
eval_log_path: /nvme/qa_test_models/evaluation_report
mllm_eval_log_path: /nvme/qa_test_models/mllm_evaluation_report
benchmark_path: /nvme/qa_test_models/benchmark-reports
dataset_path: /nvme/qa_test_models/datasets/ShareGPT_V3_unfiltered_cleaned_split.json
prefix_dataset_path: /nvme/qa_test_models/datasets/prefix_cache_test.json
@@ -23,6 +24,8 @@ tp_config:
Qwen3-235B-A22B: 8
Qwen3-32B: 2
Qwen3-30B-A3B: 2
Qwen3-VL-32B-Instruct: 2
Qwen3-VL-30B-A3B-Instruct: 2
Qwen3-30B-A3B-Base: 2
Qwen2.5-32B-Instruct: 2
Qwen2.5-72B-Instruct: 4
@@ -38,6 +41,7 @@ tp_config:
InternVL2-Llama3-76B-AWQ: 4
gpt-oss-20b-BF16: 2
gpt-oss-120b-BF16: 4
InternVL3_5-30B-A3B: 2



@@ -56,6 +60,7 @@ turbomind_chat_model:
- internlm/internlm3-8b-instruct-awq
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -74,6 +79,9 @@ turbomind_chat_model:
- Qwen/Qwen3-32B
- Qwen/Qwen3-30B-A3B
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
@@ -119,6 +127,7 @@ pytorch_chat_model:
- internlm/internlm3-8b-instruct
- internlm/internlm2_5-7b-chat
- internlm/internlm2_5-20b-chat
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -138,6 +147,9 @@ pytorch_chat_model:
- Qwen/Qwen3-32B
- Qwen/Qwen3-30B-A3B
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
@@ -181,6 +193,7 @@ turbomind_vl_model:
- internlm/Intern-S1-mini
- OpenGVLab/InternVL2_5-26B-MPO
- OpenGVLab/Mini-InternVL-Chat-2B-V1-5
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -191,6 +204,9 @@ turbomind_vl_model:
- OpenGVLab/InternVL2-2B
- OpenGVLab/InternVL2-40B
- OpenGVLab/InternVL2-Llama3-76B-AWQ
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct
- Qwen/Qwen2.5-VL-32B-Instruct
- Qwen/Qwen2-VL-2B-Instruct
@@ -208,6 +224,7 @@ pytorch_vl_model:
- internlm/Intern-S1-mini
- OpenGVLab/InternVL2_5-26B-MPO
- OpenGVLab/Mini-InternVL-Chat-2B-V1-5
- OpenGVLab/InternVL3_5-30B-A3B
- OpenGVLab/InternVL3-2B
- OpenGVLab/InternVL3-8B
- OpenGVLab/InternVL3-38B
@@ -219,6 +236,9 @@ pytorch_vl_model:
- OpenGVLab/InternVL2-4B
- OpenGVLab/InternVL2-40B
- OpenGVLab/Mono-InternVL-2B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2-VL-2B-Instruct
- Qwen/Qwen2-VL-7B-Instruct
- Qwen/Qwen2.5-VL-7B-Instruct
@@ -283,6 +303,9 @@ turbomind_quantization:
- Qwen/Qwen3-30B-A3B
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-30B-A3B-Base
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
@@ -388,10 +411,20 @@ benchmark_model:


evaluate_model:
- google/gemma-2-9b-it
- google/gemma-2-27b-it
- meta-llama/Meta-Llama-3-1-8B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
- Qwen/Qwen1.5-MoE-A2.7B-Chat
- Qwen/Qwen3-30B-A3B
- google/gemma-2-9b-it
- google/gemma-2-27b-it
- meta-llama/Meta-Llama-3-1-8B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen2.5-32B-Instruct
- Qwen/Qwen1.5-MoE-A2.7B-Chat
- Qwen/Qwen3-30B-A3B


mllm_evaluate_model:
- internlm/Intern-S1-mini
- OpenGVLab/InternVL3-8B
- Qwen/Qwen3-VL-8B-Instruct
- Qwen/Qwen3-VL-32B-Instruct
- Qwen/Qwen3-VL-30B-A3B-Instruct
- internlm/Intern-S1
- OpenGVLab/InternVL3_5-30B-A3B

+ 42
- 64
autotest/evaluate/test_api_evaluate.py View File

@@ -2,35 +2,13 @@ import os
import time

import pytest
import utils.constant as constant
from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid
from utils.evaluate_utils import restful_test
from utils.evaluate_utils import eval_test
from utils.proxy_distributed_utils import ApiServerPerTest, proxy_worker_node_wait
from utils.ray_distributed_utils import ray_worker_node_wait
from utils.run_restful_chat import start_proxy_server, start_restful_api, stop_restful_api

DEFAULT_PORT = 23333
PROXY_PORT = 8000

EVAL_CONFIGS = {
'default': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
},
'gpt': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
'openai_extra_kwargs': {
'reasoning_effort': 'high',
}
}
}


@pytest.fixture(scope='function')
def prepare_environment(request, config, worker_id):
@@ -46,9 +24,9 @@ def prepare_environment(request, config, worker_id):
@pytest.fixture(scope='function')
def prepare_environment_judge_evaluate(request, config, worker_id):
if get_workerid(worker_id) is None:
port = PROXY_PORT
port = constant.PROXY_PORT
else:
port = PROXY_PORT + get_workerid(worker_id)
port = constant.PROXY_PORT + get_workerid(worker_id)
judge_config = {
'model': 'Qwen/Qwen2.5-32B-Instruct',
'backend': 'turbomind',
@@ -92,25 +70,25 @@ def _run_ray_distributed_test(
assert manager is not None, 'Manager instance must be provided'
if 'gpt' in model_param.get('model', '').lower():
eval_config_name = 'gpt'
preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

if manager.is_master:
model_name = model_param['model']
model_path = os.path.join(config['model_path'], model_name)
preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

# Start API Server for current model (master node starts/stops, worker nodes verify)
manager.start_lmdeploy_api_server(model_path=model_path, model_param=model_param)

try:
print(f'🧪 Master node executing {test_type} test ({eval_config_name})...')
result, msg = restful_test(config,
run_id,
model_param,
worker_id=worker_id,
port=PROXY_PORT,
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
model_param,
worker_id=worker_id,
port=constant.PROXY_PORT,
test_type=test_type,
**preset_config)
assert result, f'❌ {test_type} test failed: {msg}'
print(f'✅ {test_type} test passed')

@@ -134,7 +112,7 @@ def _run_proxy_distributed_test(config,
if 'gpt' in model_param.get('model', '').lower():
eval_config_name = 'gpt'

preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})
model_name = model_param['model']
model_path = os.path.join(config['model_path'], model_name)

@@ -146,13 +124,13 @@ def _run_proxy_distributed_test(config,
api_server.wait_until_ready()
print(f'🧪 Master node executing {test_type} test ({eval_config_name})...')

result, msg = restful_test(config,
run_id,
model_param,
worker_id=worker_id,
port=PROXY_PORT,
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
model_param,
worker_id=worker_id,
port=constant.PROXY_PORT,
test_type=test_type,
**preset_config)
assert result, f'❌ {test_type} test failed: {msg}'
print(f'✅ {test_type} test passed')

@@ -171,9 +149,9 @@ def get_turbomind_model_list(tp_num):
new_model_list = []
for model in model_list:
if 'Qwen3-235B-A22B-Thinking-2507' in model['model']:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
else:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 '
model['cuda_prefix'] = None
new_model_list.append(model)
return new_model_list
@@ -184,9 +162,9 @@ def get_pytorch_model_list(tp_num):
new_model_list = []
for model in model_list:
if 'Qwen3-235B-A22B-Thinking-2507' in model['model']:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 --max-batch-size 1024 '
else:
model['extra'] = '--session-len 65536 --cache-max-entry-count 0.9 '
model['extra'] += '--session-len 65536 --cache-max-entry-count 0.9 '
model['cuda_prefix'] = None
new_model_list.append(model)
return new_model_list
@@ -196,29 +174,29 @@ def run_test(config, run_id, prepare_environment, worker_id, test_type='infer',
"""Run test with specified evaluation configuration."""
if 'gpt' in prepare_environment.get('model', '').lower():
eval_config_name = 'gpt'
preset_config = EVAL_CONFIGS.get(eval_config_name, {})
preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

if test_type == 'infer':
port = DEFAULT_PORT
port = constant.DEFAULT_PORT
else: # eval
port = PROXY_PORT
port = constant.PROXY_PORT

if get_workerid(worker_id) is None:
result, msg = restful_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port,
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port,
test_type=test_type,
**preset_config)
else:
result, msg = restful_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port + get_workerid(worker_id),
test_type=test_type,
**preset_config)
result, msg = eval_test(config,
run_id,
prepare_environment,
worker_id=worker_id,
port=port + get_workerid(worker_id),
test_type=test_type,
**preset_config)
return result, msg




+ 290
- 0
autotest/evaluate/test_mllm_api_evaluate.py View File

@@ -0,0 +1,290 @@
import pytest
import utils.constant as constant
from utils.config_utils import get_evaluate_pytorch_model_list, get_evaluate_turbomind_model_list, get_workerid
from utils.evaluate_utils import mllm_eval_test
from utils.run_restful_chat import start_proxy_server, start_restful_api, stop_restful_api


@pytest.fixture(scope='function')
def prepare_environment(request, config, worker_id):
    """Start a restful API server for the parametrized model and tear it down after the test.

    Yields the (mutated) model parameter dict so the test can inspect model/backend settings.
    """
    model_param = request.param
    model_name = model_param['model']
    backend_name = model_param['backend']
    # Append server flags; model name on disk is the last path component of the repo id.
    extra_parts = [
        model_param.get('extra', ''),
        '--model-name',
        model_name.split('/')[-1],
        '--cache-max-entry-count 0.6',
    ]
    model_param['extra'] = ' '.join(extra_parts)  # noqa
    model_dir = config.get('model_path') + '/' + model_name
    server_pid, server_handle = start_restful_api(config, model_param, model_name, model_dir, backend_name, worker_id)
    yield model_param
    stop_restful_api(server_pid, server_handle, model_param)


@pytest.fixture(scope='function')
def prepare_environment_judge_evaluate(request, config, worker_id):
    """Bring up the proxy server plus the Qwen2.5-32B judge model for evaluation runs.

    Teardown stops the judge API server first, then the proxy, mirroring startup order.
    """
    offset = get_workerid(worker_id)
    port = constant.PROXY_PORT if offset is None else constant.PROXY_PORT + offset

    # Fixed judge-model deployment (turbomind backend, tp=2), registered at the proxy.
    judge_model = 'Qwen/Qwen2.5-32B-Instruct'
    judge_backend = 'turbomind'
    judge_param = {
        'tp_num': 2,
        'extra': ('--server-name 127.0.0.1 --proxy-url http://127.0.0.1:{} --session-len 46000 '
                  '--model-name Qwen2.5-32B-Instruct '
                  '--cache-max-entry-count 0.7 '.format(port)),
        'cuda_prefix': None,
    }
    judge_model_dir = config.get('model_path') + '/' + judge_model

    proxy_pid, proxy_process = start_proxy_server(config, worker_id)
    judge_pid, judge_start_res = start_restful_api(config, judge_param, judge_model, judge_model_dir, judge_backend,
                                                   worker_id)
    try:
        yield request.param
    finally:
        stop_restful_api(judge_pid, judge_start_res, request.param)
        stop_restful_api(proxy_pid, proxy_process, request.param)


def get_turbomind_vl_model_list(tp_num):
    """Return the turbomind multimodal evaluate-model entries for *tp_num*, with cuda_prefix cleared."""
    models = get_evaluate_turbomind_model_list(tp_num, is_mllm=True, kvint_list=[4, 8])
    for entry in models:
        entry['cuda_prefix'] = None
    return list(models)


def get_pytorch_vl_model_list(tp_num):
    """Return the pytorch multimodal evaluate-model entries for *tp_num*, with cuda_prefix cleared."""
    models = get_evaluate_pytorch_model_list(tp_num, is_mllm=True)
    for entry in models:
        entry['cuda_prefix'] = None
    return list(models)


def run_test(config, run_id, prepare_environment, worker_id, test_type='infer', eval_config_name='default'):
    """Run an mllm evaluation against the deployed model and return ``(result, msg)``.

    GPT-family models are forced onto the 'gpt' preset. Inference goes to the API
    server port, evaluation to the proxy port; both are shifted by the pytest-xdist
    worker index when one is present.
    """
    if 'gpt' in prepare_environment.get('model', '').lower():
        eval_config_name = 'gpt'
    preset_config = constant.EVAL_CONFIGS.get(eval_config_name, {})

    base_port = constant.DEFAULT_PORT if test_type == 'infer' else constant.PROXY_PORT
    offset = get_workerid(worker_id)
    target_port = base_port if offset is None else base_port + offset

    result, msg = mllm_eval_test(config,
                                 run_id,
                                 prepare_environment,
                                 worker_id=worker_id,
                                 port=target_port,
                                 test_type=test_type,
                                 **preset_config)
    return result, msg


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=1), indirect=True)
def test_turbomind_vl_eval_tp1(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=1 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=2), indirect=True)
def test_turbomind_vl_eval_tp2(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=2 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_4
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=4), indirect=True)
def test_turbomind_vl_eval_tp4(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=4 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.turbomind
@pytest.mark.gpu_num_8
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_turbomind_vl_model_list(tp_num=8), indirect=True)
def test_turbomind_vl_eval_tp8(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=8 turbomind VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=1), indirect=True)
def test_pytorch_vl_eval_tp1(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=1 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=2), indirect=True)
def test_pytorch_vl_eval_tp2(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=2 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_4
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=4), indirect=True)
def test_pytorch_vl_eval_tp4(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=4 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_8
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=8), indirect=True)
def test_pytorch_vl_eval_tp8(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=8 pytorch VL models."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.infer
@pytest.mark.pytorch
@pytest.mark.gpu_num_16
@pytest.mark.test_ascend
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment', get_pytorch_vl_model_list(tp_num=16), indirect=True)
def test_pytorch_vl_eval_tp16(config, run_id, prepare_environment, worker_id):
    """Inference run for tp=16 pytorch VL models (Ascend target)."""
    passed, reason = run_test(config, run_id, prepare_environment, worker_id)
    assert passed, reason


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.gpu_num_1
@pytest.mark.test_ascend
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=1), indirect=True)
def test_pytorch_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id):
result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
assert result, msg


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.gpu_num_2
@pytest.mark.test_ascend
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=2), indirect=True)
def test_pytorch_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id):
result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
assert result, msg


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_4
@pytest.mark.test_ascend
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=4), indirect=True)
def test_pytorch_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id):
result, msg = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
assert result, msg


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_8
@pytest.mark.test_ascend
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=8), indirect=True)
def test_pytorch_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of pytorch-backend VL models served with tp=8."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.pytorch
@pytest.mark.flaky(reruns=0)
@pytest.mark.gpu_num_16
@pytest.mark.test_ascend
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_pytorch_vl_model_list(tp_num=16), indirect=True)
def test_pytorch_judgeeval_tp16(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of pytorch-backend VL models served with tp=16."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=1), indirect=True)
def test_turbomind_judgeeval_tp1(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=1."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=2), indirect=True)
def test_turbomind_judgeeval_tp2(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=2."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_4
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=4), indirect=True)
def test_turbomind_judgeeval_tp4(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=4."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail


@pytest.mark.eval
@pytest.mark.turbomind
@pytest.mark.gpu_num_8
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment_judge_evaluate', get_turbomind_vl_model_list(tp_num=8), indirect=True)
def test_turbomind_judgeeval_tp8(config, run_id, prepare_environment_judge_evaluate, worker_id):
    """Judge-based evaluation of turbomind-backend VL models served with tp=8."""
    passed, detail = run_test(config, run_id, prepare_environment_judge_evaluate, worker_id, 'eval')
    assert passed, detail

+ 1169
- 0
autotest/interface/restful/test_restful_generate.py View File

@@ -0,0 +1,1169 @@
import json
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from typing import Any, Dict, List

import pytest
import requests
from transformers import AutoTokenizer
from utils.toolkit import encode_text, parse_sse_stream

# Endpoint configuration: the api_server under test is expected to be
# reachable locally on this host/port before the suite runs.
BASE_HTTP_URL = 'http://127.0.0.1'
DEFAULT_PORT = 23333
# Models exercised by the parametrized TestGenerateComprehensive class below.
MODEL_LIST = ['Qwen/Qwen3-0.6B', 'Qwen/Qwen3-VL-2B-Instruct', 'Qwen/Qwen3-30B-A3B']
BASE_URL = ':'.join([BASE_HTTP_URL, str(DEFAULT_PORT)])  # e.g. 'http://127.0.0.1:23333'


@pytest.mark.parametrize('model_name', MODEL_LIST)
class TestGenerateComprehensive:

@pytest.fixture(autouse=True)
def setup_api(self, request, config, model_name):
self.api_url = f'{BASE_URL}/generate'
self.headers = {'Content-Type': 'application/json'}
self.model_name = model_name

test_name = request.node.name
safe_test_name = re.sub(r'[^\w\.-]', '_', test_name)
safe_model_name = self.model_name.replace('/', '_')
log_base = config.get('log_path', './logs')
self.log_dir = os.path.join(log_base, safe_model_name)
os.makedirs(self.log_dir, exist_ok=True)
self.log_file = os.path.join(self.log_dir, f'{safe_test_name}.log')

def _log_request_response(self, payload, response_data, stream_raw=None):
log_entry = {
'timestamp': datetime.now().isoformat(),
'model': self.model_name,
'request': payload,
'response': response_data,
}
if stream_raw is not None:
log_entry['stream_raw'] = stream_raw

try:
with open(self.log_file, 'a', encoding='utf-8') as f:
json.dump(log_entry, f, indent=2, ensure_ascii=False)
f.write('\n')
except Exception as e:
print(f'[LOG WARN] Failed to write {self.log_file}: {e}')

    def _post(self, payload, stream=False):
        """POST *payload* to the /generate endpoint and return a response-like object.

        Non-streaming: returns the real ``requests.Response`` after logging.
        Streaming: drains the SSE body, re-assembles the deltas into a dict of
        the same shape as a non-streaming response, and returns a minimal mock
        exposing ``.json()`` and ``.status_code`` so callers can treat both
        paths uniformly. Raises ``requests.HTTPError`` on non-2xx status.
        """
        # Default the model field so individual tests do not have to set it.
        if 'model' not in payload:
            payload['model'] = self.model_name

        resp = requests.post(self.api_url, json=payload, headers=self.headers, stream=stream, timeout=60)
        resp.raise_for_status()

        if stream:
            # Drain the entire SSE stream first; parse afterwards.
            raw_content = ''
            for chunk in resp.iter_content(chunk_size=None):
                if chunk:
                    raw_content += chunk.decode('utf-8')

            events = parse_sse_stream(raw_content)
            accumulated_text = ''
            output_ids = []
            stream_events_count = 0

            for event in events:
                if event == '[DONE]':
                    break
                try:
                    # NOTE(review): str.replace strips every 'data: ' occurrence,
                    # not only the SSE prefix — assumes payload text never
                    # contains that substring; confirm against parse_sse_stream.
                    data_str = event.replace('data: ', '').strip()
                    if not data_str:
                        continue
                    data = json.loads(data_str)
                    delta = data.get('text', '')
                    if isinstance(delta, str):
                        accumulated_text += delta
                    ids = data.get('output_ids')
                    if isinstance(ids, list):
                        output_ids.extend(ids)
                    stream_events_count += 1
                except Exception as e:
                    # Malformed events are reported but do not abort the drain.
                    print(f'Error parsing stream event: {e}')
                    continue

            # Mirror the non-streaming response shape so validators work on both.
            fake_resp = {
                'text': accumulated_text,
                'output_ids': output_ids,
                'meta_info': {
                    'stream_events': stream_events_count
                }
            }
            self._log_request_response(payload, fake_resp, raw_content)

            class MockResp:
                # Minimal stand-in for requests.Response used by callers.

                def json(self):
                    return fake_resp

                @property
                def status_code(self):
                    return 200

            return MockResp()

        else:
            data = resp.json()
            self._log_request_response(payload, data)
            return resp

def _validate_generation_response(self,
data: Dict[str, Any],
expected_fields: List[str] = None,
validate_tokens: bool = True,
expect_logprobs: bool = False,
validate_experts: bool = False) -> None:
assert isinstance(data, dict), f'Response should be a dict, got {type(data)}'

required_fields = ['text']
for field in required_fields:
assert field in data, f'Missing required field: {field}'
assert data[field] is not None, f'Field {field} should not be None'

assert isinstance(data['text'], str), \
f"text should be string, got {type(data['text'])}"

if validate_experts:
assert 'routed_experts' in data[
'meta_info'], "Response should contain 'routed_experts' when validate_experts=True"

experts_data = data['meta_info']['routed_experts']

assert isinstance(experts_data, list)
assert len(experts_data) > 0

total_steps = len(experts_data)

for step_idx in range(total_steps):
token_experts = experts_data[step_idx]

assert isinstance(token_experts, list)
assert len(token_experts) > 0

for layer_idx in range(len(token_experts)):
layer_experts = token_experts[layer_idx]

assert isinstance(layer_experts, list)
assert len(layer_experts) == 8

for expert_idx, expert_id in enumerate(layer_experts):
assert isinstance(expert_id, int)
assert 0 <= expert_id < 256, f'Invalid expert_id: {expert_id}. Must be in [0, 256)'

if validate_tokens:
assert 'output_ids' in data, "Response should contain 'output_ids'"
output_ids = data['output_ids']

assert isinstance(output_ids, list), \
f'output_ids should be list, got {type(output_ids)}'
assert len(output_ids) >= 0, 'output_ids should not be empty'

for i, token_id in enumerate(output_ids):
assert isinstance(token_id, int), \
f'output_ids[{i}] should be int, got {type(token_id)}'

if 'meta_info' in data:
meta = data['meta_info']
assert isinstance(meta, dict), 'meta_info should be dict'

if 'completion_tokens' in meta:
assert meta['completion_tokens'] == len(output_ids), \
f"meta.completion_tokens ({meta['completion_tokens']}) " \
f'should equal len(output_ids) ({len(output_ids)})'

if expect_logprobs:
assert 'meta_info' in data, \
"Response should contain 'meta_info' when expecting logprobs"
meta = data['meta_info']
assert isinstance(meta, dict)

assert 'output_token_logprobs' in meta, \
"meta_info missing 'output_token_logprobs'"
logprobs_data = meta['output_token_logprobs']

assert isinstance(logprobs_data, list), \
'output_token_logprobs should be a list'
assert len(logprobs_data) > 0, \
'output_token_logprobs should not be empty'

if 'output_ids' in data:
assert len(logprobs_data) == len(data['output_ids']), \
f'Logprobs outer list length ({len(logprobs_data)}) != ' \
f"Output IDs length ({len(data['output_ids'])})"

for idx, item in enumerate(logprobs_data):
assert isinstance(item, list), \
f'Logprobs item at index {idx} should be a list, got {type(item)}'
assert len(item) == 2, \
f'Logprobs item at index {idx} should have 2 elements ' \
f'[logprob, token_id], got {len(item)}'

logprob_val = item[0]
assert isinstance(logprob_val, (float, int)), \
f'Logprob value at [{idx}][0] should be number, ' \
f'got {type(logprob_val)}'
assert logprob_val <= 0, \
f'Logprob value should be <= 0, got {logprob_val}'

token_id_in_logprob = item[1]
assert isinstance(token_id_in_logprob, int), \
f'Token ID in logprobs at [{idx}][1] should be int, ' \
f'got {type(token_id_in_logprob)}'

if 'output_ids' in data and idx < len(data['output_ids']):
assert token_id_in_logprob == data['output_ids'][idx], \
f'Token ID mismatch at index {idx}: output_ids has ' \
f"{data['output_ids'][idx]}, but logprobs has " \
f'{token_id_in_logprob}'

if expected_fields:
for field in expected_fields:
assert field in data, f'Missing expected field: {field}'

if 'error' in data:
assert not data['error'], f"Response contains error: {data['error']}"
if 'code' in data and data['code'] != 0:
assert False, f"Response contains error code: {data['code']}"

def test_basic_generation(self):
print(f'\n[Model: {self.model_name}] Running basic generation test')
test_cases = [{
'name': 'simple prompt',
'payload': {
'prompt': 'The sky is',
'max_tokens': 5
},
}, {
'name': 'prompt with spaces',
'payload': {
'prompt': ' Hello world ',
'max_tokens': 3
},
}, {
'name': 'unicode prompt',
'payload': {
'prompt': 'Hello, world',
'max_tokens': 3
},
}, {
'name': 'longer generation',
'payload': {
'prompt': 'Once upon a time',
'max_tokens': 10
},
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: {test_name}]')

resp = self._post(test_case['payload'])
data = resp.json()

self._validate_generation_response(data=data, validate_tokens=True)

prompt = test_case['payload']['prompt']
generated_text = data['text']
assert generated_text != prompt.strip(), \
f"Generated text should be different from prompt: '{generated_text}'"

if 'output_ids' in data:
output_ids = data['output_ids']
max_tokens = test_case['payload']['max_tokens']
max_allowed = max_tokens + 1

assert len(output_ids) <= max_allowed, \
f'Too many tokens generated: {len(output_ids)} > {max_allowed}'

meta = data.get('meta_info', {})
finish_type = meta.get('finish_reason', {}).get('type')
if len(output_ids) >= max_tokens and finish_type != 'length':
print(f'[WARN] Generated {len(output_ids)} tokens but '
f"finish_reason is not 'length': {finish_type}")

print(f" Generated text: '{generated_text[:50]}...'")
print(f" Generated tokens: {len(data.get('output_ids', []))}")

def test_input_ids_mode(self, config):
print(f'\n[Model: {self.model_name}] Running input_ids mode test')
model_path = os.path.join(config.get('model_path'), self.model_name)

test_cases = [{
'name': 'simple text',
'text': 'Hello world',
'max_tokens': 5,
'expected_min_text': 3
}, {
'name': 'question',
'text': 'What is the meaning of life?',
'max_tokens': 8,
'expected_min_text': 5
}, {
'name': 'short input',
'text': 'Yes',
'max_tokens': 3,
'expected_min_text': 1
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: input_ids - {test_name}]')

try:
input_ids = encode_text(model_path, test_case['text'])
except Exception as e:
pytest.skip(f'Tokenizer failed for {test_case["name"]}: {e}')

assert isinstance(input_ids, list), \
f'input_ids should be list, got {type(input_ids)}'
assert len(input_ids) > 0, 'input_ids should not be empty'
for i, token_id in enumerate(input_ids):
assert isinstance(token_id, int), \
f'input_ids[{i}] should be int, got {type(token_id)}'
assert token_id >= 0, \
f'input_ids[{i}] should be >= 0, got {token_id}'

resp = self._post({'input_ids': input_ids, 'max_tokens': test_case['max_tokens']})
data = resp.json()

self._validate_generation_response(data=data, validate_tokens=True)

generated_text = data['text']
try:
generated_text.encode('utf-8')
except UnicodeEncodeError:
pytest.fail(f'Generated text contains invalid UTF-8 characters: '
f'{generated_text[:100]}')

print(f' Input tokens: {len(input_ids)}')
print(f" Output tokens: {len(data.get('output_ids', []))}")
print(f" Generated text: '{generated_text[:50]}...'")

    def test_conflict_prompt_and_input_ids(self):
        """Sending both (or neither usable one of) ``prompt`` and ``input_ids``
        must be rejected with HTTP 400 and an error message that names the
        conflict between the two fields."""
        print(f'\n[Model: {self.model_name}] Running conflict test')
        test_cases = [{
            'name':
            'both provided',
            'payload': {
                'prompt': 'Hello world',
                'input_ids': [1, 2, 3, 4, 5],
                'max_tokens': 5
            },
            'expected_status':
            400,
            'expected_error_keywords': [
                'conflict', 'both', 'either', 'cannot', 'mutually exclusive', 'specify exactly one', 'prompt',
                'input_ids'
            ]
        }, {
            'name':
            'prompt with empty input_ids',
            'payload': {
                'prompt': 'Test',
                'input_ids': [],
                'max_tokens': 3
            },
            'expected_status':
            400,
            'expected_error_keywords': ['conflict', 'invalid', 'empty', 'specify exactly one', 'prompt', 'input_ids']
        }, {
            'name':
            'empty prompt with input_ids',
            'payload': {
                'prompt': '',
                'input_ids': [100, 200, 300],
                'max_tokens': 3
            },
            'expected_status':
            400,
            'expected_error_keywords': ['conflict', 'empty', 'invalid', 'specify exactly one', 'prompt', 'input_ids']
        }]

        for test_case in test_cases:
            test_name = test_case['name']
            print(f'\n[Test: conflict - {test_name}]')

            try:
                # Raw requests.post (not self._post) because raise_for_status
                # would turn the expected 4xx into an exception.
                resp = requests.post(self.api_url, json=test_case['payload'], headers=self.headers, timeout=30)

                assert resp.status_code == test_case['expected_status'], \
                    f"Expected status {test_case['expected_status']}, " \
                    f'got {resp.status_code}'

                error_data = resp.json()
                assert 'error' in error_data or 'message' in error_data, \
                    "Error response should contain 'error' or 'message' field"

                error_msg = ''
                if 'error' in error_data:
                    error_msg = str(error_data['error']).lower()
                elif 'message' in error_data:
                    error_msg = str(error_data['message']).lower()

                # Accept any single expected keyword in the (lowercased) message.
                keywords_found = any(keyword in error_msg for keyword in test_case['expected_error_keywords'])

                if not keywords_found:
                    # Fallback: message mentions both field names AND uses some
                    # exclusivity phrasing — treat that as an adequate error too.
                    has_both_fields = ('prompt' in error_msg and 'input_ids' in error_msg)
                    has_exclusivity = any(phrase in error_msg for phrase in [
                        'only one', 'specify exactly', 'cannot both', 'mutually exclusive', 'exactly one',
                        'must specify'
                    ])
                    if has_both_fields and has_exclusivity:
                        keywords_found = True

                assert keywords_found, \
                    f'Error message should indicate conflict between prompt and ' \
                    f'input_ids, got: {error_msg}'

                # A rejected request must not carry generation output.
                assert 'text' not in error_data, \
                    "Error response should not contain 'text' field"
                assert 'output_ids' not in error_data, \
                    "Error response should not contain 'output_ids' field"

                print(f' Got expected error: {error_msg[:100]}...')

            except Exception as e:
                print(f' Unexpected error: {e}')
                raise

@pytest.mark.logprob
def test_input_ids_with_logprob(self, config):
print(f'\n[Model: {self.model_name}] Running input_ids with logprob test')
model_path = os.path.join(config.get('model_path'), self.model_name)

test_cases = [{
'name': 'basic logprob',
'text': 'The weather is',
'max_tokens': 3,
'expected_min_text': 3
}, {
'name': 'single token generation',
'text': 'Hello',
'max_tokens': 1,
'expected_min_text': 1
}, {
'name': 'multiple tokens with logprob',
'text': 'Artificial intelligence is',
'max_tokens': 5,
'expected_min_text': 5
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: logprob - {test_name}]')

try:
input_ids = encode_text(model_path, test_case['text'])
except Exception as e:
pytest.skip(f'Tokenizer failed for {test_case["name"]}: {e}')

request_payload = {'input_ids': input_ids, 'max_tokens': test_case['max_tokens'], 'return_logprob': True}

resp = self._post(request_payload)
data = resp.json()

self._validate_generation_response(data=data, validate_tokens=True, expect_logprobs=True)

assert 'meta_info' in data, \
"Response should contain 'meta_info' when return_logprob=True"
meta = data['meta_info']

assert 'output_token_logprobs' in meta, \
"meta_info should contain 'output_token_logprobs'"
logprobs = meta['output_token_logprobs']

logprob_values = []

for i, item in enumerate(logprobs):
logprob_values.append(item[0])

avg_logprob = sum(logprob_values) / len(logprob_values)
if avg_logprob < -10.0:
pytest.fail(f'Generation confidence critically low '
f'(Avg: {avg_logprob:.2f})')

generated_text = data.get('text', '')
print(f' Generated tokens: {len(logprob_values)}')
print(f' Avg Logprob: {avg_logprob:.3f}')
print(f" Generated text: '{generated_text[:50]}...'")

def test_stop_str_with_include_flag(self):
print(f'\n[Model: {self.model_name}] Running stop_str with include flag test')
test_cases = [{
'name': 'simple stop word',
'prompt': 'Count: 1, 2, 3, ',
'stop_word': '6',
'max_tokens': 10,
}]

for test_case in test_cases:
test_name = test_case['name']
print(f'\n[Test: stop_str - {test_name}]')

prompt = test_case['prompt']
stop_word = test_case['stop_word']
max_tokens = test_case['max_tokens']

print(' Testing EXCLUDE mode (include_stop=False)...')
resp1 = self._post({
'prompt': prompt,
'max_tokens': max_tokens,
'stop': [stop_word],
'include_stop_str_in_output': False,
'return_logprob': True
})

self._validate_generation_response(resp1.json())
text_exclude = resp1.json()['text']
assert stop_word not in text_exclude, \
f"Stop word '{stop_word}' should NOT be in output when include_stop=False"

print(' Testing INCLUDE mode (include_stop=True)...')
resp2 = self._post({
'prompt': prompt,
'max_tokens': max_tokens,
'stop': [stop_word],
'include_stop_str_in_output': True,
'return_logprob': True
})

self._validate_generation_response(resp2.json())
text_include = resp2.json()['text']
assert stop_word in text_include, \
f"Stop word '{stop_word}' should be in output when include_stop=True"

def test_streaming_mode(self):
print(f'\n[Model: {self.model_name}] Running streaming mode test')
prompt = 'Count: 1, 2,'

resp = self._post({'prompt': prompt, 'max_tokens': 8, 'stream': True}, stream=True)
assert resp.status_code == 200
data = resp.json()

text = data['text']
output_ids = data['output_ids']
meta = data['meta_info']

assert isinstance(text, str) and len(text.strip()) > 0, \
'Generated text cannot be empty'
assert len(output_ids) >= 3, 'Output token count should be reasonable'

import re
count_matches = len(re.findall(r'\b[3-9]\b', text))
assert count_matches >= 2, \
f'Expected continuation of counting, but not enough numbers found ' \
f'(found {count_matches})'

stream_events = meta.get('stream_events', [])
assert stream_events >= len(output_ids), \
'Streaming event count should not be less than output token count'

print(f" Generated text: '{text}'")
print(f' Output tokens: {len(output_ids)}, '
f'Stream events: {stream_events}')

def test_streaming_incremental_correctness(self):
print(f'\n[Model: {self.model_name}] Running streaming incremental correctness test')
prompt = 'The sky is '

raw_resp = requests.post(self.api_url,
json={
'prompt': prompt,
'max_tokens': 10,
'stream': True
},
headers=self.headers,
stream=True,
timeout=30)
raw_resp.raise_for_status()

full_text_from_delta = ''
tokens_from_delta = []
event_count = 0

print(' Streaming chunks:')
for line in raw_resp.iter_lines():
if line:
line_str = line.decode('utf-8').strip()
if line_str.startswith('data: ') and '[DONE]' not in line_str:
try:
json_str = line_str[6:]
payload = json.loads(json_str)

delta_text = payload.get('text', '')
token_id = payload.get('token_id')

full_text_from_delta += delta_text
if token_id is not None:
tokens_from_delta.append(token_id)

event_count += 1
if delta_text.strip():
print(f" +'{delta_text}'")

except Exception as e:
print(f' [Parse warning]: {e}')
continue

assert len(full_text_from_delta.strip()) > 0, \
'Assembled text from streaming deltas is empty'
assert event_count >= 3, \
f'Too few streaming events received ({event_count}), ' \
f'connection might be interrupted'

print(f" Final assembled text: '{full_text_from_delta}'")
print(f' Total events received: {event_count}')

@pytest.mark.logprob
def test_return_logprob(self):
print(f'\n[Model: {self.model_name}] Running return_logprob test')

resp = self._post({'prompt': 'Paris is the capital of', 'max_tokens': 2, 'return_logprob': True})
data = resp.json()

self._validate_generation_response(data, validate_tokens=True, expect_logprobs=True)

print(f" Generated text: '{data['text']}'")

def test_same_session_id_allowed(self):
print(f'\n[Model: {self.model_name}] Running same session_id test')
sid = 9999

resp1 = self._post({'prompt': 'First message:', 'session_id': sid, 'max_tokens': 2})
resp2 = self._post({'prompt': 'Second message:', 'session_id': sid, 'max_tokens': 2})

assert resp1.status_code == 200
assert resp2.status_code == 200

data1 = resp1.json()
data2 = resp2.json()

self._validate_generation_response(data1)
self._validate_generation_response(data2)

text1 = data1['text'].strip()
text2 = data2['text'].strip()
assert text1 != text2

print(f" First response: '{data1['text']}'")
print(f" Second response: '{data2['text']}'")

def test_empty_prompt_rejected(self):
print(f'\n[Model: {self.model_name}] Running empty prompt test')

with pytest.raises(requests.HTTPError) as exc:
self._post({'prompt': '', 'max_tokens': 5})

assert exc.value.response.status_code == 400

try:
error_response = exc.value.response.json()
print(f' Error response: {error_response}')
assert 'error' in error_response or 'message' in error_response
except json.JSONDecodeError:
print(f' Non-JSON error: {exc.value.response.text[:100]}')

def test_input_ids_rejected(self):
print(f'\n[Model: {self.model_name}] Running input_ids invalid cases test')

invalid_cases = [{
'case': {
'input_ids': [],
'max_tokens': 5
},
'desc': 'Empty input_ids list'
}, {
'case': {
'input_ids': 'not_a_list',
'max_tokens': 5
},
'desc': 'input_ids is a string, not list'
}, {
'case': {
'max_tokens': 5
},
'desc': 'Missing input_ids field'
}]

for invalid_case in invalid_cases:
test_desc = invalid_case['desc']
payload = invalid_case['case']

with pytest.raises(requests.HTTPError) as exc_info:
self._post(payload)

response = exc_info.value.response
assert response.status_code in [400, 422], (f"Bad Request for case '{test_desc}', "
f'but got {response.status_code}')

def test_stress_concurrent_requests(self):
print(f'\n[Model: {self.model_name}] Running stress concurrent requests test')

def single_request(idx):
start_time = time.time()
try:
resp = requests.post(self.api_url,
json={
'prompt': f'Hello, task {idx}',
'max_tokens': 5,
'stream': False
},
headers=self.headers,
timeout=10)
resp.raise_for_status()
data = resp.json()

if 'text' in data and len(data['text'].strip()) > 0:
latency = time.time() - start_time
return {'success': True, 'latency': latency}
else:
return {'success': False, 'error': 'Empty response'}

except Exception as e:
return {'success': False, 'error': str(e)}

success_count = 0
total_latency = 0
failures = []

with ThreadPoolExecutor(max_workers=10) as executor:
futures = [executor.submit(single_request, i) for i in range(20)]

for i, future in enumerate(as_completed(futures)):
result = future.result()
if result['success']:
success_count += 1
total_latency += result['latency']
print(f" Req {i}: ✓ (Latency: {result['latency']:.2f}s)")
else:
failures.append(result['error'])
print(f' Req {i}: ✗')

success_rate = success_count / 20
assert success_rate == 1.0, \
f'Stress test failed: success rate {success_rate*100}% < 80%'

if success_count > 0:
avg_latency = total_latency / success_count
assert avg_latency < 5.0, \
f'Average latency too high: {avg_latency:.2f}s'
print(f' Performance: Avg Latency={avg_latency:.2f}s')

print(f' Summary: {success_count}/20 succeeded')

def test_stress_long_prompt_and_generation(self):
print(f'\n[Model: {self.model_name}] Running stress long prompt test')

long_prompt = 'Summarize: The quick brown fox jumps over the lazy dog. ' * 100

resp = self._post({'prompt': long_prompt, 'max_tokens': 512, 'temperature': 0.7})

data = resp.json()
self._validate_generation_response(data=data, validate_tokens=True)

def test_stress_streaming_under_load(self):
print(f'\n[Model: {self.model_name}] Running stress streaming under load test')

def stream_request(idx):
try:
resp = requests.post(self.api_url,
json={
'prompt': f'Stream load test {idx}',
'max_tokens': 10,
'stream': True
},
headers=self.headers,
stream=True,
timeout=30)

assert resp.status_code == 200
content_type = resp.headers.get('Content-Type', '')
assert 'text/event-stream' in content_type or \
'application/x-ndjson' in content_type

full_text = ''
event_count = 0
for line in resp.iter_lines():
if line and line.startswith(b'data:'):
event_count += 1
if b'[DONE]' in line:
break
try:
payload = json.loads(line.decode().replace('data: ', '', 1))
full_text += payload.get('text', '')
except Exception:
pass

assert len(full_text) > 0
assert event_count >= 3

return True

except Exception as e:
print(f' Stream {idx} error: {e}')
return False

with ThreadPoolExecutor(max_workers=5) as executor:
futures = [executor.submit(stream_request, i) for i in range(10)]
results = [f.result() for f in futures]

success_count = sum(results)

assert success_count == 10, \
f'Concurrent streaming test failure rate too high: {success_count}/10'

print(f' Streaming under load: {success_count}/10 succeeded')

def test_temperature_parameter(self):
print(f'\n[Model: {self.model_name}] Running temperature parameter test')
prompt = 'The capital of France is'

resp_low = self._post({'prompt': prompt, 'max_tokens': 10, 'temperature': 0.1, 'stream': False})
resp_high = self._post({'prompt': prompt, 'max_tokens': 10, 'temperature': 0.9, 'stream': False})

data_low = resp_low.json()
data_high = resp_high.json()

self._validate_generation_response(data=data_low, validate_tokens=True)
self._validate_generation_response(data=data_high, validate_tokens=True)

assert 'Paris' in data_low['text'] or \
'paris' in data_low['text'].lower(), \
"Low temperature didn't answer correct capital"
assert data_low['text'] != data_high['text'], \
'High and low temperature outputs identical, ' \
'temperature may not be effective'

def test_top_p_parameter(self):
print(f'\n[Model: {self.model_name}] Running top_p parameter test')
prompt = 'The weather today is'

resp_strict = self._post({'prompt': prompt, 'max_tokens': 20, 'top_p': 0.01, 'stream': False})
resp_loose = self._post({'prompt': prompt, 'max_tokens': 20, 'top_p': 0.99, 'stream': False})

text_strict = resp_strict.json()
text_loose = resp_loose.json()

self._validate_generation_response(data=text_strict, validate_tokens=True)
self._validate_generation_response(data=text_loose, validate_tokens=True)

def test_top_k_parameter(self):
print(f'\n[Model: {self.model_name}] Running top_k parameter test')
prompt = 'Artificial intelligence'

resp_k10 = self._post({'prompt': prompt, 'max_tokens': 10, 'top_k': 10, 'stream': False})
resp_k50 = self._post({'prompt': prompt, 'max_tokens': 10, 'top_k': 50, 'stream': False})

text_k10 = resp_k10.json()
text_k50 = resp_k50.json()

self._validate_generation_response(data=text_k10, validate_tokens=True)
self._validate_generation_response(data=text_k50, validate_tokens=True)

def test_min_p_parameter(self):
print(f'\n[Model: {self.model_name}] Running min_p parameter test')
prompt = 'Machine learning is'

resp = self._post({'prompt': prompt, 'max_tokens': 10, 'min_p': 0.05, 'stream': False})
data = resp.json()
self._validate_generation_response(data)

def test_repetition_penalty(self):
print(f'\n[Model: {self.model_name}] Running repetition penalty test')
prompt = 'Repeat repeat repeat repeat'

resp_no_penalty = self._post({'prompt': prompt, 'max_tokens': 10, 'repetition_penalty': 1.0, 'stream': False})
resp_penalty = self._post({'prompt': prompt, 'max_tokens': 10, 'repetition_penalty': 1.5, 'stream': False})

text_no_penalty = resp_no_penalty.json()['text']
text_penalty = resp_penalty.json()['text']

def count_repeats(text):
words = text.lower().split()
return sum(1 for i in range(1, len(words)) if words[i] == words[i - 1])

repeats_no_penalty = count_repeats(text_no_penalty)
repeats_penalty = count_repeats(text_penalty)

assert repeats_penalty <= repeats_no_penalty, (
f'High penalty coefficient ({1.5}) repetition count ({repeats_penalty}) '
f'not less than low penalty ({1.0}) count ({repeats_no_penalty}), '
f'repetition_penalty ineffective')

def test_ignore_eos_parameter(self):
print(f'\n[Model: {self.model_name}] Running ignore_eos parameter test')
prompt = 'The sky is blue.'

resp_normal = self._post({'prompt': prompt, 'ignore_eos': False, 'stream': False})
data_normal = resp_normal.json()
self._validate_generation_response(data_normal)

resp_ignore = self._post({'prompt': prompt, 'ignore_eos': True, 'stream': False})
data_ignore = resp_ignore.json()
self._validate_generation_response(data_ignore)

reason_ignore = data_ignore.get('meta_info', {}).get('finish_reason', {}).get('type', 'unknown')

assert reason_ignore == 'length', \
f'ignore_eos=True must end due to length, actual: {reason_ignore}'

def test_skip_special_tokens(self, config):
print(f'[Model: {self.model_name}] Running skip_special_tokens test')
model_path = os.path.join(config.get('model_path'), self.model_name)
user_content = 'Hello [world]! This is a [test].'

tokenizer = AutoTokenizer.from_pretrained(model_path)
special_tokens_map = tokenizer.special_tokens_map

special_patterns = list(special_tokens_map.values())
special_patterns = [
item for sublist in special_patterns for item in (sublist if isinstance(sublist, list) else [sublist])
]

print('Special patterns:', special_patterns)

print(' Executing skip_special_tokens=True')
payload_true = {'prompt': user_content, 'max_tokens': 100, 'skip_special_tokens': True, 'stream': False}
resp_true = self._post(payload_true)
data_true = resp_true.json()
self._validate_generation_response(data=data_true, validate_tokens=True)
generated_text = data_true['text']
assert not any(pattern in generated_text for pattern in special_patterns), \
'Expected no special pattern in the generated text but found one.'

def test_stop_token_ids(self):
print(f'\n[Model: {self.model_name}] Running stop_token_ids test')
payload = {'prompt': 'Once upon a time', 'max_tokens': 50, 'stop_token_ids': [11], 'stream': False}

resp = self._post(payload)
assert resp.status_code == 200, \
f'HTTP request failed, status code: {resp.status_code}'

try:
data = resp.json()
except Exception as e:
pytest.fail(f'Response JSON parsing failed: {e}')

self._validate_generation_response(data)

generated_text = data.get('text', '')
finish_reason = data.get('meta_info', {}).get('finish_reason', {}).get('type', 'unknown')
actual_length = len(generated_text)

assert finish_reason in ['stop', 'eos'], \
f'Expected generation to end due to stop token, ' \
f'actual reason: {finish_reason}. This may mean stop_token_ids [11] ' \
f"didn't take effect, or generation was truncated."

print(f'\n stop_token_ids=[11] generation result: length={actual_length}, '
f"end reason='{finish_reason}', text='{generated_text[:20]}...'")

def test_combined_parameters(self):
print(f'\n[Model: {self.model_name}] Running combined parameters test')
resp = self._post({
'prompt': 'The future of AI',
'max_tokens': 15,
'temperature': 0.7,
'top_p': 0.9,
'top_k': 40,
'repetition_penalty': 1.1,
'stream': False
})

assert resp.status_code == 200
data = resp.json()
self._validate_generation_response(data)

def test_streaming_with_all_parameters(self):
print(f'\n[Model: {self.model_name}] Running streaming with all parameters test')
resp = self._post(
{
'prompt': 'Streaming test with parameters',
'max_tokens': 10,
'temperature': 0.8,
'top_p': 0.85,
'top_k': 30,
'repetition_penalty': 1.2,
'stop': ['test'],
'stream': True
},
stream=True)

assert resp.status_code == 200
data = resp.json()
self._validate_generation_response(data)

stream_events = data['meta_info'].get('stream_events', [])

assert stream_events == len(data['output_ids']) + 1, \
'Streaming event count should not be less than generated token count'

def test_invalid_temperature_values(self):
print(f'\n[Model: {self.model_name}] Running invalid temperature values test')
resp1 = self._post({'prompt': 'Test', 'max_tokens': 3, 'temperature': 0.0, 'stream': False})
assert resp1.status_code == 200, 'temperature=0.0 should be valid'

with pytest.raises(requests.HTTPError) as exc_info:
self._post({'prompt': 'Test', 'max_tokens': 3, 'temperature': -0.5, 'stream': False})
assert exc_info.value.response.status_code in [400, 422]

print(' Invalid temperature values test passed')

def test_invalid_top_p_values(self):
print(f'\n[Model: {self.model_name}] Running invalid top_p values test')
with pytest.raises(requests.HTTPError) as exc_info:
self._post({'prompt': 'Test', 'max_tokens': 3, 'top_p': 1.5, 'stream': False})
assert exc_info.value.response.status_code in [400, 422]

print(' Invalid top_p values test passed')

def test_invalid_top_k_values(self):
print(f'\n[Model: {self.model_name}] Running invalid top_k values test')
with pytest.raises(requests.HTTPError) as exc_info:
self._post({'prompt': 'Test', 'max_tokens': 3, 'top_k': -5, 'stream': False})
assert exc_info.value.response.status_code in [400, 422]

print(' Invalid top_k values test passed')

def test_boundary_max_tokens(self):
print(f'\n[Model: {self.model_name}] Running boundary max_tokens test')
resp1 = self._post({'prompt': 'Min tokens', 'max_tokens': 1, 'stream': False})
assert resp1.status_code == 200
data1 = resp1.json()
assert data1['meta_info']['completion_tokens'] >= 1

resp2 = self._post({'prompt': 'Max tokens test', 'max_tokens': 2048, 'stream': False})
assert resp2.status_code == 200

with pytest.raises(requests.HTTPError) as exc:
self._post({'prompt': 'Test', 'max_tokens': -2, 'stream': False})

assert exc.value.response.status_code == 400

with pytest.raises(requests.HTTPError) as exc:
self._post({'prompt': 'Test', 'max_tokens': 0, 'stream': False})

assert exc.value.response.status_code == 400

print(' Max tokens boundary test passed')

def test_parameter_interactions(self):
print(f'\n[Model: {self.model_name}] Running parameter interactions test')
resp1 = self._post({
'prompt': 'Deterministic generation',
'max_tokens': 10,
'temperature': 0.0,
'top_p': 0.5,
'top_k': 10,
'stream': False
})
assert resp1.status_code == 200
data1 = resp1.json()

self._validate_generation_response(data1)

print(' Parameter interaction (temp=0 with top_p/k) passed')

def test_session_id_with_all_parameters(self):
print(f'\n[Model: {self.model_name}] Running session_id with all parameters test')
session_id = int(time.time()) % 100000

resp1 = self._post({
'session_id': session_id,
'prompt': 'Hello, introduce yourself briefly.',
'max_tokens': 20,
'temperature': 0.7,
'stream': False
})
assert resp1.status_code == 200
data1 = resp1.json()
self._validate_generation_response(data1)

resp2 = self._post({
'session_id': session_id,
'prompt': 'What was I just talking about?',
'max_tokens': 20,
'temperature': 0.7,
'stream': False
})
assert resp2.status_code == 200
data2 = resp2.json()
self._validate_generation_response(data2)

assert 'What' in data2['text'] or 'hello' in data2['text'].lower() or \
len(data2['text']) > 0

print(f' Session {session_id} test passed')

def test_edge_cases_stop_conditions(self):
print(f'\n[Model: {self.model_name}] Running edge cases stop conditions test')
resp1 = self._post({'prompt': 'Test with empty stop list', 'max_tokens': 10, 'stop': [], 'stream': False})
assert resp1.status_code == 200
data1 = resp1.json()
assert len(data1['text']) > 0

resp2 = self._post({
'prompt': 'Write a sentence ending with a period. Stop here test.',
'max_tokens': 50,
'stop': ['.'],
'stream': False
})
assert resp2.status_code == 200
data2 = resp2.json()

text2 = data2['text']
finish_reason = data2['meta_info']['finish_reason']['type']

if '.' in text2:
assert text2.strip().endswith('.'), \
"Stop token '.' should cause generation to end at period"

assert finish_reason in ['stop', 'eos'], \
f'Expected to end due to stop token, actual: {finish_reason}'

print(f" Stop at '.': generated '{text2}' (Reason: {finish_reason})")

def test_spaces_between_special_tokens(self, config):
print(f'[Model: {self.model_name}] Running spaces_between_special_tokens test')
model_path = os.path.join(config.get('model_path'), self.model_name)
user_content = 'Hello [world]! This is a [test].'

tokenizer = AutoTokenizer.from_pretrained(model_path)
special_tokens_map = tokenizer.special_tokens_map

special_patterns = list(special_tokens_map.values())
special_patterns = [
item for sublist in special_patterns for item in (sublist if isinstance(sublist, list) else [sublist])
]

print(' Executing skip_special_tokens=False and checking spaces between special tokens')
payload_false = {'prompt': user_content, 'max_tokens': 100, 'skip_special_tokens': False, 'stream': False}
resp_false = self._post(payload_false)
data_false = resp_false.json()
self._validate_generation_response(data=data_false, validate_tokens=True)
generated_text = data_false['text']

for i in range(len(generated_text) - 1):
if generated_text[i] in special_patterns and generated_text[i + 1] not in [' ', '\n']:
assert False, f'Expected space after special token {generated_text[i]} but found none.'

@pytest.mark.experts
@pytest.mark.pytorch
def test_request_returns_experts(self):
print(f'\n[Model: {self.model_name}] Running request with experts test')
resp1 = self._post({
'prompt': 'Deterministic generation',
'max_tokens': 50,
'temperature': 0.8,
'return_routed_experts': True
})
assert resp1.status_code == 200
data1 = resp1.json()

self._validate_generation_response(data1, validate_experts=True)

+ 7
- 35
autotest/utils/benchmark_utils.py View File

@@ -1,9 +1,9 @@
import os
import subprocess
from subprocess import PIPE, Popen
from subprocess import PIPE

import allure
import psutil
from utils.common_utils import execute_command_with_logging
from utils.config_utils import _is_bf16_supported_by_device, get_workerid
from utils.run_restful_chat import health_check

@@ -55,12 +55,12 @@ def throughput_test(config, run_id, run_config, cuda_prefix: str = None, worker_
get_max_cache_entry(model, backend), '--csv ', csv_path
])

returncode, stderr = run_testcase(cmd, benchmark_log)
result, stderr = execute_command_with_logging(cmd, benchmark_log)
allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)

if returncode == 0 and not os.path.isfile(csv_path):
if result and not os.path.isfile(csv_path):
return False, 'result is empty'
if returncode != 0:
if not result:
return False, stderr

return True, 'success'
@@ -118,7 +118,7 @@ def longtext_throughput_test(config,
if concurrency:
cmd += f' --concurrency {concurrency}'

returncode, stderr = run_testcase(cmd, benchmark_log)
returncode, stderr = execute_command_with_logging(cmd, benchmark_log)
allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)

if returncode == 0 and not os.path.isfile(csv_path):
@@ -296,7 +296,7 @@ def prefixcache_throughput_test(config,
if concurrency:
command += f' --concurrency {concurrency}'

returncode, stderr = run_testcase(command, benchmark_log)
returncode, stderr = execute_command_with_logging(command, benchmark_log)
allure.attach.file(benchmark_log, attachment_type=allure.attachment_type.TEXT)

if returncode == 0 and not os.path.isfile(csv_path):
@@ -307,34 +307,6 @@ def prefixcache_throughput_test(config,
return True, ''


def run_testcase(cmd, benchmark_log):
if os.path.isfile(benchmark_log):
write_type = 'a'
else:
write_type = 'w'
with open(benchmark_log, write_type) as f:
f.writelines('reproduce command: ' + cmd + '\n')
print('reproduce command: ' + cmd)
with Popen([cmd], stdin=PIPE, stdout=f, stderr=PIPE, shell=True, text=True, encoding='utf-8') as process:
try:
stdout, stderr = process.communicate(None)
except Exception:
kill_process(process.pid)
raise
except: # noqa: E722
kill_process(process.pid)
raise
retcode = process.poll()
return retcode, stderr


def kill_process(pid):
parent = psutil.Process(pid)
for child in parent.children(recursive=True):
child.kill()
parent.kill()


def get_command_with_extra(cmd, cuda_prefix: str = None):
if cuda_prefix is not None and len(cuda_prefix) > 0:
cmd = ' '.join([cuda_prefix, cmd])


+ 60
- 0
autotest/utils/common_utils.py View File

@@ -0,0 +1,60 @@
import os
import subprocess
import sys
from typing import Tuple

import psutil


def execute_command_with_logging(cmd, log_file_path: str) -> Tuple[bool, str]:
if os.path.isfile(log_file_path):
write_type = 'a'
else:
write_type = 'w'
try:
with open(log_file_path, write_type, encoding='utf-8') as log_file:
start_msg = f'execute command: {cmd}\n'
print(start_msg, end='')
log_file.write(start_msg)
log_file.flush()

process = subprocess.run(cmd,
shell=True,
text=True,
encoding='utf-8',
errors='replace',
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
bufsize=1)

if process.stdout:
print(process.stdout, end='')
log_file.write(process.stdout)

if process.returncode == 0:
result = True
result_msg = f'success: {process.returncode}\n'
else:
result = False
result_msg = f'fail: {process.returncode}\n'

print(result_msg, end='')
log_file.write(result_msg)

return result, result_msg.strip()

except Exception as e:
error_msg = f'exec fail: {str(e)}\n'
print(error_msg, file=sys.stderr, end='')

with open(log_file_path, 'a', encoding='utf-8') as log_file:
log_file.write(error_msg)

return False, error_msg.strip()


def kill_process(pid):
parent = psutil.Process(pid)
for child in parent.children(recursive=True):
child.kill()
parent.kill()

+ 34
- 109
autotest/utils/config_utils.py View File

@@ -61,30 +61,8 @@ def get_turbomind_model_list(parallel_config: Optional[Union[int, Dict[str, int]
if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in all_models:

model_config = get_parallel_config(config, model)

if not model_config:

if not target_config or (len(target_config) == 1 and 'tp' in target_config
and target_config['tp'] == 1):
filtered_models.append(model)
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

all_models = filtered_models
@@ -137,21 +115,7 @@ def get_torch_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] =
for model in all_models:

model_config = get_parallel_config(config, model)

if not model_config:

if not target_config or (len(target_config) == 1 and 'tp' in target_config
and target_config['tp'] == 1):
filtered_models.append(model)
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

all_models = filtered_models
@@ -275,25 +239,8 @@ def get_vl_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = No
if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in vl_models:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

vl_models = filtered_models
@@ -354,37 +301,23 @@ def get_vl_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = No

def get_evaluate_turbomind_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = None,
is_longtext: bool = False,
is_mllm: bool = False,
kvint_list: list = []):

config = get_config()

if is_longtext:
case_list_base = [item for item in config.get('longtext_model', [])]
elif is_mllm:
case_list_base = config.get('mllm_evaluate_model', [])
else:
case_list_base = config.get('evaluate_model', [])

if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in case_list_base:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

case_list_base = filtered_models
@@ -474,37 +407,23 @@ def get_evaluate_turbomind_model_list(parallel_config: Optional[Union[int, Dict[

def get_evaluate_pytorch_model_list(parallel_config: Optional[Union[int, Dict[str, int]]] = None,
is_longtext: bool = False,
is_mllm: bool = False,
kvint_list: list = []):

config = get_config()

if is_longtext:
case_list_base = [item for item in config.get('longtext_model', [])]
elif is_mllm:
case_list_base = config.get('mllm_evaluate_model', [])
else:
case_list_base = config.get('evaluate_model', [])

if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in case_list_base:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

case_list_base = filtered_models
@@ -576,25 +495,9 @@ def get_benchmark_model_list(parallel_config: Optional[Union[int, Dict[str, int]
if parallel_config is not None:
filtered_models = []

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

for model in case_list_base:
model_config = get_parallel_config(config, model)

if not model_config:
continue

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

if match:
if is_model_in_list(config, parallel_config, model):
filtered_models.append(model)

case_list_base = filtered_models
@@ -801,3 +704,25 @@ def unset_device_env_variable():
else:
if 'CUDA_VISIBLE_DEVICES' in os.environ:
del os.environ['CUDA_VISIBLE_DEVICES']


def is_model_in_list(config, parallel_config, model):
model_config = get_parallel_config(config, model)

target_config = {}
if isinstance(parallel_config, int):
target_config = {'tp': parallel_config}
elif isinstance(parallel_config, dict):
target_config = parallel_config

if not model_config:
if not target_config or (len(target_config) == 1 and 'tp' in target_config and target_config['tp'] == 1):
return True

match = True
for key, target_value in target_config.items():
if key not in model_config or model_config[key] != target_value:
match = False
break

return match

+ 22
- 0
autotest/utils/constant.py View File

@@ -0,0 +1,22 @@
DEFAULT_PORT = 23333
PROXY_PORT = 8000

EVAL_CONFIGS = {
'default': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
},
'gpt': {
'query_per_second': 4,
'max_out_len': 64000,
'max_seq_len': 65536,
'batch_size': 500,
'temperature': 0.6,
'openai_extra_kwargs': {
'reasoning_effort': 'high',
}
}
}

+ 149
- 33
autotest/utils/evaluate_utils.py View File

@@ -1,16 +1,57 @@
import csv
import glob
import json
import os
import subprocess

import allure
import pandas as pd
from mmengine.config import Config
from utils.common_utils import execute_command_with_logging

DEFAULT_PORT = 23333


def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, communicator, work_dir=None):
def write_to_summary(model_name, tp_num, result, backend_type, communicator, metrics, work_dir=None):
status = '✅ PASS' if result else '❌ FAIL'

dataset_name = []
dataset_metrics = []
for key in sorted(metrics.keys()):
dataset_name.append(key)
dataset_metrics.append(metrics.get(key, ''))

summary_dataset_name = ' | '.join(dataset_name)
summary_dataset_metrics = ' | '.join(dataset_metrics)

summary_file = os.environ.get('GITHUB_STEP_SUMMARY', '')
md_summary_file = f'{work_dir}/summary.md'
summary_line = f'| {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' # noqa: E501

write_header = not os.path.exists(md_summary_file) or os.path.getsize(md_summary_file) == 0
with open(md_summary_file, 'a') as f:
if write_header:
dash_line = '-----|' * (len(metrics.keys()))
f.write('## Model Evaluation Results\n')
f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n')
f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n')
f.write(summary_line)
if summary_file:
write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0
with open(summary_file, 'a') as f:
if write_header:
dash_line = '-----|' * (len(metrics.keys()))
f.write('## Model Evaluation Results\n')
f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n')
f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n')
f.write(summary_line)
else:
print(
f'Summary: {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics}' # noqa: E501
)


def llm_summary(model_name, tp_num, result, backend_type, communicator, work_dir=None):
metrics = {}

if work_dir and os.path.exists(work_dir):
@@ -45,39 +86,54 @@ def write_to_summary(model_name, tp_num, result, msg, worker_id, backend_type, c

except Exception as e:
print(f'Error reading metrics: {str(e)}')
write_to_summary(model_name, tp_num, result, backend_type, communicator, metrics, work_dir)

dataset_name = []
dataset_metrics = []
for key in sorted(metrics.keys()):
dataset_name.append(key)
dataset_metrics.append(metrics.get(key, ''))

summary_dataset_name = ' | '.join(dataset_name)
summary_dataset_metrics = ' | '.join(dataset_metrics)
def mllm_summary(model_name,
summary_model_name,
tp_num,
result,
backend_type,
communicator,
work_dir=None,
dataset_list=['MMBench_V11_MINI', 'MMStar_MINI', 'AI2D_MINI', 'OCRBench_MINI']):
metrics = {}
pattern = os.path.join(work_dir, model_name, 'T*')
t_dirs = [d for d in glob.glob(pattern) if os.path.isdir(d)]

if not t_dirs:
return

    # Sort by modification time (newest first)
t_dirs.sort(key=os.path.getmtime, reverse=True)
latest_dir = t_dirs[0]

for dataset in dataset_list:
if dataset == 'OCRBench_MINI':
score_file = f'{latest_dir}/{model_name}_{dataset}_score.json'
cur_score = 0
with open(score_file, 'r') as f:
total_score = json.load(f)
cur_score = total_score['Final Score Norm']
metrics[dataset] = f'{cur_score:.2f}' # noqa: E231
else:
score_file = f'{latest_dir}/{model_name}_{dataset}_acc.csv'
df = pd.read_csv(score_file)
cur_score = df['Overall'].iloc[0]
if dataset == 'MMBench_V11_MINI':
cur_score = df.loc[df['split'] == 'dev', 'Overall'].values
cur_score = cur_score * 100
metrics[dataset] = f'{cur_score.item():.2f}' # noqa: E231

summary_file = os.environ.get('GITHUB_STEP_SUMMARY', None)
summary_line = f'| {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics} |\n' # noqa: E501
if summary_file:
write_header = not os.path.exists(summary_file) or os.path.getsize(summary_file) == 0
with open(summary_file, 'a') as f:
if write_header:
dash_line = '-----|' * (len(metrics.keys()))
f.write('## Model Evaluation Results\n')
f.write(f'| Model | Backend | Communicator | TP | Status | {summary_dataset_name} |\n')
f.write(f'|-------|---------|--------------|----|--------|{dash_line}\n')
f.write(summary_line)
else:
print(
f'Summary: {model_name} | {backend_type} | {communicator} | TP{tp_num} | {status} | {summary_dataset_metrics}' # noqa: E501
)
write_to_summary(summary_model_name, tp_num, result, backend_type, communicator, metrics, work_dir)


def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT, test_type='infer', **kwargs):
def eval_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFAULT_PORT, test_type='infer', **kwargs):
work_dir = None
try:
model_name = prepare_environment['model']
backend_type = prepare_environment['backend']
communicator = prepare_environment.get('communicator', 'cuda-ipc')
communicator = prepare_environment.get('communicator', 'nccl')
quant_policy = prepare_environment.get('quant_policy', 0)

parallel_config = prepare_environment.get('parallel_config', 1)
@@ -151,8 +207,8 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
elif test_type == 'eval':
if not os.path.exists(temp_config_path):
error_msg = f'Temp config file {temp_config_path} not found for eval stage'
write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type,
communicator, work_dir)
                llm_summary(summary_model_name, tp_num, False, backend_type, communicator, work_dir)
return False, error_msg

cfg = Config.fromfile(temp_config_path)
@@ -187,6 +243,7 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
cmd = [
'opencompass', temp_config_path, '--reuse', '--max-num-workers', '16', '-w', work_dir, '-m', test_type
]

print(f"Running command: {' '.join(cmd)}")
print(f'Work directory: {work_dir}')

@@ -249,9 +306,11 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
error_lines = ' | '.join(error_lines[:3])
final_msg += f'\nLog errors: {error_lines}'

allure.attach.file(log_file, attachment_type=allure.attachment_type.TEXT)

if test_type == 'eval':
write_to_summary(summary_model_name, tp_num, final_result, final_msg, worker_id, backend_type,
communicator, work_dir)
            llm_summary(summary_model_name, tp_num, final_result, backend_type, communicator, work_dir)

return final_result, final_msg

@@ -263,12 +322,69 @@ def restful_test(config, run_id, prepare_environment, worker_id='gw0', port=DEFA
timeout_msg = (f'Evaluation timed out for {model_name} '
f'after 259200 seconds')
if work_dir and test_type == 'eval':
write_to_summary(summary_model_name, tp_num, False, timeout_msg, worker_id, backend_type, communicator,
work_dir)
            llm_summary(summary_model_name, tp_num, False, backend_type, communicator, work_dir)
return False, timeout_msg
except Exception as e:
error_msg = f'Error during evaluation for {model_name}: {str(e)}'
if work_dir and test_type == 'eval':
write_to_summary(summary_model_name, tp_num, False, error_msg, worker_id, backend_type, communicator,
work_dir)
            llm_summary(summary_model_name, tp_num, False, backend_type, communicator, work_dir)
return False, error_msg


def mllm_eval_test(config,
run_id,
prepare_environment,
worker_id='gw0',
port=DEFAULT_PORT,
test_type='infer',
**kwargs):
work_dir = None
model_name = prepare_environment['model']
backend_type = prepare_environment['backend']
tp_num = prepare_environment.get('tp_num', 1)
communicator = prepare_environment.get('communicator', 'nccl')
quant_policy = prepare_environment.get('quant_policy', 0)

summary_model_name = model_name
if quant_policy in [4, 8]:
summary_model_name = f'{model_name}-kvint{quant_policy}'

model_base_path = config.get('model_path', '/nvme/qa_test_models')
model_path = os.path.join(model_base_path, model_name)

print(f'Starting VLMEvalKit evaluation for model: {model_name}')
print(f'Model path: {model_path}')
print(f'Backend: {backend_type}')

log_path = config.get('mllm_eval_log_path', '/nvme/qa_test_models/mllm_evaluation_report') + f'/{run_id}'
os.makedirs(log_path, exist_ok=True)

work_dir = os.path.join(log_path, f"wk_{backend_type}_{model_name.replace('/', '_')}_{communicator}_{quant_policy}")
simple_model_name = model_name.split('/')[-1]
os.makedirs(work_dir, exist_ok=True)
if test_type == 'infer':
cmd = f'python run.py --data MMBench_V11_MINI MMStar_MINI AI2D_MINI OCRBench_MINI --model {simple_model_name} --base-url http://127.0.0.1:{port}/v1 --reuse --work-dir {work_dir} --api-nproc 32 --mode infer' # noqa

elif test_type == 'eval':
cmd = f'python run.py --data MMBench_V11_MINI MMStar_MINI AI2D_MINI OCRBench_MINI --model {simple_model_name} --base-url http://127.0.0.1:empty/v1 --reuse --work-dir {work_dir} --api-nproc 32 --mode eval --judge Qwen2.5-32B-Instruct --judge-base-url http://127.0.0.1:{port}/v1' # noqa

print(f'Work directory: {work_dir}')

log_filename = (f'{backend_type}_'
f"{model_name.replace('/', '_')}_"
f'{communicator}_'
f'{worker_id}_'
f'{quant_policy}.log')
log_file = os.path.join(log_path, log_filename)
result, msg = execute_command_with_logging(cmd, log_file)

if test_type == 'eval':
mllm_summary(simple_model_name,
summary_model_name,
tp_num,
result,
backend_type,
communicator,
work_dir,
dataset_list=['MMBench_V11_MINI', 'MMStar_MINI', 'AI2D_MINI', 'OCRBench_MINI'])
return result, msg

+ 39
- 0
autotest/utils/toolkit.py View File

@@ -0,0 +1,39 @@
from functools import lru_cache
from typing import List

from transformers import AutoTokenizer


def parse_sse_stream(content: str) -> list:
"""Parse SSE (Server-Sent Events) stream content into a list of events.

Each event is either a JSON string or "[DONE]".
"""
lines = content.strip().split('\n')
events = []
for line in lines:
line = line.strip()
if line.startswith('data: '):
data = line[6:] # remove "data: "
if data.strip() == '[DONE]':
events.append('[DONE]')
else:
events.append(data)
return events


@lru_cache(maxsize=4)
def _load_tokenizer_cached(model_path: str):
try:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
return tokenizer
except Exception as e:
raise RuntimeError(f"Failed to load tokenizer from '{model_path}': {e}")


def encode_text(model_path: str, text: str) -> List[int]:
tokenizer = _load_tokenizer_cached(model_path)

encoded = tokenizer.encode(text)

return encoded

+ 1
- 1
lmdeploy/serve/openai/api_server.py View File

@@ -925,7 +925,7 @@ async def generate(request: GenerateReqInput, raw_request: Request = None):
if error_check_ret is not None:
return error_check_ret
if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0:
return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.')
return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id!r} is occupied.')

prompt = request.prompt
input_ids = request.input_ids


+ 9
- 0
lmdeploy/serve/openai/serving_generate.py View File

@@ -23,6 +23,15 @@ def check_request(request: GenerateReqInput, engine_config: 'TurbomindEngineConf
if (request.prompt is not None) ^ (request.input_ids is None):
return 'You must specify exactly one of prompt or input_ids'

if request.prompt is not None and request.prompt == '':
return 'The prompt must not be an empty string'

if request.input_ids is not None and len(request.input_ids) == 0:
return 'The input_ids must not be an empty list'

if request.max_tokens is not None and request.max_tokens <= 0:
return f'The max_tokens {request.max_tokens!r} must be a positive integer.'

# check sampling settings
if not (0 < request.top_p <= 1):
return f'The top_p {request.top_p!r} must be in (0, 1].'


Loading…
Cancel
Save
Baidu
map