diff --git a/models/cv/object_detection/rtdetr/ixrt/README.md b/models/cv/object_detection/rtdetr/ixrt/README.md index fe0ad7e3c0a285e79f05883d3ce66f59b3d0c7a5..a63bf714613e58e0c77f88996a45e733366aa5a3 100644 --- a/models/cv/object_detection/rtdetr/ixrt/README.md +++ b/models/cv/object_detection/rtdetr/ixrt/README.md @@ -86,4 +86,4 @@ bash scripts/infer_rtdetr_fp16_performance.sh | Model | BatchSize | Precision | FPS | IOU@0.5 | IOU@0.5:0.95 | |:------:|:---------:|:---------:|:-----:|:-------:|:------------:| -| RT-DETR| 32 | FP16 | 71.4 | 0.729 | 0.543 | +| RT-DETR| 32 | FP16 | 326.427 | 0.656 | 0.480 | diff --git a/models/multimodal/vision_language_model/aria/vllm/README.md b/models/multimodal/vision_language_model/aria/vllm/README.md index be571e813adba4fd217c57da98483a0e75fb9baf..cb9a736c33ddf7c3767570a85057a5f1f20a8c0f 100644 --- a/models/multimodal/vision_language_model/aria/vllm/README.md +++ b/models/multimodal/vision_language_model/aria/vllm/README.md @@ -32,16 +32,6 @@ mkdir data In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. -```bash -# Install libGL -## CentOS -yum install -y mesa-libGL -## Ubuntu -apt install -y libgl1-mesa-glx - -pip install transformers==4.53.0 -``` - ## Model Inference ```bash diff --git a/models/multimodal/vision_language_model/aria/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/aria/vllm/ci/prepare.sh index 1c1b1cee4cdf51831d24ebf395c41d27e92ee6ab..1ce243cbc5197ba4f8526707e50605e75b46e691 100644 --- a/models/multimodal/vision_language_model/aria/vllm/ci/prepare.sh +++ b/models/multimodal/vision_language_model/aria/vllm/ci/prepare.sh @@ -15,14 +15,5 @@ # limitations under the License. set -x -ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') -if [[ ${ID} == "ubuntu" ]]; then - apt install -y libgl1-mesa-glx -elif [[ ${ID} == "centos" ]]; then - yum install -y mesa-libGL -else - echo "Not Support Os" -fi -cp -r ../../vllm_public_assets/ ./ -pip install transformers==4.53.0 \ No newline at end of file +cp -r ../../vllm_public_assets/ ./ \ No newline at end of file diff --git a/models/multimodal/vision_language_model/aria/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/aria/vllm/offline_inference_vision_language.py index 445c1b2152edaba0e181aab59392cf4067ac41bd..27c8d307f5fa5fb909b74fff366b18b00439d1ac 100644 --- a/models/multimodal/vision_language_model/aria/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/aria/vllm/offline_inference_vision_language.py @@ -23,7 +23,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses @@ -73,8 +82,6 @@ def get_multi_modal_input(args): msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--num-prompts', @@ -92,14 +99,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/chameleon_7b/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/chameleon_7b/vllm/offline_inference_vision_language.py index 92aa07b658a67c3b12726682f6f955f6dac273e4..ad72fd00ae03e37ae7f3a3b77ccc3db7452f2247 100755 --- a/models/multimodal/vision_language_model/chameleon_7b/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/chameleon_7b/vllm/offline_inference_vision_language.py @@ -21,7 +21,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -69,8 +78,6 @@ def get_multi_modal_input(args): msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) - - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--num-prompts', type=int, default=1, help='Number of prompts to run.') parser.add_argument('--modality', type=str, default="image", help='Modality of the input.') parser.add_argument('--num-frames', type=int, default=16, help='Number of frames to extract from the video.') parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/e5-v/vllm/README.md b/models/multimodal/vision_language_model/e5-v/vllm/README.md index 53fceb2078c438f2bfaf42591cfb2efee2653c60..3f5004ae78f9e23be624a62bf2d82a3f399551fc 100644 --- a/models/multimodal/vision_language_model/e5-v/vllm/README.md +++ b/models/multimodal/vision_language_model/e5-v/vllm/README.md @@ -31,7 +31,7 @@ In order to run the model smoothly, you need to get the sdk from [resource cente ## Model Inference ```bash -python3 offline_inference_vision_language_embedding.py --model /path/to/e5-v/ --modality "image" --tensor_parallel_size 1 --task "embed" --trust_remote_code --max_model_len 4096 +python3 offline_inference_vision_language_embedding.py --model-name e5_v ``` ## Model Results diff --git a/models/multimodal/vision_language_model/e5-v/vllm/offline_inference_vision_language_embedding.py b/models/multimodal/vision_language_model/e5-v/vllm/offline_inference_vision_language_embedding.py index db738a33a6789e8e57d0c9f5ab3f9c034f42f6a4..5a78e30f0d47dade6c258a6761417d71dfcacbaa 100644 --- a/models/multimodal/vision_language_model/e5-v/vllm/offline_inference_vision_language_embedding.py +++ b/models/multimodal/vision_language_model/e5-v/vllm/offline_inference_vision_language_embedding.py @@ -1,38 +1,29 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with -the correct prompt format on vision language models for multimodal embedding. +the correct prompt format on vision language models for multimodal pooling. For most models, the prompt format should follow corresponding examples on HuggingFace model repository.
""" -from argparse import Namespace -import time -from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args import io import base64 -from PIL import Image -from vllm import LLM -from vllm.multimodal.utils import fetch_image -from vllm.utils import FlexibleArgumentParser +from argparse import Namespace +from dataclasses import asdict +from pathlib import Path +from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args + +from PIL.Image import Image, open as ImageOpen + from vllm import LLM, EngineArgs -import dataclasses +from vllm.entrypoints.score_utils import ScoreMultiModalParam +from vllm.multimodal.utils import fetch_image +from vllm.utils.argparse_utils import FlexibleArgumentParser + +ROOT_DIR = Path(__file__).parent.parent.parent +EXAMPLES_DIR = ROOT_DIR / "examples" + class TextQuery(TypedDict): modality: Literal["text"] @@ -41,58 +32,65 @@ class TextQuery(TypedDict): class ImageQuery(TypedDict): modality: Literal["image"] - image: Image.Image + image: Image class TextImageQuery(TypedDict): modality: Literal["text+image"] text: str - image: Image.Image + image: Image -QueryModality = Literal["text", "image", "text+image"] -Query = Union[TextQuery, ImageQuery, TextImageQuery] +class TextImagesQuery(TypedDict): + modality: Literal["text+images"] + text: str + image: ScoreMultiModalParam -class ModelRequestData(NamedTuple): - llm: LLM - prompt: str - image: Optional[Image.Image] +QueryModality = Literal["text", "image", "text+image", "text+images"] +Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery -def run_e5_v(query: Query, engine_params): - llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompt: str | None = None + image: Image | None = None + query: str | None = None + documents: ScoreMultiModalParam | None = None + +def run_e5_v(query: Query) -> ModelRequestData: + llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501 if query["modality"] == "text": text = query["text"] - prompt = llama3_template.format( - f"{text}\nSummary above sentence in one word: ") + prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ") image = None elif query["modality"] == "image": - prompt = llama3_template.format( - "\nSummary above image in one word: ") + prompt = llama3_template.format("\nSummary above image in one word: ") image = query["image"] else: - modality = query['modality'] + modality = query["modality"] raise ValueError(f"Unsupported query modality: '{modality}'") - llm = LLM(**engine_params) + engine_args = EngineArgs( + model="royokong/e5-v", + runner="pooling", + max_model_len=4096, + limit_mm_per_prompt={"image": 1}, + ) return ModelRequestData( - llm=llm, + engine_args=engine_args, prompt=prompt, image=image, ) - - def get_query(modality: QueryModality): if modality == "text": return TextQuery(modality="text", text="A dog sitting in the grass") - if modality == "image": - image: Image = Image.open("vllm_public_assets/American_Eskimo_Dog.jpg") + image: Image = ImageOpen("vllm_public_assets/American_Eskimo_Dog.jpg") image = image.convert("RGB") image_data = io.BytesIO() image.save(image_data, format='JPEG') @@ -103,60 +101,90 @@ def get_query(modality: QueryModality): ), ) - if modality == "text+image": - image: Image = 
Image.open("vllm_public_assets/Felis_catus-cat_on_snow.jpg") - image = image.convert("RGB") - image_data = io.BytesIO() - image.save(image_data, format='JPEG') - image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") - return TextImageQuery( - modality="text+image", - text="A cat standing in the snow.", - image= fetch_image(f"data:image/jpeg;base64,{image_base64}" - ), - ) - msg = f"Modality {modality} is not supported." raise ValueError(msg) -def run_encode(engine_params, modality: QueryModality): +def run_encode(model: str, modality: QueryModality, seed: int | None): query = get_query(modality) - req_data = run_e5_v(query, engine_params) + req_data = model_example_map[model](query) + + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {} + ) + + engine_args = asdict(req_data.engine_args) | {"seed": seed} + llm = LLM(**engine_args) mm_data = {} if req_data.image is not None: mm_data["image"] = req_data.image - start_time = time.perf_counter() - outputs = req_data.llm.embed({ - "prompt": req_data.prompt, - "multi_modal_data": mm_data, - }) - end_time = time.perf_counter() - duration_time = end_time - start_time - num_tokens = 0 + outputs = llm.embed( + { + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + } + ) + + print("-" * 50) for output in outputs: - num_tokens += len(output.outputs.embedding) print(output.outputs.embedding) + print("-" * 50) if output.outputs.embedding is not None: print("Offline inference is successful!") - num_requests = 1 # 请求的数量 - qps = num_requests / duration_time - print(f"requests: {num_requests}, QPS: {qps}, tokens: {num_tokens}, Token/s: {num_tokens/duration_time}") -if __name__ == "__main__": +model_example_map = { + "e5_v": run_e5_v +} + + +def parse_args(): parser = FlexibleArgumentParser( - description='Demo on using vLLM for offline inference with ' - 'vision language models for multimodal embedding') - parser.add_argument('--modality', - type=str, - default="image", - choices=get_args(QueryModality), - help='Modality of the input.') - parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - - run_encode(engine_params, args.modality) + description="Demo on using vLLM for offline inference with " + "vision language models for multimodal pooling tasks." 
+ ) + parser.add_argument( + "--model-name", + "-m", + type=str, + default="e5_v", + choices=model_example_map.keys(), + help="The name of the embedding model.", + ) + parser.add_argument( + "--task", + "-t", + type=str, + default="embedding", + choices=["embedding", "scoring"], + help="The task type.", + ) + parser.add_argument( + "--modality", + type=str, + default="image", + choices=get_args(QueryModality), + help="Modality of the input.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) + return parser.parse_args() + + +def main(args: Namespace): + if args.task == "embedding": + run_encode(args.model_name, args.modality, args.seed) + else: + raise ValueError(f"Unsupported task: {args.task}") + + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/fuyu_8b/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/fuyu_8b/vllm/offline_inference_vision_language.py index 067ebfc2da63db41e07bfd42ad1ebeb286a60b6a..192cebe6b677fe9e9e263b6cf04a8421b9d40fdc 100755 --- a/models/multimodal/vision_language_model/fuyu_8b/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/fuyu_8b/vllm/offline_inference_vision_language.py @@ -21,7 +21,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -86,14 +95,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py index 70fef0c737dc300224309482aeb8ca4e8fd39d75..b3820087e98ccee35c0f2591a2e9b576bf4994a2 100644 --- a/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py @@ -24,7 +24,16 @@
""" import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -92,14 +101,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/h2vol/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/h2vol/vllm/offline_inference_vision_language.py index 721d4f629590eac0e99721c78ee397303af287e5..bca581a48e87db067920e62817a33a137068bf94 100644 --- a/models/multimodal/vision_language_model/h2vol/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/h2vol/vllm/offline_inference_vision_language.py @@ -23,7 +23,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -100,14 +109,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/idefics3/vllm/README.md b/models/multimodal/vision_language_model/idefics3/vllm/README.md index 99d807b8f3f154cca55ac0f73ea1a0d09ea4d05e..871f3bee00b3aba38934aa53d7ed3c74b687cd75 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/README.md +++ b/models/multimodal/vision_language_model/idefics3/vllm/README.md @@ -31,16 +31,6 @@ mkdir HuggingFaceM4 In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. 
-```bash -# Install libGL -## CentOS -yum install -y mesa-libGL -## Ubuntu -apt install -y libgl1-mesa-glx - -pip install transformers==4.50.3 -``` - ## Model Inference ```bash diff --git a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh index 26f7a3ffa914a58ae2cb1905e4140bf4779e8911..1ce243cbc5197ba4f8526707e50605e75b46e691 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh +++ b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh @@ -15,14 +15,5 @@ # limitations under the License. set -x -ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') -if [[ ${ID} == "ubuntu" ]]; then - apt install -y libgl1-mesa-glx -elif [[ ${ID} == "centos" ]]; then - yum install -y mesa-libGL -else - echo "Not Support Os" -fi -cp -r ../../vllm_public_assets/ ./ -pip install transformers==4.50.3 \ No newline at end of file +cp -r ../../vllm_public_assets/ ./ \ No newline at end of file diff --git a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py index 502f21d1eeecf6a2e307c483d5d3a1dbd01a824d..59c13c0a644d2d602a508057a590f74f965dfff8 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py @@ -18,7 +18,7 @@ from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset from vllm.lora.request import LoRARequest -from vllm.utils import FlexibleArgumentParser +from vllm.utils.argparse_utils import FlexibleArgumentParser import time class ModelRequestData(NamedTuple): diff --git a/models/multimodal/vision_language_model/intern_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/intern_vl/vllm/offline_inference_vision_language.py index 0db5c982d48c9b5552fb9607569beb1313cc911a..7e3d8a1b0a76a75b71ca915152cf6d6e6b222cb9 100644 --- a/models/multimodal/vision_language_model/intern_vl/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/intern_vl/vllm/offline_inference_vision_language.py @@ -22,7 +22,16 @@ on HuggingFace model repository. 
""" import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -102,14 +111,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/llama-3.2/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/llama-3.2/vllm/offline_inference_vision_language.py index 49aab699937ebef4641713e8db51a08fd7a900c9..b2164b077992c92a3066b91095eb5cb258a05837 100644 --- a/models/multimodal/vision_language_model/llama-3.2/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/llama-3.2/vllm/offline_inference_vision_language.py @@ -23,20 +23,23 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses import inspect from vllm.assets.image import ImageAsset -from vllm.assets.video import VideoAsset - - from vllm import LLM, EngineArgs, SamplingParams -import sys -from pathlib import Path -import os from utils import sampling_add_cli_args # LLama 3.2 @@ -83,14 +86,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/llava/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/llava/vllm/offline_inference_vision_language.py index 2088f6fcca792f5e4ebd0778766ef623c4c21345..a954e4b30d1f9d02c01d1b2a13b53d837c2ae706 100644 --- a/models/multimodal/vision_language_model/llava/vllm/offline_inference_vision_language.py +++ 
b/models/multimodal/vision_language_model/llava/vllm/offline_inference_vision_language.py @@ -22,7 +22,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -30,12 +39,7 @@ import dataclasses import inspect from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset - - from vllm import LLM, EngineArgs, SamplingParams -import sys -from pathlib import Path -import os from utils import sampling_add_cli_args # LLaVA-1.5 @@ -139,14 +143,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/llava_next_video_7b/vllm/offline_inference_vision_language.py index b0c6fa2d8a7a71f0e274a652e2a66a57eacaef0c..2252d5d5628a883b5113aa3cebe756e930c4e794 100755 --- a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/llava_next_video_7b/vllm/offline_inference_vision_language.py @@ -21,7 +21,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -29,12 +38,7 @@ import dataclasses import inspect from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset - - from vllm import LLM, EngineArgs, SamplingParams -import sys -from pathlib import Path -import os from utils import sampling_add_cli_args # LLaVA-1.5 @@ -101,8 +105,7 @@ def get_multi_modal_input(args): if args.modality == "video": # Input video and question - video = VideoAsset(name="sample_demo_1.mp4", - num_frames=args.num_frames).np_ndarrays + video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays vid_question = "Why is this video funny?" 
return { @@ -138,14 +141,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 b/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 deleted file mode 100644 index 17cc120e9fd151e2938cd46c0ba6fd83947bf961..0000000000000000000000000000000000000000 Binary files a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 and /dev/null differ diff --git a/models/multimodal/vision_language_model/minicpm-o-2/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm-o-2/vllm/offline_inference_vision_language.py index 494eccef4daffff8246243710815900b0d424a02..7b347305d4f763e98e7cdfe5d26b0151c42b2ce7 100644 --- a/models/multimodal/vision_language_model/minicpm-o-2/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/minicpm-o-2/vllm/offline_inference_vision_language.py @@ -22,7 +22,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -105,14 +114,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/paligemma/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/paligemma/vllm/offline_inference_vision_language.py index 0f004b719ab2f176e32fe2a2288a0210ae45ff06..d6eed1c1ff7c58dd6e44018924d7a1430eee74b1 100644 --- a/models/multimodal/vision_language_model/paligemma/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/paligemma/vllm/offline_inference_vision_language.py @@ -23,7 +23,16 @@ on HuggingFace model repository. 
""" import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -90,14 +99,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/phi3_v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/phi3_v/vllm/offline_inference_vision_language.py index d9c3095619f2ff7773888ce6517dc335d173f713..23b6a14e1dd002d0a01da10e367b218c23ecf28a 100644 --- a/models/multimodal/vision_language_model/phi3_v/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/phi3_v/vllm/offline_inference_vision_language.py @@ -23,7 +23,16 @@ on HuggingFace model repository. """ import sys from pathlib import Path -import os +import argparse as _argparse +# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== +_original_add_argument = _argparse._ArgumentGroup.add_argument + +def _patched_add_argument(self, *args, **kwargs): + kwargs.pop('deprecated', None) + return _original_add_argument(self, *args, **kwargs) + +_argparse._ArgumentGroup.add_argument = _patched_add_argument +# ========================================================= import time sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -93,14 +102,14 @@ if __name__ == "__main__": parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_args = EngineArgs.from_cli_args(args) + engine_params = dataclasses.asdict(engine_args) sampling_args = [ param.name for param in list( inspect.signature(SamplingParams).parameters.values() ) ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen_vl/vllm/README.md index ae9e0d78ba2c807dc08829efd191c469cdac12f9..680601a1adfa394abe55470af00d6ae82f815f2c 100644 --- a/models/multimodal/vision_language_model/qwen_vl/vllm/README.md +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/README.md @@ -18,6 +18,7 @@ Qwen-VL (Qwen Large Vision Language Model) is the visual multimodal version of t ```bash cp -r ../../vllm_public_assets/ ./ +# download model and make sure model path is ./qwen_vl ``` ### Install Dependencies @@ -32,7 +33,7 @@ pip install matplotlib ```bash export VLLM_ASSETS_CACHE=../vllm/ -python3 
offline_inference_vision_language.py --model /path/to/Qwen-VL-Chat -tp 1 --trust-remote-code --temperature 0.0 --hf-overrides '{"architectures": ["QwenVLForConditionalGeneration"]}' +python3 offline_inference_vision_language.py --model-type qwen_vl ``` ## Model Results diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py index b43c6bad9242f36cd6779401a4105dafa0af5cd1..838889bfc30157947242ced3d547836891d88c54 100644 --- a/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py @@ -1,20 +1,5 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ This example shows how to use vLLM for running offline inference with the correct prompt format on vision language models. @@ -22,132 +7,318 @@ with the correct prompt format on vision language models. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
""" -import sys -from pathlib import Path -import argparse as _argparse -# ====== PATCH: 兼容旧版 argparse 不支持 'deprecated' ====== -_original_add_argument = _argparse._ArgumentGroup.add_argument - -def _patched_add_argument(self, *args, **kwargs): - kwargs.pop('deprecated', None) - return _original_add_argument(self, *args, **kwargs) - -_argparse._ArgumentGroup.add_argument = _patched_add_argument -# ========================================================= + import time -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect +import random +from typing import NamedTuple +from contextlib import contextmanager +from dataclasses import asdict from vllm.assets.image import ImageAsset -from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode +from vllm.lora.request import LoRARequest from vllm import LLM, EngineArgs, SamplingParams -from utils import sampling_add_cli_args +from vllm.utils.argparse_utils import FlexibleArgumentParser +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: list[int] | None = None + lora_requests: list[LoRARequest] | None = None + sampling_params: list[SamplingParams] | None = None -# Qwen -def run_qwen_vl(question,engine_params,modality): +# Qwen-VL +def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - llm = LLM(**engine_params) - prompt = f"{question}Picture 1: \n" - stop_token_ids = None - return llm, prompt, stop_token_ids + engine_args = EngineArgs( + model="./qwen_vl", + trust_remote_code=True, + max_model_len=1024, + max_num_seqs=2, + hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, + limit_mm_per_prompt={modality: 1}, + ) + + prompts = [f"{question}Picture 1: \n" for question in questions] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) + +model_example_map = { + "qwen_vl": run_qwen_vl, +} def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") - img_question = "What is the content of this image?" + image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] return { "data": image, - "question": img_question, - } - - if args.modality == "video": - # Input video and question - video = VideoAsset(name="sample_demo_1.mp4", - num_frames=args.num_frames).np_ndarrays - vid_question = "Why is this video funny?" - - return { - "data": video, - "question": vid_question, + "questions": img_questions, } msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = EngineArgs.from_cli_args(args) - engine_params = dataclasses.asdict(engine_args) - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() +def apply_image_repeat( + image_repeat_prob, num_prompts, data, prompts: list[str], modality +): + """Repeats images with provided probability of "image_repeat_prob". + Used to simulate hit/miss for the MM preprocessor cache. + """ + assert image_repeat_prob <= 1.0 and image_repeat_prob >= 0 + no_yes = [0, 1] + probs = [1.0 - image_repeat_prob, image_repeat_prob] + + inputs = [] + inputs_with_empty_media = [] + cur_image = data + for i in range(num_prompts): + if image_repeat_prob is not None: + res = random.choices(no_yes, probs)[0] + if res == 0: + # No repeat => Modify one pixel + cur_image = cur_image.copy() + new_val = (i // 256 // 256, i // 256, i % 256) + cur_image.putpixel((0, 0), new_val) + + uuid = "uuid_{}".format(i) + + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: cur_image}, + "multi_modal_uuids": {modality: uuid}, + } ) - ] - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } - + + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) + + return inputs, inputs_with_empty_media + +def parse_args(): + parser = FlexibleArgumentParser( + description="Demo on using vLLM for offline inference with " + "vision language models for text generation" + ) + parser.add_argument( + "--model-type", + "-m", + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".', + ) + parser.add_argument( + "--num-prompts", type=int, default=4, help="Number of prompts to run." 
+ ) + parser.add_argument( + "--modality", + type=str, + default="image", + choices=["image", "video"], + help="Modality of the input.", + ) + parser.add_argument( + "--num-frames", + type=int, + default=16, + help="Number of frames to extract from the video.", + ) + parser.add_argument( + "--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.", + ) + + parser.add_argument( + "--image-repeat-prob", + type=float, + default=None, + help="Simulates the hit-ratio for multi-modal preprocessor cache (if enabled)", + ) + + parser.add_argument( + "--disable-mm-processor-cache", + action="store_true", + help="If True, disables caching of multi-modal processor.", + ) + + parser.add_argument( + "--time-generate", + action="store_true", + help="If True, then print the total generate() call time", + ) + + parser.add_argument( + "--use-different-prompt-per-request", + action="store_true", + help="If True, then use different prompt (with the same multi-modal " + "data) for each request.", + ) + + parser.add_argument( + "--verify-mm-cache-hit-with-uuids", + action="store_true", + help="If True, will send all requests in a second batch with empty mm " + "data to verify cache hits with UUIDs.", + ) + return parser.parse_args() + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] - question = mm_input["question"] + questions = mm_input["questions"] + + req_data = model_example_map[model](questions, modality) + + # Disable other modalities to save memory + default_limits = {"image": 0, "video": 0, "audio": 0} + req_data.engine_args.limit_mm_per_prompt = default_limits | dict( + req_data.engine_args.limit_mm_per_prompt or {} + ) + + engine_args = asdict(req_data.engine_args) | { + "seed": args.seed, + "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, + } + llm = LLM(**engine_args) - llm, prompt, stop_token_ids = run_qwen_vl(question,engine_params,args.modality) - sampling_params['stop_token_ids'] = stop_token_ids + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = ( + req_data.prompts + if args.use_different_prompt_per_request + else [req_data.prompts[0]] + ) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(**sampling_params) + sampling_params = ( + SamplingParams( + temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids + ) + if req_data.sampling_params is None + else req_data.sampling_params + ) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference + uuid = "uuid_0" inputs = { - "prompt": prompt, - "multi_modal_data": { - modality: data - }, + "prompt": prompts[0], + "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + inputs_with_empty_media = { + "prompt": prompts[0], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, } - else: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - modality: data - }, - } for _ in range(args.num_prompts)] + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs, inputs_with_empty_media = apply_image_repeat( + args.image_repeat_prob, + args.num_prompts, + data, + prompts, + modality, + ) + else: + # Use the same image for all prompts + inputs = [] + inputs_with_empty_media = [] + for i in range(args.num_prompts): + uuid = "uuid_{}".format(i) + inputs.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: data}, + "multi_modal_uuids": {modality: uuid}, + } + ) + inputs_with_empty_media.append( + { + "prompt": prompts[i % len(prompts)], + "multi_modal_data": {modality: None}, + "multi_modal_uuids": {modality: uuid}, + } + ) + + # Add LoRA request if applicable + lora_request = ( + req_data.lora_requests * args.num_prompts if req_data.lora_requests else None + ) start_time = time.perf_counter() - outputs = llm.generate(inputs, sampling_params=sampling_params) + outputs = llm.generate( + inputs, + sampling_params=sampling_params, + lora_request=lora_request, + ) end_time = time.perf_counter() duration_time = end_time - start_time - num_tokens = 0 + num_tokens = 0 + print("-" * 50) for o in outputs: num_tokens += len(o.outputs[0].token_ids) generated_text = o.outputs[0].text print(generated_text) + print("-" * 50) num_requests = args.num_prompts # 请求的数量 qps = num_requests / duration_time - print(f"requests: {num_requests}, QPS: {qps}, tokens: {num_tokens}, Token/s: {num_tokens/duration_time}") \ No newline at end of file + print(f"requests: {num_requests}, QPS: {qps}, tokens: {num_tokens}, Token/s: {num_tokens/duration_time}") + + if args.verify_mm_cache_hit_with_uuids: + try: + # Verify cache hits with UUIDs + print( + "Sending a second batch of requests with empty media" + " and matching UUIDs." + ) + outputs = llm.generate( + inputs_with_empty_media, + sampling_params=sampling_params, + lora_request=lora_request, + ) + print("-" * 50) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + print("-" * 50) + except Exception as e: + print(f"Failed to verify cache hits with UUIDs.
Error: {e}") + + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/xlmroberta/vllm/README.md b/models/multimodal/vision_language_model/xlmroberta/vllm/README.md index 363a896f356df74864981607a17ca94b62de8957..c8e1e8aedb79fd280724d6913c40ff4b714260d4 100644 --- a/models/multimodal/vision_language_model/xlmroberta/vllm/README.md +++ b/models/multimodal/vision_language_model/xlmroberta/vllm/README.md @@ -21,7 +21,7 @@ RoBERTa is a transformers model pretrained on a large corpus in a self-supervise ```bash # Download model from the website and make sure the model's path is "data/bge-reranker-v2-m3" "data/multilingual-e5-large" -mkdir data +mkdir -p data ``` ### Install Dependencies @@ -32,7 +32,7 @@ In order to run the model smoothly, you need to get the sdk from [resource cente ### Sentence Pair Scoring Modeling ```bash -python3 offline_inference_scoring.py --model data/bge-reranker-v2-m3 --task "score" --tensor-parallel-size 1 +python3 offline_inference_scoring.py --model data/bge-reranker-v2-m3 ``` ### Text Embedding diff --git a/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_embedding.py b/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_embedding.py index a7be09fa4fbc503f538ac3d19dbcc4ef9bf8c34a..79d250256390ca443b397b3bb42af776db2b50b4 100644 --- a/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_embedding.py +++ b/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_embedding.py @@ -1,54 +1,25 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import sys -from pathlib import Path -import os +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect -import logging -import time +from argparse import Namespace -import torch -from utils import load_chat_template, sampling_add_cli_args -from vllm import LLM, EngineArgs, SamplingParams +from vllm import LLM, EngineArgs +from vllm.utils.argparse_utils import FlexibleArgumentParser -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() - ) - ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } +def parse_args(): + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + # Set example specific arguments + parser.set_defaults( + model="intfloat/e5-small", + runner="pooling", + enforce_eager=True, + ) + return parser.parse_args() - model_name = os.path.dirname(args.model).rsplit("/")[-1] +def main(args: Namespace): # Sample prompts. prompts = [ "Hello, my name is", @@ -57,27 +28,24 @@ if __name__ == "__main__": "The future of AI is", ] - # Create a sampling params object. - sampling_params = SamplingParams(**sampling_params) - # Create an LLM. - llm = LLM(**engine_params) + # You should pass runner="pooling" for embedding models + llm = LLM(**vars(args)) - - - - start_time = time.perf_counter() - # skip process chat template # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = llm.encode(prompts) - end_time = time.perf_counter() - duration_time = end_time - start_time - num_tokens = 0 + outputs = llm.embed(prompts) + # Print the outputs. - for output in outputs: - num_tokens += len(output.outputs.embedding) - print(output.outputs.embedding) # list of hidden_size floats - print("Offline inference is successful!") - num_requests = len(prompts) # 请求的数量 - qps = num_requests / duration_time - print(f"requests: {num_requests}, QPS: {qps}, tokens: {num_tokens}, Token/s: {num_tokens/duration_time}") \ No newline at end of file + print("\nGenerated Outputs:\n" + "-" * 60) + for prompt, output in zip(prompts, outputs): + embeds = output.outputs.embedding + embeds_trimmed = ( + (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds + ) + print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})") + print("-" * 60) + + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_scoring.py b/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_scoring.py index 05c9ed0b4ed1282a261738088fbc532c914b337c..7920230aa3fc471b41db61f981766d6c7098abb6 100644 --- a/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_scoring.py +++ b/models/multimodal/vision_language_model/xlmroberta/vllm/offline_inference_scoring.py @@ -1,47 +1,47 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from vllm import LLM -import argparse +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from argparse import Namespace + from vllm import LLM, EngineArgs -import dataclasses +from vllm.utils.argparse_utils import FlexibleArgumentParser -if __name__ == "__main__": - - parser = argparse.ArgumentParser() + +def parse_args(): + parser = FlexibleArgumentParser() parser = EngineArgs.add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - engine_params = {attr: getattr(args, attr) for attr in engine_args} + # Set example specific arguments + parser.set_defaults( + model="BAAI/bge-reranker-v2-m3", + runner="pooling", + enforce_eager=True, + ) + return parser.parse_args() + + +def main(args: Namespace): # Sample prompts. text_1 = "What is the capital of France?" texts_2 = [ - "The capital of Brazil is Brasilia.", "The capital of France is Paris." + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", ] # Create an LLM. - # You should pass task="score" for cross-encoder models - model = LLM(**engine_params) + # You should pass runner="pooling" for cross-encoder models + llm = LLM(**vars(args)) # Generate scores. The output is a list of ScoringRequestOutputs. - outputs = model.score(text_1, texts_2) + outputs = llm.score(text_1, texts_2) # Print the outputs. 
+ print("\nGenerated Outputs:\n" + "-" * 60) for text_2, output in zip(texts_2, outputs): score = output.outputs.score - print(f"Pair: {[text_1, text_2]!r} | Score: {score}") - + print(f"Pair: {[text_1, text_2]!r} \nScore: {score}") + print("-" * 60) + + +if __name__ == "__main__": + args = parse_args() + main(args) \ No newline at end of file diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 792051c5853a25fe8ecf10e9228b07d1601fc9a5..b60e5ee91e71df6390ac735527320a1a01e8d33b 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -103,10 +103,10 @@ _VISION_MODEL_CONFIGS = { "llava": ("offline_inference_vision_language.py", ["--max-tokens 256", "-tp 4", "--model-type llava-next", "--max-model-len 4096"], "0,1,3,4", ["VLLM_ASSETS_CACHE=../vllm/"]), "llava_next_video_7b": ("offline_inference_vision_language.py", ["--max-tokens 256", "-tp 4", "--model-type llava-next-video", "--modality video", "--dtype bfloat16"], "0,1,3,4", ["VLLM_ASSETS_CACHE=../vllm/"]), "intern_vl": ("offline_inference_vision_language.py", ["--max-tokens 256", "-tp 2", "--max-model-len 2048"], None, ["VLLM_ASSETS_CACHE=../vllm/"]), - "qwen_vl": ("offline_inference_vision_language.py", ["-tp 1", "--hf-overrides '{\"architectures\": [\"QwenVLForConditionalGeneration\"]}'"], None, ["VLLM_ASSETS_CACHE=../vllm/"]), + "qwen_vl": ("offline_inference_vision_language.py", ["--model-type qwen_vl"], None, ["VLLM_ASSETS_CACHE=../vllm/"]), "qwen2_vl": ("offline_inference_vision_language.py", ["--max-tokens 256", "-tp 4", "--max-num-seqs 5"], "0,1,3,4", ["VLLM_ASSETS_CACHE=../vllm/", "ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1"]), "qwen2_5_vl": ("offline_inference_vision_language.py", ["-tp 4", "--max-token 256"], "0,1,3,4", ["VLLM_ASSETS_CACHE=../vllm/", "ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1"]), - "e5-v": ("offline_inference_vision_language_embedding.py", ["--modality \"image\"", "--tensor_parallel_size 1", "--task \"embed\"", "--max_model_len 4096"], None, []), + "e5-v": ("offline_inference_vision_language_embedding.py", ["---model-name e5-v"], None, []), "glm-4v": ("offline_inference_vision_language.py", ["--max-tokens 256", "-tp 4", "--hf-overrides '{\"architectures\": [\"GLM4VForCausalLM\"]}'"], "0,1,3,4", ["VLLM_ASSETS_CACHE=../vllm/"]), "minicpm-o-2": ("offline_inference_vision_language.py", ["--max-model-len 4096", "--max-num-seqs 2", "--disable-mm-preprocessor-cache"], None, []), "phi3_v": ("offline_inference_vision_language.py", ["--max-tokens 256", "-tp 4", "--max-model-len 4096"], "0,1,3,4", ["VLLM_ASSETS_CACHE=../vllm/"]), @@ -170,9 +170,9 @@ def _build_inference_script(model: Dict[str, Any], prec: str) -> str: case "xlmroberta": return base_script + ( - "python3 offline_inference_scoring.py --model ./xlmroberta --task \"score\" --tensor-parallel-size 1\n" + "python3 offline_inference_scoring.py --model ./xlmroberta \n" "ln -s /mnt/deepspark/data/checkpoints/multilingual-e5-large ./\n" - "python3 offline_inference_embedding.py --model ./multilingual-e5-large -tp 2" + "python3 offline_inference_embedding.py --model ./multilingual-e5-large" ) case "whisper": @@ -183,13 +183,24 @@ def _build_inference_script(model: Dict[str, Any], prec: str) -> str: ) # Vision-language models - case "aria" | "chameleon_7b" | "fuyu_8b" | "idefics3" | "h2vol" | "minicpm-v-2" | "llama-3.2" | "pixtral" | "llava" | "llava_next_video_7b" | "intern_vl" | "qwen_vl" | "qwen2_vl" | "qwen2_5_vl" | "e5-v" | "glm-4v" | "minicpm-o-2" | "phi3_v" | "paligemma" | "minicpm-v-4" | "deepseek-ocr": + case "aria" | "chameleon_7b" | "fuyu_8b" 
| "h2vol" | "llama-3.2" | "pixtral" | "llava" | "llava_next_video_7b" | "intern_vl" | "qwen2_vl" | "qwen2_5_vl" | "e5-v" | "glm-4v" | "minicpm-o-2" | "phi3_v" | "paligemma": config = _VISION_MODEL_CONFIGS[model_name] script_file, args, gpus, envs = config env_lines = "\n".join(f"export {e}" for e in envs) + ("\n" if envs else "") gpu_prefix = f"CUDA_VISIBLE_DEVICES={gpus} " if gpus else "" arg_str = " ".join(args) cmd = f"{gpu_prefix}python3 {script_file} --model ./{model_name} {arg_str} --trust-remote-code --temperature 0.0" + if model_name == "e5-v": + cmd = f"{gpu_prefix}python3 {script_file}" + return base_script + env_lines + cmd + + case "deepseek-ocr" | "minicpm-v-2" | "minicpm-v-4" | "idefics3" | "qwen_vl": + config = _VISION_MODEL_CONFIGS[model_name] + script_file, args, gpus, envs = config + env_lines = "\n".join(f"export {e}" for e in envs) + ("\n" if envs else "") + gpu_prefix = f"CUDA_VISIBLE_DEVICES={gpus} " if gpus else "" + arg_str = " ".join(args) + cmd = f"{gpu_prefix}python3 {script_file} {arg_str}" return base_script + env_lines + cmd # Standard LLMs @@ -243,6 +254,14 @@ def _append_benchmark_script(script: str, model: Dict[str, Any]) -> str: "--dataset-path lmarena-ai/VisionArena-Chat --num-prompts 10 --hf-split train " "-tp 4 --max-model-len 4096 --max-num-seqs 2 --trust-remote-code" ) + if model_name == "deepseek-ocr": + bench = ( + "mkdir -p lmarena-ai\n" + "ln -s /mnt/deepspark/data/datasets/VisionArena-Chat lmarena-ai/\n" + "CUDA_VISIBLE_DEVICES=0,1,3,4 python3 vllm/benchmarks/benchmark_throughput.py " + f"--model ./{model_name} --backend vllm-chat --dataset-name hf " + "--dataset-path lmarena-ai/VisionArena-Chat --num-prompts 10 --hf-split train " + ) return script + common_bench + bench return script @@ -286,13 +305,13 @@ def _parse_script_output(sout: str, prec: str, display_name: str) -> Dict[str, A return result_entry # Fallback pattern for concurrency - match = re.search(r"Maximum concurrency for ([0-9,]+) tokens per request:\s*([0-9.]+)x", sout) - if match: - return { - "tokens": int(match.group(1).replace(",", "")), - "QPS": float(match.group(2)), - "status": "PASS" - } + # match = re.search(r"Maximum concurrency for ([0-9,]+) tokens per request:\s*([0-9.]+)x", sout) + # if match: + # return { + # "tokens": int(match.group(1).replace(",", "")), + # "QPS": float(match.group(2)), + # "status": "PASS" + # } matchs = re.findall(METRIC_PATTERN, sout) if matchs and len(matchs) == 1: