diff --git a/README.md b/README.md
index 0af03f6635e001599e5e3944d23ac88d3e49a2ce..3f571e07c09f495845f52a4a0827a5b23ec12238 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@
| DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 |
| DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 |
| DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 |
-| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.3.0 |
+| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.4.0 |
| DeepSeek-R1-Distill-Qwen-14B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-14b/vllm) | 4.3.0 |
| DeepSeek-R1-Distill-Qwen-32B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-32b/vllm) | 4.3.0 |
| DeepSeek-OCR | `Transformers` | [✅](models/multimodal/vision_language_model/deepseek-ocr/transformers) | 4.3.0 |
@@ -61,7 +61,7 @@
| Qwen-7B | `vLLM` | [✅](models/nlp/llm/qwen-7b/vllm) | 4.3.0 |
| Qwen-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen_vl/vllm) | 4.3.0 |
| Qwen2-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_vl/vllm) | 4.3.0 |
-| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.3.0 |
+| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.4.0 |
| Qwen1.5-7B | `vLLM` | [✅](models/nlp/llm/qwen1.5-7b/vllm) | 4.3.0 |
| Qwen1.5-7B | `TGI` | [✅](models/nlp/llm/qwen1.5-7b/tgi) | 4.3.0 |
| Qwen1.5-14B | `vLLM` | [✅](models/nlp/llm/qwen1.5-14b/vllm) | 4.3.0 |
@@ -70,9 +70,14 @@
| Qwen2-7B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-7b/vllm) | 4.3.0 |
| Qwen2-72B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-72b/vllm) | 4.3.0 |
| Qwen3_Moe | `vLLM` | [✅](models/nlp/llm/qwen3-235b/vllm) | dev-only |
-| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3/vllm) | 4.4.0 |
+| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3-8b/vllm) | 4.4.0 |
+| Qwen3-32B | `vLLM` | [✅](models/nlp/llm/qwen3-32b/vllm) | 4.4.0 |
+| Qwen3-30B-A3B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-30b-a3b-thinking/vllm) | 4.4.0 |
+| Qwen3-235B-A22B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-235b-a22b-thinking/vllm) | 4.4.0 |
+| Qwen3-Next-80B-A3B | `vLLM` | [✅](models/nlp/llm/qwen3-next-80b-a3b/vllm) | 4.4.0 |
+| DeepSeek-V3.1 | `vLLM` | [✅](models/nlp/llm/deepseek-v3.1/vllm) | 4.4.0 |
| StableLM2-1.6B | `vLLM` | [✅](models/nlp/llm/stablelm/vllm) | 4.3.0 |
-| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | dev-only |
+| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | 4.4.0 |
| Ultravox | `vLLM` | [✅](models/speech/asr/ultravox/vllm) | 4.3.0 |
| Whisper | `vLLM` | [✅](models/speech/asr/whisper/vllm/) | 4.3.0 |
| XLMRoberta | `vLLM` | [✅](models/multimodal/vision_language_model/xlmroberta/vllm) | 4.3.0 |
@@ -323,6 +328,12 @@
| Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 |
| Stable Diffusion 2.1 | ixRT | [✅](models/multimodal/diffusion_model/stable-diffusion-2.1/diffusers) | 4.4.0 |
| Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | dev-only |
+| FLUX.1-Dev | xDiT | [✅](models/multimodal/diffusion_model/flux.1-dev/xdit) | 4.4.0 |
+| HunyuanVideo | xDiT | [✅](models/multimodal/diffusion_model/hunyuan_video/xdit) | 4.4.0 |
+| Wan2.1-T2V-14B | xDiT | [✅](models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit) | 4.4.0 |
+| Wan2.2-TI2V-5B | xDiT | [✅](models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit) | 4.4.0 |
+| HunyuanDiT-v1.2 | xDiT | [✅](models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit) | 4.4.0 |
+| SD3-Medium | xDiT | [✅](models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit) | 4.4.0 |
### 自然语言处理(NLP)
diff --git a/README_en.md b/README_en.md
index 631736f9d5a0c0605ce14cca93b394a21bafc037..08f98040ca51db95758a53295ae68950232a3970 100644
--- a/README_en.md
+++ b/README_en.md
@@ -46,7 +46,7 @@ inference to be expanded in the future.
| DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 |
| DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 |
| DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 |
-| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.3.0 |
+| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.4.0 |
| DeepSeek-R1-Distill-Qwen-14B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-14b/vllm) | 4.3.0 |
| DeepSeek-R1-Distill-Qwen-32B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-32b/vllm) | 4.3.0 |
| DeepSeek-OCR | `Transformers` | [✅](models/multimodal/vision_language_model/deepseek-ocr/transformers) | 4.3.0 |
@@ -71,7 +71,7 @@ inference to be expanded in the future.
| Qwen-7B | `vLLM` | [✅](models/nlp/llm/qwen-7b/vllm) | 4.3.0 |
| Qwen-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen_vl/vllm) | 4.3.0 |
| Qwen2-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_vl/vllm) | 4.3.0 |
-| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.3.0 |
+| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.4.0 |
| Qwen1.5-7B | `vLLM` | [✅](models/nlp/llm/qwen1.5-7b/vllm) | 4.3.0 |
| Qwen1.5-7B | `TGI` | [✅](models/nlp/llm/qwen1.5-7b/tgi) | 4.3.0 |
| Qwen1.5-14B | `vLLM` | [✅](models/nlp/llm/qwen1.5-14b/vllm) | 4.3.0 |
@@ -80,9 +80,14 @@ inference to be expanded in the future.
| Qwen2-7B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-7b/vllm) | 4.3.0 |
| Qwen2-72B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-72b/vllm) | 4.3.0 |
| Qwen3_Moe | `vLLM` | [✅](models/nlp/llm/qwen3-235b/vllm) | dev-only |
-| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3/vllm) | 4.4.0 |
+| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3-8b/vllm) | 4.4.0 |
+| Qwen3-32B | `vLLM` | [✅](models/nlp/llm/qwen3-32b/vllm) | 4.4.0 |
+| Qwen3-30B-A3B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-30b-a3b-thinking/vllm) | 4.4.0 |
+| Qwen3-235B-A22B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-235b-a22b-thinking/vllm) | 4.4.0 |
+| Qwen3-Next-80B-A3B | `vLLM` | [✅](models/nlp/llm/qwen3-next-80b-a3b/vllm) | 4.4.0 |
+| DeepSeek-V3.1 | `vLLM` | [✅](models/nlp/llm/deepseek-v3.1/vllm) | 4.4.0 |
| StableLM2-1.6B | `vLLM` | [✅](models/nlp/llm/stablelm/vllm) | 4.3.0 |
-| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | dev-only |
+| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | 4.4.0 |
| Ultravox | `vLLM` | [✅](models/speech/asr/ultravox/vllm) | 4.3.0 |
| Whisper | `vLLM` | [✅](models/speech/asr/whisper/vllm/) | 4.3.0 |
| XLMRoberta | `vLLM` | [✅](models/multimodal/vision_language_model/xlmroberta/vllm) | 4.3.0 |
@@ -332,6 +337,12 @@ inference to be expanded in the future.
| Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 |
| Stable Diffusion 2.1 | ixRT | [✅](models/multimodal/diffusion_model/stable-diffusion-2.1/diffusers) | 4.4.0 |
| Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | dev-only |
+| FLUX.1-Dev | xDiT | [✅](models/multimodal/diffusion_model/flux.1-dev/xdit) | 4.4.0 |
+| HunyuanVideo | xDiT | [✅](models/multimodal/diffusion_model/hunyuan_video/xdit) | 4.4.0 |
+| Wan2.1-T2V-14B | xDiT | [✅](models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit) | 4.4.0 |
+| Wan2.2-TI2V-5B | xDiT | [✅](models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit) | 4.4.0 |
+| HunyuanDiT-v1.2 | xDiT | [✅](models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit) | 4.4.0 |
+| SD3-Medium | xDiT | [✅](models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit) | 4.4.0 |
### NLP
#### PLM (Pre-trained Language Model)
diff --git a/RELEASE.md b/RELEASE.md
index 866a42207844a63ba4c95562f2fcfa0d5017717b..b0cf8a3be90c196bdb1f94c60d4823b4d081218b 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,7 @@
| Release Date | Release Version | IXUCA SDK |
|--------------|-----------------|-----------|
+| Mar 2026 | 26.03 | v4.4.0 |
| Dec 2025 | 25.12 | v4.3.0 |
| Sep 2025 | 25.09 | v4.3.0 |
| Jun 2025 | 25.06 | v4.2.0 |
@@ -20,6 +21,104 @@
## Release Notes
+### DeepSparkInference 26.03
+
+#### Models and Algorithms
+
+* Added 16 small-model inference examples: 10 on the IGIE inference engine and 6 on the ixRT inference engine.
+* Added 18 large-model (LLM and diffusion) inference examples: 10 using [vLLM](https://github.com/vllm-project/vllm), 6 using [xDiT](https://github.com/xdit-team/xDiT), and 2 using ixRT.
+
+<table>
+    <tr align="center"><td colspan="3"><b>IGIE</b></td></tr>
+    <tr align="center"><td>MobileViT-S</td><td>ViT-B-32</td><td>ViT-L-14</td></tr>
+    <tr align="center"><td>DETR</td><td>RT-DETR</td><td>YOLOv11m</td></tr>
+    <tr align="center"><td>YOLOv11s</td><td>YOLOv26n</td><td>YOLOv5s</td></tr>
+    <tr align="center"><td>DenseNet121 (int8)</td><td></td><td></td></tr>
+    <tr align="center"><td colspan="3"><b>ixRT</b></td></tr>
+    <tr align="center"><td>Swin Transformer</td><td>RepNet</td><td>Grounding DINO</td></tr>
+    <tr align="center"><td>RT-DETR</td><td>CRNN</td><td>UNet</td></tr>
+    <tr align="center"><td colspan="3"><b>LLM</b></td></tr>
+    <tr align="center"><td>DeepSeek-V3.1 (vLLM)</td><td>DeepSeek-VL2 (vLLM)</td><td>DeepSeek-OCR (vLLM)</td></tr>
+    <tr align="center"><td>InternLM3 (vLLM)</td><td>MiniCPM-V-4 (vLLM)</td><td>Qwen3-8B (vLLM)</td></tr>
+    <tr align="center"><td>Qwen3-32B (vLLM)</td><td>Qwen3-30B-A3B (vLLM)</td><td>Qwen3-235B-A22B (vLLM)</td></tr>
+    <tr align="center"><td>Qwen3-Next-80B (vLLM)</td><td>FLUX.1-Dev (xDiT)</td><td>HunyuanVideo (xDiT)</td></tr>
+    <tr align="center"><td>Wan2.1-T2V-14B (xDiT)</td><td>Wan2.2-TI2V-5B (xDiT)</td><td>HunyuanDiT-v1.2 (xDiT)</td></tr>
+    <tr align="center"><td>SD3-Medium (xDiT)</td><td>CosyVoice (ixRT)</td><td>Stable Diffusion 2.1 (ixRT)</td></tr>
+</table>
+
+#### Fixes and Updates
+
+* Adapted the CI test pipeline for IXUCA SDK 4.4.0.
+* Fixed the missing pkg_resources module error when running the IGIE MViTv2-base model.
+* Fixed deprecated-argument errors in vLLM inference models and migrated them to offline inference mode.
+* Fixed a compatibility issue of DeepSeek-R1-Distill-Llama-8B with vLLM 0.11.2.
+* Fixed argument errors in the Qwen-VL, Qwen2-VL, Qwen2.5-VL, and Whisper models.
+* Fixed a compatibility issue of Pixtral with vLLM 0.11.2.
+* Fixed a runtime error of the ixRT RT-DETR model at batch size 64.
+* Fixed trust_remote_code configuration issues in multiple models.
+* Fixed a compatibility issue between the IGIE YOLOv8n model and the ultralytics version.
+* Fixed the dataset path issue of the YOLOx model.
+* Fixed errors when installing TensorFlow for the IGIE ResNet and VGG16 models.
+* Fixed compatibility issues caused by the protobuf version.
+* Added core-binding command support for the ARM architecture.
+* Added batch size argument support for the YOLOv8n ixRT model.
+
+#### Version Mapping
+
+DeepSparkInference 26.03 corresponds to Iluvatar IXUCA SDK 4.4.0.
+
+#### Thanks to the Following Community Contributors
+
+YoungPeng, honglyua, majorli6, shengyan.zhao, yougouda, jinrui.zhang, tianyu, anders.
+
### DeepSparkInference 25.12
#### 模型与算法
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md b/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..95bc45a8f5472ee4b248fd2d2ff41b15f6ce92b7
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md
@@ -0,0 +1,61 @@
+# FLUX.1-Dev (xDiT)
+
+## Model Description
+
+FLUX.1-Dev is a state-of-the-art text-to-image diffusion model developed by Black Forest Labs. It excels at generating high-quality, detailed images from text prompts with exceptional prompt adherence and image quality.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+- Model:
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. Modify model path in ``run.sh``:
+```bash
+# The run.sh script is pre-copied in this directory
+# Modify MODEL_CONFIGS to point to your model path
+vim run.sh
+# Update: MODEL_CONFIGS=(["Flux"]="flux_example.py /home/data/flux___1-schnell/ 28")
+```
+
+2. Run script:
+```bash
+bash run.sh
+```
+
+3. The model supports 512x512 and 1024x1024 output resolutions. To change the resolution, edit `TASK_ARGS`:
+```bash
+vim run.sh
+# Modify TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+# to TASK_ARGS="--height 512 --width 512 --no_use_resolution_binning --guidance_scale 3.5"
+```
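+
+For reference, once the variables in `run.sh` are substituted, the launch reduces to a single `torchrun` call. The sketch below assumes a 2-GPU setup and a local FLUX.1-Dev checkpoint path (placeholder); the environment variables exported at the top of `run.sh` are still required:
+
+```bash
+torchrun --nproc_per_node=2 ./flux_example.py \
+    --model /path/to/FLUX.1-dev/ \
+    --pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1 \
+    --height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5 \
+    --num_inference_steps 28 --warmup_steps 1 \
+    --prompt "brown dog laying on the ground with a metal bowl in front of him."
+```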
+
+## References
+
+- [FLUX.1](https://github.com/black-forest-labs/flux)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py b/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a6a91e9a35cc2f4433c9303a9fbaffb0321f408
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py
@@ -0,0 +1,124 @@
+import logging
+import time
+import torch
+import torch.distributed
+from transformers import T5EncoderModel
+from xfuser import xFuserFluxPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+ get_world_group,
+ get_data_parallel_rank,
+ get_data_parallel_world_size,
+ get_runtime_state,
+ is_dp_last_group,
+ get_pipeline_parallel_world_size,
+ get_classifier_free_guidance_world_size,
+ get_tensor_model_parallel_world_size,
+)
+from xfuser.model_executor.cache.diffusers_adapters import apply_cache_on_transformer
+# if os.environ.get("ENABLE_IXFORMER_CONV2D", "0") == "1":
+# import ixformer as ixff
+# torch.nn.functional.conv2d=ixff.conv2d
+
+
+def main():
+ torch.backends.cudnn.benchmark=False
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
+ args = xFuserArgs.add_cli_args(parser).parse_args()
+ engine_args = xFuserArgs.from_cli_args(args)
+ engine_config, input_config = engine_args.create_config()
+ runtime_dtype = torch.bfloat16
+ engine_config.runtime_config.dtype = runtime_dtype
+ local_rank = get_world_group().local_rank
+ torch.cuda.set_device(local_rank)
+ text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16)
+
+ if args.use_fp8_t5_encoder:
+ from optimum.quanto import freeze, qfloat8, quantize
+ logging.info(f"rank {local_rank} quantizing text encoder 2")
+ quantize(text_encoder_2, weights=qfloat8)
+ freeze(text_encoder_2)
+
+ cache_args = {
+ "use_teacache": engine_args.use_teacache,
+ "use_fbcache": engine_args.use_fbcache,
+ "rel_l1_thresh": 0.12,
+ "return_hidden_states_first": False,
+ "num_steps": input_config.num_inference_steps,
+ }
+ # print(cache_args)
+ pipe = xFuserFluxPipeline.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ engine_config=engine_config,
+ cache_args=cache_args,
+ torch_dtype=runtime_dtype,
+ text_encoder_2=text_encoder_2,
+
+ )
+ pipe.vae = pipe.vae.to(dtype=torch.float32)
+
+ # pipe.vae.to(memory_format=torch.channels_last)
+ # for net in pipe.vae.modules():
+ # net.register_forward_hook(forward_hook)
+
+ if args.enable_sequential_cpu_offload:
+ pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} sequential CPU offload enabled")
+ elif args.enable_model_cpu_offload:
+ pipe.enable_model_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} model CPU offload enabled")
+ else:
+ pipe = pipe.to(local_rank)
+
+ parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+ ##
+ import os
+ if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1":
+ from w8a8_linear import apply_quant_linear_i8w8o16
+ pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer)
+ # pipe.transformer.fuse_qkv_projections()
+ pipe.prepare_run(input_config, steps=input_config.num_inference_steps)
+
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ prompt=input_config.prompt,
+ num_inference_steps=input_config.num_inference_steps,
+ output_type=input_config.output_type,
+ max_sequence_length=256,
+ guidance_scale=input_config.guidance_scale,
+ generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+ )
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+ parallel_info = (
+ f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+ f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+ f"tp{engine_args.tensor_parallel_degree}_"
+ f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+ )
+ if input_config.output_type == "pil":
+ dp_group_index = get_data_parallel_rank()
+ num_dp_groups = get_data_parallel_world_size()
+ dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
+ if pipe.is_dp_last_group():
+ for i, image in enumerate(output.images):
+ image_rank = dp_group_index * dp_batch_size + i
+ image_name = f"flux_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
+ image.save(f"./{image_name}")
+ print(f"image {i} saved to ./{image_name}")
+
+ if get_world_group().rank == get_world_group().world_size - 1:
+ print(
+ f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
+ )
+ get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt b/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..92e80ab221aa39de5448471b3bc63121b1dbbd37
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt
@@ -0,0 +1,8 @@
+#diffusers
+yunchang
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh b/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..163e088888c3ec0cd88b0d5e3d50bdd8a1a00990
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh
@@ -0,0 +1,72 @@
+# set -x
+export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2 #xdit >=0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use SageAttention (xdit ==0.4.4)
+export ENABLE_IXFORMER_W8A8LINEAR=1
+
+# Select the model type
+export MODEL_TYPE="Flux"
+# Configuration for different model types
+# script, model_id, inference_step
+declare -A MODEL_CONFIGS=(
+ ["Flux"]="flux_example.py /home/data/flux___1-schnell/ 28"
+)
+
+echo ${MODEL_CONFIGS[$MODEL_TYPE]}
+
+# if [ -v MODEL_CONFIGS[$MODEL_TYPE] ] ; then
+if [ -n "${MODEL_CONFIGS[$MODEL_TYPE]+_}" ]; then
+ IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
+ export SCRIPT MODEL_ID INFERENCE_STEP
+else
+ echo "Invalid MODEL_TYPE: $MODEL_TYPE"
+ exit 1
+fi
+
+
+
+# task args
+TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+
+# cache args
+# CACHE_ARGS="--use_teacache"
+# CACHE_ARGS="--use_fbcache"
+
+# On 2 GPUs: pipefusion_parallel_degree=2, ulysses_degree=1, ring_degree=1
+N_GPUS=2
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1"
+
+# CFG_ARGS="--use_cfg_parallel"
+
+# By default, num_pipeline_patch = pipefusion_degree, and you can tune this parameter to achieve optimal performance.
+# PIPEFUSION_ARGS="--num_pipeline_patch 8 "
+
+# For high-resolution images, use the latent output type to avoid running the VAE module (useful when only measuring speed).
+# OUTPUT_ARGS="--output_type latent"
+
+# PARALLLEL_VAE="--use_parallel_vae"
+
+# Another compile option is `--use_onediff` which will use onediff's compiler.
+# COMPILE_FLAG="--use_torch_compile"
+
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 1 \
+--prompt "brown dog laying on the ground with a metal bowl in front of him." \
+$CFG_ARGS \
+$PARALLLEL_VAE \
+$COMPILE_FLAG \
+$QUANTIZE_FLAG \
+$CACHE_ARGS \
+
diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..25717b05019a5509b3a2a1e032adac21f3713df0
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md
@@ -0,0 +1,58 @@
+# HunyuanDiT-v1.2-Diffusers (xDiT)
+
+## Model Description
+
+HunyuanDiT-v1.2 is Tencent's advanced text-to-image diffusion model, featuring improved architecture and training for high-quality image generation. It excels at generating detailed, photorealistic images from text descriptions.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify model path:
+```bash
+vim run_hunyuandit.sh
+# Update MODEL_ID to your actual model path
+```
+
+2. Run script:
+```bash
+bash run_hunyuandit.sh
+```
+
+3. The model supports batch sizes 1 and 2 (BS=1/BS=2). Pass one prompt for BS=1 (default) or two prompts for BS=2; a full launch sketch follows this list:
+```bash
+# BS1 (default) prompt format
+#--prompt "brown dog laying on the ground with a metal bowl in front of him."
+# BS2 prompt format
+--prompt "brown dog laying on the ground with a metal bowl in front of him." "brown dog laying on the ground with a metal bowl in front of him."
+```
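+
+For reference, the expanded command behind `run_hunyuandit.sh` looks roughly like the sketch below (2 GPUs with CFG parallelism; the model path is a placeholder, and the exports at the top of the script are still required):
+
+```bash
+torchrun --nproc_per_node=2 ./hunyuandit_example.py \
+    --model /path/to/HunyuanDiT-v1.2-Diffusers/ \
+    --pipefusion_parallel_degree 1 --ulysses_degree 1 --ring_degree 1 \
+    --tensor_parallel_degree 1 --data_parallel_degree 1 --use_cfg_parallel \
+    --height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5 \
+    --num_inference_steps 20 --warmup_steps 1 \
+    --prompt "brown dog laying on the ground with a metal bowl in front of him."
+```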
+
+## References
+
+- [HunyuanDiT](https://github.com/Tencent/HunyuanDiT)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b26a4503848bf371699693406f04ebf9e052a746
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py
@@ -0,0 +1,92 @@
+import time
+import os
+import torch
+import torch.distributed
+from transformers import T5EncoderModel
+from xfuser import xFuserHunyuanDiTPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+ get_world_group,
+ is_dp_last_group,
+ get_data_parallel_world_size,
+ get_runtime_state,
+ get_data_parallel_rank,
+)
+
+def main():
+
+ # torch.backends.cudnn.benchmark=False
+
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
+ args = xFuserArgs.add_cli_args(parser).parse_args()
+ engine_args = xFuserArgs.from_cli_args(args)
+ engine_config, input_config = engine_args.create_config()
+ local_rank = get_world_group().local_rank
+ torch.cuda.set_device(local_rank)
+ text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16)
+ if args.use_fp8_t5_encoder:
+ from optimum.quanto import freeze, qfloat8, quantize
+ print(f"rank {local_rank} quantizing text encoder 2")
+ quantize(text_encoder_2, weights=qfloat8)
+ freeze(text_encoder_2)
+
+ pipe = xFuserHunyuanDiTPipeline.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ engine_config=engine_config,
+ torch_dtype=torch.float16,
+ text_encoder_2=text_encoder_2,
+ ).to(f"cuda:{local_rank}")
+ pipe.vae.to(memory_format=torch.channels_last)
+ parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+ import os
+ if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1":
+ from w8a8_linear import apply_quant_linear_i8w8o16
+ pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer)
+ pipe.prepare_run(input_config)
+
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ prompt=input_config.prompt,
+ num_inference_steps=input_config.num_inference_steps,
+ output_type=input_config.output_type,
+ use_resolution_binning=input_config.use_resolution_binning,
+ guidance_scale=input_config.guidance_scale,
+ generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+ )
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+ parallel_info = (
+ f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+ f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+ f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+ )
+ if input_config.output_type == "pil":
+ dp_group_index = get_data_parallel_rank()
+ num_dp_groups = get_data_parallel_world_size()
+ dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
+ if pipe.is_dp_last_group():
+ if not os.path.exists("results"):
+ os.mkdir("results")
+ for i, image in enumerate(output.images):
+ image_rank = dp_group_index * dp_batch_size + i
+ image.save(
+ f"./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
+ )
+ print(
+ f"image {i} saved to ./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
+ )
+
+ if get_world_group().rank == get_world_group().world_size - 1:
+ print(
+ f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
+ )
+ get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2969a4385d913c98a2cb13adfa2bb29f3d3f0938
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c136150e280f7b1f260977707f062515974491fc
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh
@@ -0,0 +1,45 @@
+# set -x
+export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Increasing the ring degree showed no speedup
+# export NCCL_USE_HIGHPRIORITYWARP=1
+
+export ENABLE_IXFORMER_INFERENCE=1
+# export ATTN_OPT_LEVEL=2
+export ENABLE_IXFORMER_W8A8LINEAR=0
+
+# Select the model type
+SCRIPT=hunyuandit_example.py
+MODEL_ID=/data/nlp/HunyuanDiT-v1.2-Diffusers/
+INFERENCE_STEP=20
+
+mkdir -p ./results
+
+# task args
+TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+
+# cache args
+# CACHE_ARGS="--use_teacache"
+# CACHE_ARGS="--use_fbcache"
+
+N_GPUS=2
+PARALLEL_ARGS="--pipefusion_parallel_degree 1 --ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 1 --data_parallel_degree 1"
+CFG_ARGS="--use_cfg_parallel"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 1 \
+--prompt "brown dog laying on the ground with a metal bowl in front of him." \
+$CFG_ARGS \
+$PARALLLEL_VAE \
+$COMPILE_FLAG \
+$QUANTIZE_FLAG \
+$CACHE_ARGS \
+
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md b/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a9dd1c9e23f18807bc564b6e2efd811f10d0937
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md
@@ -0,0 +1,50 @@
+# HunyuanVideo (xDiT)
+
+## Model Description
+
+HunyuanVideo is Tencent's advanced text-to-video diffusion model capable of generating high-quality videos from text descriptions. It features excellent motion coherence, visual quality, and text understanding capabilities.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify model path in ``run_hunyuan_video_usp_teacache.sh``:
+```bash
+vim run_hunyuan_video_usp_teacache.sh
+# Update: MODEL_ID="/data/nlp/HunyuanVideo/" to your actual path
+```
+
+2. Run script:
+```bash
+bash run_hunyuan_video_usp_teacache.sh
+```
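+
+Both example scripts in this directory accept the same xDiT launch arguments. For a quick low-resolution smoke test on 2 GPUs, the USP example can be launched directly (model path and sizes are illustrative, taken from the reference command inside `hunyuan_video_usp_example.py`):
+
+```bash
+mkdir -p results && torchrun --nproc_per_node=2 hunyuan_video_usp_example.py \
+    --model /data/nlp/HunyuanVideo/ --ulysses_degree 2 \
+    --num_inference_steps 30 --warmup_steps 0 \
+    --prompt "A cat walks on the grass, realistic" \
+    --height 320 --width 512 --num_frames 61 \
+    --enable_tiling --enable_model_cpu_offload
+```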
+
+## References
+
+- [HunyuanVideo](https://github.com/Tencent/HunyuanVideo)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d1c549f223d0d9906ae55129ec3f4413fbd322a
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py
@@ -0,0 +1,333 @@
+# from https://github.com/chengzeyi/ParaAttention/blob/main/examples/run_hunyuan_video.py
+import functools
+from typing import Any, Dict, Union, Optional
+import logging
+import time
+
+import torch
+
+from diffusers import DiffusionPipeline, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.utils import scale_lora_layers, unscale_lora_layers, USE_PEFT_BACKEND
+from diffusers.utils import export_to_video
+from xfuser.model_executor.models.customized.hunyuan_video.tp_applicator import TensorParallelApplicator
+from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank
+from xfuser import xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+ get_world_group,
+ get_data_parallel_world_size,
+ get_data_parallel_rank,
+ get_runtime_state,
+ get_classifier_free_guidance_world_size,
+ get_classifier_free_guidance_rank,
+ get_cfg_group,
+ get_sequence_parallel_world_size,
+ get_sequence_parallel_rank,
+ get_sp_group,
+ is_dp_last_group,
+ initialize_runtime_state,
+ get_pipeline_parallel_world_size,
+)
+
+from xfuser.model_executor.layers.attention_processor import xFuserHunyuanVideoAttnProcessor2_0
+
+assert xFuserHunyuanVideoAttnProcessor2_0 is not None
+
+
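+# Monkey-patch the HunyuanVideo transformer: forward() splits the latent sequence across
+# classifier-free-guidance and sequence-parallel ranks before the transformer blocks and
+# all-gathers the result before unpatchifying, and every attention block is switched to
+# the xFuser USP attention processor.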
+def parallelize_transformer(pipe: DiffusionPipeline):
+ transformer = pipe.transformer
+
+ @functools.wraps(transformer.__class__.forward)
+ def new_forward(
+ self,
+ hidden_states: torch.Tensor,
+ timestep: torch.LongTensor,
+ encoder_hidden_states: torch.Tensor,
+ encoder_attention_mask: torch.Tensor,
+ pooled_projections: torch.Tensor,
+ guidance: torch.Tensor = None,
+ attention_kwargs: Optional[Dict[str, Any]] = None,
+ return_dict: bool = True,
+ ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+ if attention_kwargs is not None:
+ attention_kwargs = attention_kwargs.copy()
+ lora_scale = attention_kwargs.pop("scale", 1.0)
+ else:
+ lora_scale = 1.0
+
+ if USE_PEFT_BACKEND:
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
+ scale_lora_layers(self, lora_scale)
+ else:
+ if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+ logging.warning("Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective.")
+
+ batch_size, num_channels, num_frames, height, width = hidden_states.shape
+ assert batch_size % get_classifier_free_guidance_world_size(
+ ) == 0, f"Cannot split dim 0 of hidden_states ({batch_size}) into {get_classifier_free_guidance_world_size()} parts."
+
+ p, p_t = self.config.patch_size, self.config.patch_size_t
+ post_patch_num_frames = num_frames // p_t
+ post_patch_height = height // p
+ post_patch_width = width // p
+
+ # 1. RoPE
+ image_rotary_emb = self.rope(hidden_states)
+
+ # 2. Conditional embeddings
+ # temb = self.time_text_embed(timestep, guidance, pooled_projections)
+ temb, token_replace_emb = self.time_text_embed(timestep,pooled_projections, guidance)
+ hidden_states = self.x_embedder(hidden_states)
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states,
+ timestep,
+ encoder_attention_mask)
+
+ hidden_states = hidden_states.reshape(batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1)
+ hidden_states = hidden_states.flatten(1, 3)
+
+ hidden_states = torch.chunk(hidden_states,
+ get_classifier_free_guidance_world_size(),
+ dim=0)[get_classifier_free_guidance_rank()]
+ hidden_states = torch.chunk(hidden_states,
+ get_sequence_parallel_world_size(),
+ dim=-2)[get_sequence_parallel_rank()]
+
+ encoder_attention_mask = encoder_attention_mask[0].to(torch.bool)
+ encoder_hidden_states_indices = torch.arange(
+ encoder_hidden_states.shape[1],
+ device=encoder_hidden_states.device)
+ encoder_hidden_states_indices = encoder_hidden_states_indices[
+ encoder_attention_mask]
+ encoder_hidden_states = encoder_hidden_states[
+ ..., encoder_hidden_states_indices, :]
+ if encoder_hidden_states.shape[-2] % get_sequence_parallel_world_size(
+ ) != 0:
+ get_runtime_state().split_text_embed_in_sp = False
+ else:
+ get_runtime_state().split_text_embed_in_sp = True
+
+ encoder_hidden_states = torch.chunk(
+ encoder_hidden_states,
+ get_classifier_free_guidance_world_size(),
+ dim=0)[get_classifier_free_guidance_rank()]
+ if get_runtime_state().split_text_embed_in_sp:
+ encoder_hidden_states = torch.chunk(
+ encoder_hidden_states,
+ get_sequence_parallel_world_size(),
+ dim=-2)[get_sequence_parallel_rank()]
+
+ freqs_cos, freqs_sin = image_rotary_emb
+
+ def get_rotary_emb_chunk(freqs):
+ freqs = torch.chunk(freqs, get_sequence_parallel_world_size(), dim=0)[get_sequence_parallel_rank()]
+ return freqs
+
+ freqs_cos = get_rotary_emb_chunk(freqs_cos)
+ freqs_sin = get_rotary_emb_chunk(freqs_sin)
+ image_rotary_emb = (freqs_cos, freqs_sin)
+
+ # 4. Transformer blocks
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+ def create_custom_forward(module, return_dict=None):
+
+ def custom_forward(*inputs):
+ if return_dict is not None:
+ return module(*inputs, return_dict=return_dict)
+ else:
+ return module(*inputs)
+
+ return custom_forward
+
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False}
+
+ for block in self.transformer_blocks:
+ hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states,
+ encoder_hidden_states,
+ temb,
+ None,
+ image_rotary_emb,
+ **ckpt_kwargs,
+ )
+
+ for block in self.single_transformer_blocks:
+ hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(block),
+ hidden_states,
+ encoder_hidden_states,
+ temb,
+ None,
+ image_rotary_emb,
+ **ckpt_kwargs,
+ )
+
+ else:
+ for block in self.transformer_blocks:
+ hidden_states, encoder_hidden_states = block(
+ hidden_states, encoder_hidden_states, temb, None,
+ image_rotary_emb)
+
+ for block in self.single_transformer_blocks:
+ hidden_states, encoder_hidden_states = block(
+ hidden_states, encoder_hidden_states, temb, None,
+ image_rotary_emb)
+
+ # 5. Output projection
+ hidden_states = self.norm_out(hidden_states, temb)
+ hidden_states = self.proj_out(hidden_states)
+
+ hidden_states = get_sp_group().all_gather(hidden_states, dim=-2)
+ hidden_states = get_cfg_group().all_gather(hidden_states, dim=0)
+
+ hidden_states = hidden_states.reshape(batch_size,
+ post_patch_num_frames,
+ post_patch_height,
+ post_patch_width, -1, p_t, p, p)
+
+ hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
+ hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
+
+ if USE_PEFT_BACKEND:
+ # remove `lora_scale` from each PEFT layer
+ unscale_lora_layers(self, lora_scale)
+
+ if not return_dict:
+ return (hidden_states, )
+
+ return Transformer2DModelOutput(sample=hidden_states)
+
+ new_forward = new_forward.__get__(transformer)
+ transformer.forward = new_forward
+
+ for block in transformer.transformer_blocks + transformer.single_transformer_blocks:
+ block.attn.processor = xFuserHunyuanVideoAttnProcessor2_0()
+
+
+def main():
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
+ args = xFuserArgs.add_cli_args(parser).parse_args()
+ engine_args = xFuserArgs.from_cli_args(args)
+
+ engine_config, input_config = engine_args.create_config()
+ local_rank = get_world_group().local_rank
+
+ assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
+ assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for HunyuanVideo"
+
+ transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ subfolder="transformer",
+ torch_dtype=torch.bfloat16,
+ revision="refs/pr/18",
+ )
+ pipe = HunyuanVideoPipeline.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ transformer=transformer,
+ torch_dtype=torch.float16,
+ revision="refs/pr/18",
+ )
+
+ initialize_runtime_state(pipe, engine_config)
+ get_runtime_state().set_video_input_parameters(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ batch_size=1,
+ num_inference_steps=input_config.num_inference_steps,
+ split_text_embed_in_sp=get_pipeline_parallel_world_size() == 1,
+ )
+
+
+ if args.tensor_parallel_degree > 1:
+ tp_applicator = TensorParallelApplicator(get_tensor_model_parallel_world_size(), get_tensor_model_parallel_rank())
+ tp_applicator.apply_to_model(pipe.transformer)
+ tp_applicator.apply_to_llamamodel(pipe.text_encoder)
+
+ parallelize_transformer(pipe)
+ if args.enable_sequential_cpu_offload:
+ pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} sequential CPU offload enabled")
+ elif args.enable_model_cpu_offload:
+ pipe.enable_model_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} model CPU offload enabled")
+ else:
+ device = torch.device(f"cuda:{local_rank}")
+ pipe = pipe.to(device)
+
+ if args.enable_tiling:
+ pipe.vae.enable_tiling(
+ # Make it runnable on GPUs with 48GB memory
+ # tile_sample_min_height=128,
+ # tile_sample_stride_height=96,
+ # tile_sample_min_width=128,
+ # tile_sample_stride_width=96,
+ # tile_sample_min_num_frames=32,
+ # tile_sample_stride_num_frames=24,
+ )
+
+ if args.enable_slicing:
+ pipe.vae.enable_slicing()
+
+ parameter_peak_memory = torch.cuda.max_memory_allocated(
+ device=f"cuda:{local_rank}")
+
+ if engine_config.runtime_config.use_torch_compile:
+ torch._inductor.config.reorder_for_compute_comm_overlap = True
+ pipe.transformer.compile()
+
+ # one step to warmup the torch compiler
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ prompt=input_config.prompt,
+ num_inference_steps=1,
+ guidance_scale=input_config.guidance_scale,
+ generator=torch.Generator(device="cuda").manual_seed(
+ input_config.seed),
+ ).frames[0]
+
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ prompt=input_config.prompt,
+ num_inference_steps=input_config.num_inference_steps,
+ guidance_scale=input_config.guidance_scale,
+ generator=torch.Generator(device="cuda").manual_seed(
+ input_config.seed),
+ ).frames[0]
+
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+ parallel_info = (
+ f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+ f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+ f"tp{engine_args.tensor_parallel_degree}_"
+ f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+ )
+ if is_dp_last_group():
+ resolution = f"{input_config.width}x{input_config.height}"
+ output_filename = f"results/hunyuan_video_{parallel_info}_{resolution}.mp4"
+ export_to_video(output, output_filename, fps=15)
+ print(f"output saved to {output_filename}")
+
+ if get_world_group().rank == get_world_group().world_size - 1:
+ print(
+ f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9} GB"
+ )
+ get_runtime_state().destroy_distributed_env()
+
+
+# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 320 --width 512 --num_frames 61 --enable_tiling --enable_model_cpu_offload
+# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 544 --width 960 --num_frames 129 --enable_tiling --enable_model_cpu_offload
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffca2c6eb0de7a1cfc9610b1aa3b32b406b6e44b
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py
@@ -0,0 +1,180 @@
+# from https://github.com/chengzeyi/ParaAttention/blob/main/examples/run_hunyuan_video.py
+import functools
+from typing import Any, Dict, Union, Optional
+import logging
+import time
+
+import torch
+
+from diffusers import DiffusionPipeline, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
+from xfuser import xFuserHunyuanVideoPipeline
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.utils import scale_lora_layers, unscale_lora_layers, USE_PEFT_BACKEND
+from diffusers.utils import export_to_video
+from xfuser.model_executor.models.customized.hunyuan_video.tp_applicator import TensorParallelApplicator
+from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank
+from xfuser import xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+ get_world_group,
+ get_data_parallel_world_size,
+ get_data_parallel_rank,
+ get_runtime_state,
+ get_classifier_free_guidance_world_size,
+ get_classifier_free_guidance_rank,
+ get_cfg_group,
+ get_sequence_parallel_world_size,
+ get_sequence_parallel_rank,
+ get_sp_group,
+ is_dp_last_group,
+ initialize_runtime_state,
+ get_pipeline_parallel_world_size,
+)
+
+from xfuser.model_executor.layers.attention_processor import xFuserHunyuanVideoAttnProcessor2_0
+
+assert xFuserHunyuanVideoAttnProcessor2_0 is not None
+from w8a8_linear import apply_quant_linear_i8w8o16
+
+
+def main():
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
+ args = xFuserArgs.add_cli_args(parser).parse_args()
+ engine_args = xFuserArgs.from_cli_args(args)
+
+ engine_config, input_config = engine_args.create_config()
+ local_rank = get_world_group().local_rank
+
+ assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
+ # assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for HunyuanVideo"
+
+ transformer = HunyuanVideoTransformer3DModel.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ subfolder="transformer",
+ torch_dtype=torch.bfloat16,
+ revision="refs/pr/18",
+ )
+    rel_l1_thresh = 0.12
+ if engine_args.use_fbcache:
+ rel_l1_thresh = 0.06
+ cache_args = {
+ "use_teacache": engine_args.use_teacache,
+ "use_fbcache": engine_args.use_fbcache,
+ "rel_l1_thresh": rel_l1_thresh,
+ "return_hidden_states_first": True,
+ "num_steps": input_config.num_inference_steps,
+ }
+ # pipe = HunyuanVideoPipeline.from_pretrained(
+ pipe = xFuserHunyuanVideoPipeline.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ transformer=transformer,
+ torch_dtype=torch.float16,
+ revision="refs/pr/18",
+ engine_config=engine_config,
+ cache_args=cache_args,
+ )
+
+ # initialize_runtime_state(pipe, engine_config)
+ get_runtime_state().set_video_input_parameters(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ batch_size=1,
+ num_inference_steps=input_config.num_inference_steps,
+ split_text_embed_in_sp=get_pipeline_parallel_world_size() == 1,
+ )
+
+
+ if args.tensor_parallel_degree > 1:
+ tp_applicator = TensorParallelApplicator(get_tensor_model_parallel_world_size(), get_tensor_model_parallel_rank())
+ tp_applicator.apply_to_model(pipe.transformer)
+ tp_applicator.apply_to_llamamodel(pipe.text_encoder)
+
+ pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer)
+ pipe.text_encoder=apply_quant_linear_i8w8o16(pipe.text_encoder)
+
+ if args.enable_sequential_cpu_offload:
+ pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} sequential CPU offload enabled")
+ elif args.enable_model_cpu_offload:
+ pipe.enable_model_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} model CPU offload enabled")
+ else:
+ device = torch.device(f"cuda:{local_rank}")
+ pipe = pipe.to(device)
+
+ if args.enable_tiling:
+ pipe.vae.enable_tiling()
+
+ if args.enable_slicing:
+ pipe.vae.enable_slicing()
+
+ parameter_peak_memory = torch.cuda.max_memory_allocated(
+ device=f"cuda:{local_rank}")
+
+ if engine_config.runtime_config.use_torch_compile:
+ torch._inductor.config.reorder_for_compute_comm_overlap = True
+ pipe.transformer = torch.compile(pipe.transformer,
+ mode="max-autotune-no-cudagraphs")
+
+ # one step to warmup the torch compiler
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ prompt=input_config.prompt,
+ num_inference_steps=1,
+ generator=torch.Generator(device="cuda").manual_seed(
+ input_config.seed),
+ ).frames[0]
+ warmup =False
+ if warmup:
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ prompt=input_config.prompt,
+ num_inference_steps=1,
+ generator=torch.Generator(device="cuda").manual_seed(
+ input_config.seed),
+ ).frames[0]
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ prompt=input_config.prompt,
+ num_inference_steps=input_config.num_inference_steps,
+ generator=torch.Generator(device="cuda").manual_seed(
+ input_config.seed),
+ ).frames[0]
+
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+ parallel_info = (
+ f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+ f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+ f"tp{engine_args.tensor_parallel_degree}_"
+ f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+ )
+ if is_dp_last_group():
+ resolution = f"{input_config.width}x{input_config.height}"
+ output_filename = f"results/hunyuan_video_{parallel_info}_{resolution}.mp4"
+ export_to_video(output, output_filename, fps=15)
+ print(f"output saved to {output_filename}")
+
+ if get_world_group().rank == get_world_group().world_size - 1:
+ print(
+ f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9} GB"
+ )
+ get_runtime_state().destroy_distributed_env()
+
+
+# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 320 --width 512 --num_frames 61 --enable_tiling --enable_model_cpu_offload
+# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 544 --width 960 --num_frames 129 --enable_tiling --enable_model_cpu_offload
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt b/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2a81dd7eef2aec1fab733810b1ed8531ac1515a4
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt
@@ -0,0 +1,9 @@
+yunchang
+diffusers
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ba4dd3ed3735cae1084a6130987c3e41d8bd93f7
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -x
+
+export PYTHONPATH=$PWD:$PYTHONPATH
+export NCCL_USE_HIGHPRIORITYWARP=1
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2 #xdit >=0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use SageAttention (xdit ==0.4.4)
+
+SCRIPT="hunyuan_video_usp_example.py"
+MODEL_ID="/data/nlp/HunyuanVideo/"
+INFERENCE_STEP=50
+mkdir -p ./results
+
+TASK_ARGS="--height 720 --width 1280 --num_frames 133 --guidance_scale 5.0"
+
+N_GPUS=8
+PARALLEL_ARGS="--ulysses_degree 4 --ring_degree 2"
+ENABLE_TILING="--enable_tiling"
+ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload"
+COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A cat walks on the grass, realistic" \
+$CFG_ARGS \
+$PARALLLEL_VAE \
+$ENABLE_TILING \
+$ENABLE_MODEL_CPU_OFFLOAD \
+$COMPILE_FLAG
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fab6e8fceeb13793e330cf90b4e9c21b1543df30
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+set -x
+
+export PYTHONPATH=$PWD:$PYTHONPATH
+export NCCL_USE_HIGHPRIORITYWARP=1
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2 #xdit >=0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use SageAttention (xdit ==0.4.4)
+
+SCRIPT="hunyuan_video_usp_example_teacache.py"
+MODEL_ID="/data/nlp/HunyuanVideo/"
+
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+TASK_ARGS="--height 720 --width 1280 --num_frames 129 --seed 24"
+
+# HunyuanVideo parallel configuration
+N_GPUS=8
+PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 8"
+ENABLE_TILING="--enable_tiling"
+ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload"
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A cat walks on the grass, realistic" \
+$CFG_ARGS \
+$PARALLLEL_VAE \
+$ENABLE_TILING \
+$ENABLE_MODEL_CPU_OFFLOAD \
+$COMPILE_FLAG \
+--use_teacache
+
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..d320f03d0eae209c68db65e1f2ea438cb79a708c
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py
@@ -0,0 +1,106 @@
+import torch
+from typing import Optional
+from torch.nn.parameter import Parameter
+from ixformer.inference.functions.w8a8 import w8a8, dynamic_scaled_int8_quant
+
+
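+# Symmetric per-output-channel INT8 quantization: each weight row is scaled by its own
+# max-abs value into [-127, 127]; returns the INT8 weight and the per-channel float32
+# scales needed to rescale the matmul output.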
+def perchannel_quantize_weight_int8(weight: torch.Tensor):
+ weight = weight.cpu().to(torch.float32)
+ n_bit = 8
+ eps = 1e-5
+ max_int = 2**(n_bit - 1) - 1
+ min_int = -(2**(n_bit - 1)-1)
+ max_val = weight.abs().amax(dim=-1, keepdim=True)
+ # max_val = max_val.clamp(min=eps)
+ qscale = max_val / max_int
+ qweight = torch.clamp(torch.round(weight * (1.0 / qscale)), min_int,
+ max_int).to(torch.int8)
+ qscale = qscale.squeeze().to(torch.float32)
+ return qweight, qscale
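+
+
+# Drop-in replacement for torch.nn.Linear: weights are quantized offline to per-channel
+# INT8, activations are quantized dynamically per call, and the matmul runs through
+# ixformer's w8a8 kernel, producing output in the input (or configured) dtype.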
+class DynamicQuantizeLinear(torch.nn.Module):
+ def __init__(self,
+ unquantized: torch.nn.Module,
+ output_dtype: Optional[torch.dtype] = None,
+ ):
+
+ super().__init__()
+ assert isinstance(unquantized, torch.nn.Linear)
+ self.in_features = unquantized.in_features
+ self.out_features = unquantized.out_features
+
+ self.device = unquantized.weight.device
+        self.output_dtype = output_dtype
+
+ qweight, qscale = perchannel_quantize_weight_int8(unquantized.weight)
+ self.weight = Parameter(qweight.to(self.device), requires_grad=False)
+ self.scale = Parameter(qscale.to(self.device), requires_grad=False)
+
+ if unquantized.bias is not None:
+ self.bias = unquantized.bias.to(self.device)
+ else:
+ self.register_parameter("bias", None)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+        device = self.weight.device
+ assert x.device == device
+ output_dtype = x.dtype if self.output_dtype is None else self.output_dtype
+ inputs = torch.empty(x.shape, dtype=torch.int8, device=device)
+ i_scales = torch.empty(x.shape[:-1], dtype=torch.float32, device=device)
+ dynamic_scaled_int8_quant(inputs, x.contiguous(), i_scales)
+
+ output = torch.empty(
+ (inputs.shape[:-1] + (self.weight.shape[0],)),
+ dtype=output_dtype,
+ device=device,
+ )
+
+        out = w8a8(inputs, self.weight, i_scales, self.scale, self.bias, output)
+ # if self.bias is not None:
+ # out =out +self.bias
+ return out
+
+def _is_linear(mod, *args):
+ # return isinstance(mod, torch.nn.Linear) and args[0] in ["to_qkv", "to_added_qkv", "proj"]
+ # if isinstance(mod, torch.nn.Linear):
+ # print(args[0])
+ return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ("attn1" in args[0] or "attn" in args[0] or "ff" in args[0] or "proj_mlp" in args[0] or "proj_out" in args[0])
+
+def _is_linear_flux(mod, *args):
+ # return isinstance(mod, torch.nn.Linear) and args[0] in ["to_qkv", "to_added_qkv", "proj"]
+ # if isinstance(mod, torch.nn.Linear):
+ # print(args[0])
+ return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ( "attn" in args[0] or "ff" in args[0] or "proj_out" in args[0] )
+
+def _is_linear_sd3(mod, *args):
+ return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ("attn" in args[0] or "ff" in args[0] or "proj_out" in args[0])
+
+def _is_linear_hunyuandit(mod, *args):
+ return isinstance(mod, torch.nn.Linear) and "blocks" in args[0]
+
+def _is_wan_linear(mod, *args):
+ return isinstance(mod, torch.nn.Linear) and ("attn1" in args[0] or "attn" in args[0] or "attn2" in args[0] or "ffn" in args[0] or "proj_out" in args[0])
+
+
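+# Walk the module tree and swap matching nn.Linear layers for DynamicQuantizeLinear.
+# The name filter is picked from the model class (Flux, HunyuanDiT, Wan, or the generic
+# transformer filter) so that only attention/FFN/projection linears inside the
+# transformer blocks are quantized.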
+def apply_quant_linear_i8w8o16(model, cls=DynamicQuantizeLinear, filter_fn=None):
+    if filter_fn is None:
+        filter_fn = _is_linear
+    # Select a model-specific filter for known transformer wrappers.
+    if type(model).__name__ in ("FluxTransformer2DModel", "xFuserFluxTransformer2DWrapper"):
+        filter_fn = _is_linear_flux
+    elif type(model).__name__ in ("HunyuanDiT2D", "xFuserHunyuanDiT2DWrapper"):
+        filter_fn = _is_linear_hunyuandit
+    # elif type(model).__name__ in ("SD3Transformer2DModel", "xFuserSD3Transformer2DWrapper"):
+    #     filter_fn = _is_linear_sd3
+    elif type(model).__name__ in ("WanTransformer3DModel", "xFuserWanTransformer3DModelWrapper"):
+        filter_fn = _is_wan_linear
+    for name, m in model.named_modules():
+        if filter_fn(m, name):
+            # Replace the matched Linear on its parent module with the quantized version.
+            parent_module_name, child_name = name.rsplit('.', 1) if '.' in name else ('', name)
+            parent_module = model.get_submodule(parent_module_name)
+            setattr(parent_module, child_name, cls(m))
+    return model
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4dc2cd9c081086e96d65253f691053f126db1a5b
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md
@@ -0,0 +1,50 @@
+# Stable Diffusion 3 Medium (xDiT)
+
+## Model Description
+
+Stable Diffusion 3 Medium is a text-to-image diffusion model from Stability AI, offering significant improvements over earlier Stable Diffusion releases in image quality, prompt adherence, and typography rendering. It uses the Multimodal Diffusion Transformer (MMDiT) architecture, which keeps separate sets of weights for the text and image modalities.
+
+This version runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The inference scripts are already included in this directory. Update the model path:
+```bash
+vim run_sd3.sh
+# Update MODEL_ID to your actual model path
+```
+
+2. Run script:
+```bash
+bash run_sd3.sh
+```
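+
+3. Generated images are written to the `results/` directory created by the script. The file name encodes the parallel configuration and image index (illustrative; exact names depend on your settings):
+```bash
+ls results/
+# e.g. stable_diffusion_3_result_dp1_cfg1_ulysses1_ring2_pp2_patch2_0.png
+```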
+
+## References
+
+- [Stable Diffusion 3](https://github.com/Stability-AI/stable-diffusion)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2969a4385d913c98a2cb13adfa2bb29f3d3f0938
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..98f21c9fd429500abc55db01ac637626bbe7f546
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# set -x
+export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Multiple rings showed no improvement, so this stays disabled.
+# export NCCL_USE_HIGHPRIORITYWARP=1
+
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2              # requires xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1    # use SageAttention (xdit == 0.4.4)
+export ENABLE_IXFORMER_W8A8LINEAR=1
+
+# Select the model type
+SCRIPT=sd3_example.py
+MODEL_ID=/data/nlp/stable-diffusion-3-medium-diffusers
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+# task args
+TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+
+
+N_GPUS=4
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 2 --tensor_parallel_degree 1 --data_parallel_degree 1"
+
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 1 \
+--prompt "brown dog laying on the ground with a metal bowl in front of him." \
+$CFG_ARGS \
+$PARALLLEL_VAE \
+$COMPILE_FLAG \
+$QUANTIZE_FLAG \
+$CACHE_ARGS \
+
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc86bd64ff213a6884a1750f22e0aac67bef071
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py
@@ -0,0 +1,132 @@
+import time
+import os
+import torch
+import torch.distributed
+from transformers import T5EncoderModel
+from xfuser import xFuserStableDiffusion3Pipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+ get_world_group,
+ is_dp_last_group,
+ get_data_parallel_rank,
+ get_runtime_state,
+)
+from xfuser.core.distributed.parallel_state import get_data_parallel_world_size
+
+from apex.normalization.fused_layer_norm import FusedRMSNorm
+import torch
+import torch.nn as nn
+
+
+from ixformer.inference.functions.rms_norm import rms_norm
+
+
+class T5LayerNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ # Compute variance without subtracting mean (RMSNorm)
+ variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+ # Cast back to half precision if needed
+ if self.weight.dtype in [torch.float16, torch.bfloat16]:
+ hidden_states = hidden_states.to(self.weight.dtype)
+
+ return self.weight * hidden_states
+
+
+#https://github.com/huggingface/transformers/issues/20287 fix apex, from apex. normalization import FusedRMSNorm
+def replace_fused_rmsnorm_with_t5(module):
+ for name, child in module.named_children():
+ if isinstance(child, FusedRMSNorm):
+ hidden_size = child.weight.shape[0]
+ eps = getattr(child, "eps", 1e-6)
+ new_ln = T5LayerNorm(hidden_size, eps=eps)
+ new_ln.weight.data = child.weight.data.clone()
+ setattr(module, name, new_ln)
+ else:
+ replace_fused_rmsnorm_with_t5(child)
+
+def main():
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
+ args = xFuserArgs.add_cli_args(parser).parse_args()
+ engine_args = xFuserArgs.from_cli_args(args)
+ engine_config, input_config = engine_args.create_config()
+ local_rank = get_world_group().local_rank
+ torch.cuda.set_device(local_rank)
+ text_encoder_3 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_3", torch_dtype=torch.float16)
+ if args.use_fp8_t5_encoder:
+ from optimum.quanto import freeze, qfloat8, quantize
+ print(f"rank {local_rank} quantizing text encoder 2")
+ quantize(text_encoder_3, weights=qfloat8)
+ freeze(text_encoder_3)
+
+ pipe = xFuserStableDiffusion3Pipeline.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ engine_config=engine_config,
+ torch_dtype=torch.float16,
+ text_encoder_3=text_encoder_3,
+ ).to(f"cuda:{local_rank}")
+
+ replace_fused_rmsnorm_with_t5(text_encoder_3)
+ parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+ import os
+ if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1":
+ from w8a8_linear import apply_quant_linear_i8w8o16
+ pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer)
+
+ pipe.prepare_run(input_config)
+
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ prompt=input_config.prompt,
+ num_inference_steps=input_config.num_inference_steps,
+ output_type=input_config.output_type,
+ guidance_scale=input_config.guidance_scale,
+ generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+ )
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+ parallel_info = (
+ f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+ f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+ f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+ )
+ if input_config.output_type == "pil":
+ dp_group_index = get_data_parallel_rank()
+ num_dp_groups = get_data_parallel_world_size()
+ dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
+ if pipe.is_dp_last_group():
+ if not os.path.exists("results"):
+ os.mkdir("results")
+ for i, image in enumerate(output.images):
+ image_rank = dp_group_index * dp_batch_size + i
+ image.save(
+ f"./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
+ )
+ print(
+ f"image {i} saved to ./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
+ )
+
+ if get_world_group().rank == get_world_group().world_size - 1:
+ print(
+ f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
+ )
+
+ get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..01a4357fba4260239c00a819d7408237777602d2
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md
@@ -0,0 +1,61 @@
+# Wan2.1-T2V-14B-Diffusers (xDiT)
+
+## Model Description
+
+Wan2.1-T2V-14B is Wan AI's large-scale text-to-video diffusion model with 14B parameters. It generates high-quality, cinematic videos from text prompts with excellent motion dynamics and visual fidelity.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The inference scripts are already included in this directory. Update the model path:
+```bash
+vim run_wan_2.1_t2v_14b.sh
+# Update MODEL_ID to your actual model path
+# Modify TASK_ARGS if needed
+```
+
+2. Run script:
+```bash
+bash run_wan_2.1_t2v_14b.sh
+```
+
+3. The script supports batch size 1 (default) or 2. Pass prompts in the matching format:
+```bash
+# BS1 (default) prompt format
+--prompt "一个虎虎生威的老虎" \
+--negative_prompt "畸形,光照不好" \
+# BS2 prompt format
+--prompt "一个虎虎生威的老虎" "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "畸形,光照不好" "畸形,光照不好" \
+```
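+
+4. Generated videos are saved under `results/`, named after the batch index, parallel configuration, and resolution (illustrative):
+```bash
+ls results/
+# e.g. wan2.1_t2v_14b_0_dp1_cfg2_ulysses1_ring1_tp2_pp1_patch1_832x480.mp4
+```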
+
+## References
+
+- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fa5b1c5f7e309593a6e11bde46979d4c4255b4b
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers==4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..133c5fd10a7c2520a5bf27710d3b5e5850616537
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -x
+export WORD_RANK_SUPPORT_TP=1
+export ATTN_OPT_LEVEL=2              # requires xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1    # use SageAttention (xdit == 0.4.4)
+export TOKENIZERS_PARALLELISM=true
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Wan2.1 configuration
+SCRIPT="wan2.1_t2v_example.py"
+MODEL_ID="/data/nlp/Wan2.1-T2V-14B-Diffusers/"
+INFERENCE_STEP=20
+
+mkdir -p ./results
+
+# Wan2.1-specific task args
+TASK_ARGS="--height 480 --width 832 --num_frames 33 --seed 33 "
+
+# Wan2.1 parallel configuration
+N_GPUS=4
+PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 2"
+CFG_ARGS="--use_cfg_parallel"
+
+# Uncomment and modify these as needed
+# PIPEFUSION_ARGS="--num_pipeline_patch 8"
+# OUTPUT_ARGS="--output_type latent"
+# PARALLLEL_VAE="--use_parallel_vae"
+# ENABLE_TILING="--enable_tiling"
+# MODEL_OFFLOAD="--enable_model_cpu_offload"
+ENABLE_CACHE="--use_teacache"
+COMPILE_FLAG="--use_torch_compile"
+#ENABLE_W8A8="--use_w8a8_linear"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$ENABLE_W8A8 \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A rainy night in a dense cyberpunk market, neon kanji signs flicker overhead. The camera starts shoulder-height behind a hooded courier, steadily tracking forward as he weaves through crowds of holographic umbrellas. Volumetric pink-blue backlight cuts through steam vents, puddles mirror the glow. Lens flare, shallow depth of field. Moody, Blade-Runner vibe." \
+--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
+$ENABLE_TILING \
+$ENABLE_CACHE \
+$COMPILE_FLAG \
+$CFG_ARGS
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..73da9b094935591b99a81d05dca9d17809f0ec97
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py
@@ -0,0 +1,131 @@
+import logging
+import time
+import torch
+import torch.distributed
+from xfuser import xFuserWanPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_data_parallel_rank,
+    get_data_parallel_world_size,
+    get_runtime_state,
+    is_dp_last_group,
+)
+
+from xfuser.model_executor.cache.teacache.backend import TeaCacheBackend
+from xfuser.model_executor.cache.data import DiffusionCacheConfig
+from diffusers.utils import export_to_video
+
+
+def main():
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
+ args = xFuserArgs.add_cli_args(parser).parse_args()
+ engine_args = xFuserArgs.from_cli_args(args)
+
+ engine_config, input_config = engine_args.create_config()
+ local_rank = get_world_group().local_rank
+
+ assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
+ # assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for CogVideo"
+ # assert not (engine_args.tensor_parallel_degree > 1 and engine_args.ulysses_degree > 1), "This script cannot support tensor_parallel_degree and ulysses_degree at the same time."
+
+ pipe = xFuserWanPipeline.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ engine_config=engine_config,
+ torch_dtype=torch.bfloat16,
+ )
+
+ # https://github.com/ali-vilab/TeaCache/blob/main/TeaCache4Wan2.1/teacache_generate.py#L892
+ if engine_args.use_teacache:
+        config = DiffusionCacheConfig(
+            rel_l1_thresh=0.2,
+            coefficients=[-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404],
+        )
+        backend = TeaCacheBackend(config)
+        backend.enable(pipe, transformer_key="transformer")
+        backend.refresh(pipe, input_config.num_inference_steps, transformer_key="transformer")
+
+ if args.enable_sequential_cpu_offload:
+ pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} sequential CPU offload enabled")
+ elif args.enable_model_cpu_offload:
+ pipe.enable_model_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} model CPU offload enabled")
+ else:
+ device = torch.device(f"cuda:{local_rank}")
+ pipe = pipe.to(device)
+
+ if args.enable_tiling:
+ pipe.vae.enable_tiling()
+
+ if args.enable_slicing:
+ pipe.vae.enable_slicing()
+
+ if args.use_easycache:
+        cache_kwargs = {
+            "use_easycache": True,
+            "cache_thresh": 0.02,  # EasyCache threshold
+        }
+ else:
+ cache_kwargs = None
+
+
+ if engine_args.use_w8a8_linear:
+ from w8a8_linear import apply_quant_linear_i8w8o16
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+
+ # warmup
+ # output = pipe(
+ # height=input_config.height,
+ # width=input_config.width,
+ # num_frames=input_config.num_frames,
+ # prompt=input_config.prompt,
+ # num_inference_steps=1,
+ # generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+ # ).frames
+
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+
+    # Example BS=2 prompts (unused here; prompts come from --prompt/--negative_prompt):
+    # prompt = ["一个虎虎生威的老虎", "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage"]
+    # negative_prompt = ["畸形,光照不好", "畸形,光照不好"]
+
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ prompt=input_config.prompt,
+ negative_prompt = input_config.negative_prompt,
+ num_inference_steps=input_config.num_inference_steps,
+ guidance_scale=input_config.guidance_scale,
+ generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+ cache_kwargs = cache_kwargs
+ )
+
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ peak_memory = torch.cuda.max_memory_reserved(device=f"cuda:{local_rank}")
+
+ parallel_info = (
+ f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+ f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+ f"tp{engine_args.tensor_parallel_degree}_"
+ f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+ )
+ # if is_dp_last_group():
+ resolution = f"{input_config.width}x{input_config.height}"
+ for i, frames in enumerate(output.frames):
+ output_filename = f"results/wan2.1_t2v_14b_{i}_{parallel_info}_{resolution}.mp4"
+ export_to_video(frames, output_filename, fps=16)
+ print(f"output saved to {output_filename}")
+
+ if get_world_group().rank == get_world_group().world_size - 1:
+ print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
+ # get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad0ac17341e34ab57603a858950e5a6f3547b2dd
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md
@@ -0,0 +1,51 @@
+# Wan2.2-TI2V-5B-Diffusers (xDiT)
+
+## Model Description
+
+Wan2.2-TI2V-5B is Wan AI's 5B-parameter text/image-to-video (TI2V) diffusion model. It generates smooth, high-quality videos from text prompts or reference images while maintaining visual consistency and natural motion.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The inference scripts are already included in this directory. Update the model path:
+```bash
+vim run_wan_2.2_t2v_5b.sh
+# Update MODEL_ID to your actual model path
+# Modify TASK_ARGS if needed
+```
+
+2. Run script:
+```bash
+bash run_wan_2.2_t2v_5b.sh
+```
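+
+3. Generated videos are saved under `results/`, named after the batch index, parallel configuration, and resolution (illustrative):
+```bash
+ls results/
+# e.g. wan2.2_t2v_0_dp1_cfg2_ulysses2_ring1_tp1_pp1_patch1_1280x704.mp4
+```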
+
+## References
+
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fa5b1c5f7e309593a6e11bde46979d4c4255b4b
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers==4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9bea267c7861522207d604e3c115da59b6da9ff4
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -x
+#export ATTN_OPT_LEVEL=2 #xdit >=0.4.5
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Wan2.2 configuration
+SCRIPT="wan2.2_t2v_example.py"
+MODEL_ID="/data/nlp/Wan2.2-TI2V-5B-Diffusers/"
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+# Wan2.2-specific task args
+TASK_ARGS="--height 704 --width 1280 --num_frames 33 --seed 32 "
+
+# Wan2.2 parallel configuration
+N_GPUS=4
+PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1 --tensor_parallel_degree 1"
+CFG_ARGS="--use_cfg_parallel"
+
+# Uncomment and modify these as needed
+# PIPEFUSION_ARGS="--num_pipeline_patch 8"
+# OUTPUT_ARGS="--output_type latent"
+# PARALLLEL_VAE="--use_parallel_vae"
+# ENABLE_TILING="--enable_tiling"
+# MODEL_OFFLOAD="--enable_model_cpu_offload"
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
+$ENABLE_TILING \
+$ENABLE_CACHE \
+$COMPILE_FLAG \
+$CFG_ARGS
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..346b9e937cba7a0c4add5b860732fab7225e2818
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py
@@ -0,0 +1,131 @@
+import logging
+import time
+import torch
+import torch.distributed
+from xfuser import xFuserWanPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_data_parallel_rank,
+    get_data_parallel_world_size,
+    get_runtime_state,
+    is_dp_last_group,
+)
+
+from diffusers.utils import export_to_video
+from xfuser.model_executor.cache.teacache.backend import TeaCacheBackend
+from xfuser.model_executor.cache.data import DiffusionCacheConfig
+
+
+def main():
+ parser = FlexibleArgumentParser(description="xFuser Arguments")
+ args = xFuserArgs.add_cli_args(parser).parse_args()
+ engine_args = xFuserArgs.from_cli_args(args)
+
+ engine_config, input_config = engine_args.create_config()
+ local_rank = get_world_group().local_rank
+
+ assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
+
+ pipe = xFuserWanPipeline.from_pretrained(
+ pretrained_model_name_or_path=engine_config.model_config.model,
+ engine_config=engine_config,
+ torch_dtype=torch.bfloat16,
+ )
+
+ if engine_args.use_teacache:
+        config = DiffusionCacheConfig(
+            rel_l1_thresh=0.2,
+            coefficients=[
+                6.85271205e+04,
+                -9.88214072e+03,
+                5.08858742e+02,
+                -7.39731467e+00,
+                1.22746295e-01,
+            ],
+        )
+        backend = TeaCacheBackend(config)
+        backend.enable(pipe, transformer_key="transformer_2")
+        backend.refresh(pipe, input_config.num_inference_steps, transformer_key="transformer_2")
+
+
+ if args.enable_sequential_cpu_offload:
+ pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} sequential CPU offload enabled")
+ elif args.enable_model_cpu_offload:
+ pipe.enable_model_cpu_offload(gpu_id=local_rank)
+ logging.info(f"rank {local_rank} model CPU offload enabled")
+ else:
+ device = torch.device(f"cuda:{local_rank}")
+ pipe = pipe.to(device)
+
+ if args.enable_tiling:
+ pipe.vae.enable_tiling()
+
+ if args.enable_slicing:
+ pipe.vae.enable_slicing()
+
+ if engine_args.use_w8a8_linear:
+ from w8a8_linear import apply_quant_linear_i8w8o16
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+
+ # warmup
+ # output = pipe(
+ # height=input_config.height,
+ # width=input_config.width,
+ # num_frames=input_config.num_frames,
+ # prompt=input_config.prompt,
+ # num_inference_steps=1,
+ # generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+ # ).frames
+
+ torch.cuda.reset_peak_memory_stats()
+ start_time = time.time()
+
+
+ if args.use_easycache:
+        cache_kwargs = {
+            "use_easycache": True,
+            "cache_thresh": 0.02,  # EasyCache threshold
+            # "ret_steps": 10,
+        }
+ else:
+ cache_kwargs = None
+
+ output = pipe(
+ height=input_config.height,
+ width=input_config.width,
+ num_frames=input_config.num_frames,
+ prompt=input_config.prompt,
+ negative_prompt = input_config.negative_prompt,
+ num_inference_steps=input_config.num_inference_steps,
+ guidance_scale=input_config.guidance_scale,
+ generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+ cache_kwargs = cache_kwargs
+ )
+
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ peak_memory = torch.cuda.max_memory_reserved(device=f"cuda:{local_rank}")
+
+ parallel_info = (
+ f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+ f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+ f"tp{engine_args.tensor_parallel_degree}_"
+ f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+ )
+ # if is_dp_last_group():
+ resolution = f"{input_config.width}x{input_config.height}"
+ for i, frames in enumerate(output.frames):
+ output_filename = f"results/wan2.2_t2v_{i}_{parallel_info}_{resolution}.mp4"
+ export_to_video(frames, output_filename, fps=16)
+ print(f"output saved to {output_filename}")
+
+ if get_world_group().rank == get_world_group().world_size - 1:
+ print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9} GB")
+ get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
index bf11ca80b8e0699868b7a002d488cfb1b50f5938..be0e86e64ef652cd7c502375791ce5181fe4c286 100644
--- a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
@@ -9,6 +9,7 @@ Qwen2.5-VL is not only proficient in recognizing common objects such as flowers,
| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
| :----: | :----: | :----: |
| MR-V100 | 4.3.0 | 25.09 |
+| MR-V100 | 4.4.0 | 26.03 |
## Model Preparation
@@ -32,6 +33,49 @@ export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
python3 offline_inference_vision_language.py --model /path/to/Qwen2.5-VL-3B-Instruct/ -tp 4 --trust-remote-code --temperature 0.0 --max-token 256
```
+### Qwen2.5-VL-32B-Instruct (W8A8/W4A16)
+
+#### Performance Test
+
+1. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+2. Start server:
+```bash
+vllm serve /path/to/model --max-num-seqs 1 --max-model-len 98304 --limit_mm_per_prompt '{"image": 5}' --disable-cascade-attn --tensor-parallel-size 4 --gpu_memory_utilization 0.9 --pipeline-parallel-size 1 --host 0.0.0.0 --port 8000 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+3. Run client:
+```bash
+# Use the pre-copied guidellm
+cd guidellm && pip install .
+pip install beautifulsoup4
+cd ..
+guidellm --data "prompt_tokens=512,generated_tokens=512,images=1,width=1770,height=1180" --data-type emulated --model /path/to/model --target "http://localhost:8000/v1" --max-requests 1
+```
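+
+Optionally, sanity-check the server with a plain OpenAI-compatible request before running the benchmark (illustrative; the `model` field must match the path passed to `vllm serve`):
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "/path/to/model", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 32}'
+```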
+
+### Qwen2.5-VL-72B-Instruct (W4A16)
+
+#### Performance Test
+
+1. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+2. Start server:
+```bash
+vllm serve /path/to/model --max-num-seqs 1 --max-model-len 98304 --limit_mm_per_prompt '{"image": 5}' --disable-cascade-attn --tensor-parallel-size 8 --gpu_memory_utilization 0.9 --pipeline-parallel-size 1 --host 0.0.0.0 --port 8000 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+3. Run client:
+```bash
+# Same as 32B version
+guidellm --data "prompt_tokens=512,generated_tokens=512,images=1,width=1770,height=1180" --data-type emulated --model /path/to/model --target "http://localhost:8000/v1" --max-requests 1
+```
+
## Model Results
### Benchmarking vLLM
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..6ab2c6e9940c580355ebf34c530ffa4fb6b5ce83
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml
@@ -0,0 +1,212 @@
+[build-system]
+requires = ["setuptools >= 61.0", "wheel", "build"]
+build-backend = "setuptools.build_meta"
+
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["*"]
+
+[tool.setuptools.package-data]
+guidellm = ["*"]
+
+
+# ************************************************
+# ********** Project Metadata **********
+# ************************************************
+
+[project]
+name = "guidellm"
+version = "0.1.0"
+description = "Guidance platform for deploying and managing large language models."
+readme = { file = "README.md", content-type = "text/markdown" }
+requires-python = ">=3.8.0,<4.0"
+license = { file = "LICENSE" }
+authors = [ { name = "Neuralmagic, Inc." } ]
+urls = { homepage = "https://github.com/neuralmagic/guidellm" }
+dependencies = [
+ "click",
+ "datasets",
+ "ftfy>=6.0.0",
+ "loguru",
+ "numpy",
+ "openai",
+ "pydantic>=2.0.0",
+ "pydantic-settings>=2.0.0",
+ "pyyaml>=6.0.0",
+ "requests",
+ "rich",
+ "transformers",
+]
+
+[project.optional-dependencies]
+dev = [
+ # general and configurations
+ "pre-commit~=3.5.0",
+ "scipy~=1.10",
+ "sphinx~=7.1.2",
+ "tox~=4.16.0",
+
+ # testing
+ "pytest~=8.2.2",
+ "pytest-asyncio~=0.23.8",
+ "pytest-cov~=5.0.0",
+ "pytest-mock~=3.14.0",
+ "pytest-rerunfailures~=14.0",
+ "requests-mock~=1.12.1",
+
+ # code quality
+ "mypy~=1.10.1",
+ "ruff~=0.5.2",
+
+ # docs quality
+ "mdformat~=0.7.17",
+ "mdformat-footnote~=0.1.1",
+ "mdformat-frontmatter~=2.0.8",
+ "mdformat-gfm~=0.3.6",
+
+ # type-checking
+ "types-click~=7.1.8",
+ "types-PyYAML~=6.0.1",
+ "types-requests~=2.32.0",
+ "types-toml",
+]
+
+
+[project.entry-points.console_scripts]
+guidellm = "guidellm.main:generate_benchmark_report_cli"
+guidellm-config = "guidellm.config:print_config"
+
+
+# ************************************************
+# ********** Code Quality Tools **********
+# ************************************************
+
+[tool.black]
+line-length = 88
+target-version = ['py38']
+
+
+[tool.isort]
+profile = "black"
+
+
+[tool.mypy]
+files = ["src/guidellm", "tests"]
+python_version = '3.8'
+warn_redundant_casts = true
+warn_unused_ignores = false
+show_error_codes = true
+namespace_packages = true
+exclude = ["venv", ".tox"]
+
+# Silence "type import errors" as our 3rd-party libs do not have types
+# Check: https://mypy.readthedocs.io/en/latest/config_file.html#import-discovery
+follow_imports = 'silent'
+
+[[tool.mypy.overrides]]
+module = ["datasets.*"]
+ignore_missing_imports=true
+
+
+[tool.ruff]
+line-length = 88
+indent-width = 4
+exclude = ["build", "dist", "env", ".venv"]
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+
+[tool.ruff.lint]
+ignore = [
+ "PLR0913",
+ "TCH001",
+ "COM812",
+ "ISC001",
+ "TCH002",
+ "PLW1514", # allow Path.open without encoding
+ "RET505", # allow `else` blocks
+ "RET506" # allow `else` blocks
+
+]
+select = [
+ # Rules reference: https://docs.astral.sh/ruff/rules/
+
+ # Code Style / Formatting
+ "E", # pycodestyle: checks adherence to PEP 8 conventions including spacing, indentation, and line length
+ "W", # pycodestyle: checks adherence to PEP 8 conventions including spacing, indentation, and line length
+ "A", # flake8-builtins: prevents shadowing of Python built-in names
+ "C", # Convention: ensures code adheres to specific style and formatting conventions
+ "COM", # flake8-commas: enforces the correct use of trailing commas
+ "ERA", # eradicate: detects commented-out code that should be removed
+ "I", # isort: ensures imports are sorted in a consistent manner
+ "ICN", # flake8-import-conventions: enforces import conventions for better readability
+ "N", # pep8-naming: enforces PEP 8 naming conventions for classes, functions, and variables
+ "NPY", # NumPy: enforces best practices for using the NumPy library
+ "PD", # pandas-vet: enforces best practices for using the pandas library
+ "PT", # flake8-pytest-style: enforces best practices and style conventions for pytest tests
+ "PTH", # flake8-use-pathlib: encourages the use of pathlib over os.path for file system operations
+ "Q", # flake8-quotes: enforces consistent use of single or double quotes
+ "TCH", # flake8-type-checking: enforces type checking practices and standards
+ "TID", # flake8-tidy-imports: enforces tidy and well-organized imports
+ "RUF022", # flake8-ruff: enforce sorting of __all__ in modules
+
+ # Code Structure / Complexity
+ "C4", # flake8-comprehensions: improves readability and performance of list, set, and dict comprehensions
+ "C90", # mccabe: checks for overly complex code using cyclomatic complexity
+ "ISC", # flake8-implicit-str-concat: prevents implicit string concatenation
+ "PIE", # flake8-pie: identifies and corrects common code inefficiencies and mistakes
+ "R", # Refactor: suggests improvements to code structure and readability
+ "SIM", # flake8-simplify: simplifies complex expressions and improves code readability
+
+ # Code Security / Bug Prevention
+ "ARG", # flake8-unused-arguments: detects unused function and method arguments
+ "ASYNC", # flake8-async: identifies incorrect or inefficient usage patterns in asynchronous code
+ "B", # flake8-bugbear: detects common programming mistakes and potential bugs
+ "BLE", # flake8-blind-except: prevents blind exceptions that catch all exceptions without handling
+ "E", # Error: detects and reports errors in the code
+ "F", # Pyflakes: detects unused imports, shadowed imports, undefined variables, and various formatting errors in string operations
+ "INP", # flake8-no-pep420: prevents implicit namespace packages by requiring __init__.py
+ "PGH", # pygrep-hooks: detects deprecated and dangerous code patterns
+ "PL", # Pylint: comprehensive source code analyzer for enforcing coding standards and detecting errors
+ "RSE", # flake8-raise: ensures exceptions are raised correctly
+ "S", # flake8-bandit: detects security issues and vulnerabilities in the code
+ "SLF", # flake8-self: prevents incorrect usage of the self argument in class methods
+ "T10", # flake8-debugger: detects the presence of debugging tools such as pdb
+ "T20", # flake8-print: detects print statements left in the code
+ "UP", # pyupgrade: automatically upgrades syntax for newer versions of Python
+ "W", # Warning: provides warnings about potential issues in the code
+ "YTT", # flake8-2020: identifies code that will break with future Python releases
+
+ # Code Documentation
+ "FIX", # flake8-fixme: detects FIXMEs and other temporary comments that should be resolved
+]
+
+[tool.ruff.lint.extend-per-file-ignores]
+"tests/**/*.py" = [
+ "S101", # asserts allowed in tests
+ "ARG", # Unused function args allowed in tests
+ "PLR2004", # Magic value used in comparison
+ "TCH002", # No import only type checking in tests
+ "SLF001", # enable private member access in tests
+ "S105", # allow hardcoded passwords in tests
+ "S311", # allow standard pseudo-random generators in tests
+ "PT011", # allow generic exceptions in tests
+ "N806", # allow uppercase variable names in tests
+ "PGH003", # allow general ignores in tests
+ "S106", # allow hardcoded passwords in tests
+    "PLR0915", # allow complex statements in tests
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["guidellm", "tests"]
+
+
+[tool.pytest.ini_options]
+addopts = '-s -vvv --cache-clear'
+markers = [
+ "smoke: quick tests to check basic functionality",
+ "sanity: detailed tests to ensure major functions work correctly",
+ "regression: tests to ensure that new changes do not break existing functionality"
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b10b4455ae29b9476829955b81bdfef07f515b25
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py
@@ -0,0 +1,20 @@
+"""
+Guidellm is a package that provides an easy and intuitive interface for
+evaluating and benchmarking large language models (LLMs).
+"""
+
+# flake8: noqa
+
+import os
+
+import transformers # type: ignore
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false" # Silence warnings for tokenizers
+transformers.logging.set_verbosity_error() # Silence warnings for transformers
+
+
+from .config import settings
+from .logger import configure_logger, logger
+from .main import generate_benchmark_report
+
+__all__ = ["configure_logger", "logger", "settings", "generate_benchmark_report"]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..13910180a77e3958a18da428932bce45aeff538a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py
@@ -0,0 +1,12 @@
+from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse
+from .openai import OpenAIBackend
+from .aiohttp import AiohttpBackend
+
+__all__ = [
+ "Backend",
+ "BackendEngine",
+ "BackendEnginePublic",
+ "GenerativeResponse",
+ "OpenAIBackend",
+ "AiohttpBackend"
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbbd97158fab0f547a812534eb06152e83366328
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py
@@ -0,0 +1,180 @@
+import base64
+import io
+from typing import AsyncGenerator, Dict, List, Optional
+from loguru import logger
+
+import aiohttp
+import json
+
+from guidellm.backend.base import Backend, GenerativeResponse
+from guidellm.config import settings
+from guidellm.core import TextGenerationRequest
+
+__all__ = ["AiohttpBackend"]
+
+@Backend.register("aiohttp_server")
+class AiohttpBackend(Backend):
+ """
+ An aiohttp-based backend implementation for LLM requests.
+
+ This class provides an interface to communicate with a server hosting
+ an LLM API using aiohttp for asynchronous requests.
+ """
+
+ def __init__(
+ self,
+ openai_api_key: Optional[str] = None,
+ target: Optional[str] = None,
+ model: Optional[str] = None,
+ timeout: Optional[float] = None,
+ **request_args,
+ ):
+ self._request_args: Dict = request_args
+ self._api_key: str = openai_api_key or settings.aiohttp.api_key
+
+ if not self._api_key:
+ err = ValueError(
+ "`GUIDELLM__AIOHTTP__API_KEY` environment variable or "
+ "--openai-api-key CLI parameter must be specified for the "
+ "aiohttp backend."
+ )
+ logger.error("{}", err)
+ raise err
+
+ base_url = target or settings.aiohttp.base_url
+ self._api_url = f"{base_url}/chat/completions"
+
+ if not base_url:
+ err = ValueError(
+ "`GUIDELLM__AIOHTTP__BASE_URL` environment variable or "
+ "target parameter must be specified for the OpenAI backend."
+ )
+ logger.error("{}", err)
+ raise err
+
+ self._timeout = aiohttp.ClientTimeout(total=timeout or settings.request_timeout)
+ self._model = model
+
+ super().__init__(type_="aiohttp_backend", target=base_url, model=self._model)
+ logger.info("aiohttp {} Backend listening on {}", self._model, base_url)
+
+ async def make_request(
+ self,
+ request: TextGenerationRequest,
+ ) -> AsyncGenerator[GenerativeResponse, None]:
+ """
+ Make a request to the aiohttp backend.
+
+ Sends a prompt to the LLM server and streams the response tokens.
+
+ :param request: The text generation request to submit.
+ :type request: TextGenerationRequest
+ :yield: A stream of GenerativeResponse objects.
+ :rtype: AsyncGenerator[GenerativeResponse, None]
+ """
+
+ async with aiohttp.ClientSession(timeout=self._timeout) as session:
+ logger.debug("Making request to aiohttp backend with prompt: {}", request.prompt)
+
+ request_args = {}
+ if request.output_token_count is not None:
+ request_args.update(
+ {
+ "max_completion_tokens": request.output_token_count,
+ "stop": None,
+ "ignore_eos": True,
+ }
+ )
+ elif settings.aiohttp.max_gen_tokens and settings.aiohttp.max_gen_tokens > 0:
+ request_args.update(
+ {
+ "max_tokens": settings.aiohttp.max_gen_tokens,
+ }
+ )
+
+ request_args.update(self._request_args)
+
+ messages = self._build_messages(request)
+
+ payload = {
+ "model": self._model,
+ "messages": messages,
+ "stream": True,
+ **request_args,
+ }
+
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self._api_key}",
+ }
+
+ try:
+ async with session.post(url=self._api_url, json=payload, headers=headers) as response:
+ if response.status != 200:
+ error_message = await response.text()
+ logger.error("Request failed: {} - {}", response.status, error_message)
+ raise Exception(f"Failed to generate response: {error_message}")
+
+ token_count = 0
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
+ continue
+
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+ if chunk == "[DONE]":
+ # Final response
+ yield GenerativeResponse(
+ type_="final",
+ prompt=request.prompt,
+ output_token_count=token_count,
+ prompt_token_count=request.prompt_token_count,
+ )
+ else:
+ # Intermediate token response
+ token_count += 1
+ data = json.loads(chunk)
+ delta = data["choices"][0]["delta"]
+ token = delta["content"]
+ yield GenerativeResponse(
+ type_="token_iter",
+ add_token=token,
+ prompt=request.prompt,
+ output_token_count=token_count,
+ prompt_token_count=request.prompt_token_count,
+ )
+ except Exception as e:
+ logger.error("Error while making request: {}", e)
+ raise
+
+ def available_models(self) -> List[str]:
+ """
+ Retrieve a list of available models from the server.
+ """
+ # This could include an API call to `self._api_url/models` if the server supports it.
+ logger.warning("Fetching available models is not implemented for aiohttp backend.")
+ return []
+
+ def validate_connection(self):
+ """
+ Validate the connection to the backend server.
+ """
+ logger.info("Connection validation is not explicitly implemented for aiohttp backend.")
+
+ def _build_messages(self, request: TextGenerationRequest) -> Dict:
+ if request.number_images == 0:
+ messages = [{"role": "user", "content": request.prompt}]
+ else:
+ content = []
+ for image in request.images:
+ stream = io.BytesIO()
+ im_format = image.image.format or "PNG"
+ image.image.save(stream, format=im_format)
+ im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8")
+ image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"}
+ content.append({"type": "image_url", "image_url": image_url})
+
+ content.append({"type": "text", "text": request.prompt})
+ messages = [{"role": "user", "content": content}]
+
+ return messages
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a165859454ac462a8dfedade0240a7e118acf50a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py
@@ -0,0 +1,320 @@
+import asyncio
+import functools
+from abc import ABC, abstractmethod
+from typing import AsyncGenerator, Dict, List, Literal, Optional, Type, Union
+
+from loguru import logger
+from pydantic import BaseModel
+from transformers import ( # type: ignore # noqa: PGH003
+ AutoTokenizer,
+ PreTrainedTokenizer,
+)
+
+from guidellm.core import TextGenerationRequest, TextGenerationResult
+
+__all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"]
+
+
+BackendEnginePublic = Literal["openai_server", "aiohttp_server"]
+BackendEngine = Union[BackendEnginePublic, Literal["test"]]
+
+
+class GenerativeResponse(BaseModel):
+ """
+ A model representing a response from a generative AI backend.
+
+ :param type_: The type of response, either 'token_iter' for intermediate
+ token output or 'final' for the final result.
+ :type type_: Literal["token_iter", "final"]
+ :param add_token: The token to add to the output
+ (only applicable if type_ is 'token_iter').
+ :type add_token: Optional[str]
+ :param prompt: The original prompt sent to the backend.
+ :type prompt: Optional[str]
+ :param output: The final generated output (only applicable if type_ is 'final').
+ :type output: Optional[str]
+ :param prompt_token_count: The number of tokens in the prompt.
+ :type prompt_token_count: Optional[int]
+ :param output_token_count: The number of tokens in the output.
+ :type output_token_count: Optional[int]
+ """
+
+ type_: Literal["token_iter", "final"]
+ add_token: Optional[str] = None
+ prompt: Optional[str] = None
+ output: Optional[str] = None
+ prompt_token_count: Optional[int] = None
+ output_token_count: Optional[int] = None
+
+
+class Backend(ABC):
+ """
+ Abstract base class for generative AI backends.
+
+ This class provides a common interface for creating and interacting with different
+ generative AI backends. Subclasses should implement the abstract methods to
+ define specific backend behavior.
+
+ :cvar _registry: A dictionary that maps BackendEngine types to backend classes.
+ :type _registry: Dict[BackendEngine, Type[Backend]]
+ :param type_: The type of the backend.
+ :type type_: BackendEngine
+ :param target: The target URL for the backend.
+ :type target: str
+ :param model: The model used by the backend.
+ :type model: str
+ """
+
+ _registry: Dict[BackendEngine, "Type[Backend]"] = {}
+
+ @classmethod
+ def register(cls, backend_type: BackendEngine):
+ """
+ A decorator to register a backend class in the backend registry.
+
+ :param backend_type: The type of backend to register.
+ :type backend_type: BackendEngine
+ :return: The decorated backend class.
+ :rtype: Type[Backend]
+ """
+
+ def inner_wrapper(wrapped_class: Type["Backend"]):
+ cls._registry[backend_type] = wrapped_class
+ logger.info("Registered backend type: {}", backend_type)
+ return wrapped_class
+
+ return inner_wrapper
+
+ @classmethod
+ def create(cls, backend_type: BackendEngine, **kwargs) -> "Backend":
+ """
+ Factory method to create a backend instance based on the backend type.
+
+ :param backend_type: The type of backend to create.
+ :type backend_type: BackendEngine
+ :param kwargs: Additional arguments for backend initialization.
+ :return: An instance of a subclass of Backend.
+ :rtype: Backend
+ :raises ValueError: If the backend type is not registered.
+ """
+
+ logger.info("Creating backend of type {}", backend_type)
+
+ if backend_type not in cls._registry:
+ err = ValueError(f"Unsupported backend type: {backend_type}")
+ logger.error("{}", err)
+ raise err
+
+ return Backend._registry[backend_type](**kwargs)
+
+ def __init__(self, type_: BackendEngine, target: str, model: str):
+ """
+ Base constructor for the Backend class.
+ Calls into test_connection to ensure the backend is reachable.
+ Ensure all setup is done in the subclass constructor before calling super.
+
+ :param type_: The type of the backend.
+ :param target: The target URL for the backend.
+ :param model: The model used by the backend.
+ """
+ self._type = type_
+ self._target = target
+ self._model = model
+
+ self.test_connection()
+
+ @property
+ def default_model(self) -> str:
+ """
+ Get the default model for the backend.
+
+ :return: The default model.
+ :rtype: str
+ :raises ValueError: If no models are available.
+ """
+ return _cachable_default_model(self)
+
+ @property
+ def type_(self) -> BackendEngine:
+ """
+ Get the type of the backend.
+
+ :return: The type of the backend.
+ :rtype: BackendEngine
+ """
+ return self._type
+
+ @property
+ def target(self) -> str:
+ """
+ Get the target URL for the backend.
+
+ :return: The target URL.
+ :rtype: str
+ """
+ return self._target
+
+ @property
+ def model(self) -> str:
+ """
+ Get the model used by the backend.
+
+ :return: The model name.
+ :rtype: str
+ """
+ return self._model
+
+ def model_tokenizer(self) -> PreTrainedTokenizer:
+ """
+ Get the tokenizer for the backend model.
+
+ :return: The tokenizer instance.
+ """
+ return AutoTokenizer.from_pretrained(self.model)
+
+ def test_connection(self) -> bool:
+ """
+ Test the connection to the backend by running a short text generation request.
+ If successful, returns True, otherwise raises an exception.
+
+ :return: True if the connection is successful.
+ :rtype: bool
+ :raises ValueError: If the connection test fails.
+ """
+ try:
+ asyncio.get_running_loop()
+ is_async = True
+ except RuntimeError:
+ is_async = False
+
+ if is_async:
+ logger.warning("Running in async mode, cannot test connection")
+ return True
+
+ try:
+ request = TextGenerationRequest(
+ prompt="Test connection", output_token_count=5
+ )
+
+ asyncio.run(self.submit(request))
+ return True
+ except Exception as err:
+ raise_err = RuntimeError(
+ f"Backend connection test failed for backend type={self.type_} "
+ f"with target={self.target} and model={self.model} with error: {err}"
+ )
+ logger.error(raise_err)
+ raise raise_err from err
+
+ async def submit(self, request: TextGenerationRequest) -> TextGenerationResult:
+ """
+ Submit a text generation request and return the result.
+
+ This method handles the request submission to the backend and processes
+ the response in a streaming fashion if applicable.
+
+ :param request: The request object containing the prompt
+ and other configurations.
+ :type request: TextGenerationRequest
+ :return: The result of the text generation request.
+ :rtype: TextGenerationResult
+ :raises ValueError: If no response is received from the backend.
+ """
+
+ logger.debug("Submitting request with prompt: {}", request.prompt)
+
+ result = TextGenerationResult(request=request)
+ result.start(request.prompt)
+ received_final = False
+
+ async for response in self.make_request(request):
+ logger.debug("Received response: {}", response)
+ if response.type_ == "token_iter":
+ result.output_token(response.add_token if response.add_token else "")
+ elif response.type_ == "final":
+ if received_final:
+ err = ValueError(
+ "Received multiple final responses from the backend."
+ )
+ logger.error(err)
+ raise err
+
+ result.end(
+ output=response.output,
+ prompt_token_count=response.prompt_token_count,
+ output_token_count=response.output_token_count,
+ )
+ received_final = True
+ else:
+ err = ValueError(
+ f"Invalid response received from the backend of type: "
+ f"{response.type_} for {response}"
+ )
+ logger.error(err)
+ raise err
+
+ if not received_final:
+ err = ValueError("No final response received from the backend.")
+ logger.error(err)
+ raise err
+
+ logger.info("Request completed with output: {}", result.output)
+
+ return result
+
+ @abstractmethod
+ async def make_request(
+ self,
+ request: TextGenerationRequest,
+ ) -> AsyncGenerator[GenerativeResponse, None]:
+ """
+ Abstract method to make a request to the backend.
+
+ Subclasses must implement this method to define how requests are handled
+ by the backend.
+
+ :param request: The request object containing the prompt and
+ other configurations.
+ :type request: TextGenerationRequest
+ :yield: A generator yielding responses from the backend.
+ :rtype: AsyncGenerator[GenerativeResponse, None]
+ """
+ yield None # type: ignore # noqa: PGH003
+
+ @abstractmethod
+ def available_models(self) -> List[str]:
+ """
+ Abstract method to get the available models for the backend.
+
+ Subclasses must implement this method to provide the list of models
+ supported by the backend.
+
+ :return: A list of available models.
+ :rtype: List[str]
+ :raises NotImplementedError: If the method is not implemented by a subclass.
+ """
+ raise NotImplementedError
+
+
+@functools.lru_cache(maxsize=1)
+def _cachable_default_model(backend: Backend) -> str:
+ """
+ Get the default model for a backend using LRU caching.
+
+ This function caches the default model to optimize repeated lookups.
+
+ :param backend: The backend instance for which to get the default model.
+ :type backend: Backend
+ :return: The default model.
+ :rtype: str
+ :raises ValueError: If no models are available.
+ """
+ logger.debug("Getting default model for backend: {}", backend)
+ models = backend.available_models()
+ if models:
+ logger.debug("Default model: {}", models[0])
+ return models[0]
+
+ err = ValueError("No models available.")
+ logger.error(err)
+ raise err
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py
new file mode 100644
index 0000000000000000000000000000000000000000..9843fc1a06ac7fbecd2198b7317af8545a07c81a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py
@@ -0,0 +1,192 @@
+import base64
+import io
+from typing import AsyncGenerator, Dict, List, Optional
+
+from loguru import logger
+from openai import AsyncOpenAI, OpenAI
+
+from guidellm.backend.base import Backend, GenerativeResponse
+from guidellm.config import settings
+from guidellm.core import TextGenerationRequest
+
+__all__ = ["OpenAIBackend"]
+
+
+@Backend.register("openai_server")
+class OpenAIBackend(Backend):
+ """
+ An OpenAI backend implementation for generative AI results.
+
+ This class provides an interface to communicate with the
+ OpenAI server for generating responses based on given prompts.
+
+ :param openai_api_key: The API key for OpenAI.
+ If not provided, it will default to the key from settings.
+ :type openai_api_key: Optional[str]
+ :param target: The target URL string for the OpenAI server.
+ :type target: Optional[str]
+ :param model: The OpenAI model to use, defaults to the first available model.
+ :type model: Optional[str]
+ :param request_args: Additional arguments for the OpenAI request.
+ :type request_args: Dict[str, Any]
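+
+    Example (illustrative sketch; the API key, target URL, and model name are
+    assumptions, not defaults):
+
+    ```python
+    from guidellm.backend.openai import OpenAIBackend
+
+    backend = OpenAIBackend(
+        openai_api_key="EMPTY",
+        target="http://localhost:8000/v1",
+        model="Qwen/Qwen2.5-VL-7B-Instruct",
+    )
+    print(backend.available_models())
+    ```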
+ """
+
+ def __init__(
+ self,
+ openai_api_key: Optional[str] = None,
+ target: Optional[str] = None,
+ model: Optional[str] = None,
+ **request_args,
+ ):
+ self._request_args: Dict = request_args
+ api_key: str = openai_api_key or settings.openai.api_key
+
+ if not api_key:
+ err = ValueError(
+ "`GUIDELLM__OPENAI__API_KEY` environment variable or "
+ "--openai-api-key CLI parameter must be specified for the "
+ "OpenAI backend."
+ )
+ logger.error("{}", err)
+ raise err
+
+ base_url = target or settings.openai.base_url
+
+ if not base_url:
+ err = ValueError(
+ "`GUIDELLM__OPENAI__BASE_URL` environment variable or "
+ "target parameter must be specified for the OpenAI backend."
+ )
+ logger.error("{}", err)
+ raise err
+
+ self._async_client = AsyncOpenAI(api_key=api_key, base_url=base_url)
+ self._client = OpenAI(api_key=api_key, base_url=base_url)
+ self._model = model or self.default_model
+
+ super().__init__(type_="openai_server", target=base_url, model=self._model)
+ logger.info("OpenAI {} Backend listening on {}", self._model, base_url)
+
+ async def make_request(
+ self,
+ request: TextGenerationRequest,
+ ) -> AsyncGenerator[GenerativeResponse, None]:
+ """
+ Make a request to the OpenAI backend.
+
+ This method sends a prompt to the OpenAI backend and streams
+ the response tokens back.
+
+ :param request: The text generation request to submit.
+ :type request: TextGenerationRequest
+ :yield: A stream of GenerativeResponse objects.
+ :rtype: AsyncGenerator[GenerativeResponse, None]
+ """
+
+ logger.debug("Making request to OpenAI backend with prompt: {}", request.prompt)
+
+ request_args: Dict = {
+ "n": 1, # Number of completions for each prompt
+ }
+
+ if request.output_token_count is not None:
+ request_args.update(
+ {
+ "max_tokens": request.output_token_count,
+ "stop": None,
+ "extra_body": {
+ "ignore_eos": True,
+ }
+ }
+ )
+ elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0:
+ request_args.update(
+ {
+ "max_tokens": settings.openai.max_gen_tokens,
+ }
+ )
+
+ request_args.update(self._request_args)
+
+ messages = self._build_messages(request)
+
+ stream = await self._async_client.chat.completions.create(
+ model=self.model,
+ messages=messages,
+ stream=True,
+ **request_args,
+ )
+
+ token_count = 0
+ async for chunk in stream:
+ choice = chunk.choices[0]
+ token = choice.delta.content or ""
+
+ if choice.finish_reason is not None:
+ yield GenerativeResponse(
+ type_="final",
+ prompt=request.prompt,
+ prompt_token_count=request.prompt_token_count,
+ output_token_count=token_count,
+ )
+ break
+
+ token_count += 1
+ yield GenerativeResponse(
+ type_="token_iter",
+ add_token=token,
+ prompt=request.prompt,
+ prompt_token_count=request.prompt_token_count,
+ output_token_count=token_count,
+ )
+
+ def available_models(self) -> List[str]:
+ """
+ Get the available models for the backend.
+
+ This method queries the OpenAI API to retrieve a list of available models.
+
+ :return: A list of available models.
+ :rtype: List[str]
+ :raises openai.OpenAIError: If an error occurs while retrieving models.
+ """
+
+ try:
+ return [model.id for model in self._client.models.list().data]
+ except Exception as error:
+ logger.error("Failed to retrieve available models: {}", error)
+ raise error
+
+ def validate_connection(self):
+ """
+ Validate the connection to the OpenAI backend.
+
+ This method checks that the OpenAI backend is reachable and
+ the API key is valid.
+
+ :raises openai.OpenAIError: If the connection is invalid.
+ """
+
+ try:
+ self._client.models.list()
+ except Exception as error:
+ logger.error("Failed to validate OpenAI connection: {}", error)
+ raise error
+
+    def _build_messages(self, request: TextGenerationRequest) -> List[Dict]:
+ if request.number_images == 0:
+ messages = [{"role": "user", "content": request.prompt}]
+ else:
+ content = []
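+            # Encode each image inline as a base64 data URL, the form expected
+            # by OpenAI-compatible chat completions for image_url content parts.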
+ for image in request.images:
+ stream = io.BytesIO()
+ im_format = image.image.format or "PNG"
+ image.image.save(stream, format=im_format)
+ im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8")
+ image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"}
+ content.append({"type": "image_url", "image_url": image_url})
+
+ content.append({"type": "text", "text": request.prompt})
+ messages = [{"role": "user", "content": content}]
+
+ return messages
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81f67a6666490dd48a1eddba4dad6c90e2bf08a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py
@@ -0,0 +1,239 @@
+import json
+from enum import Enum
+from typing import Dict, List, Optional, Sequence
+
+from pydantic import BaseModel, Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+__all__ = [
+ "DatasetSettings",
+ "EmulatedDataSettings",
+ "Environment",
+ "LoggingSettings",
+ "OpenAISettings",
+ "print_config",
+ "ReportGenerationSettings",
+ "Settings",
+ "reload_settings",
+ "settings",
+]
+
+
+class Environment(str, Enum):
+ """
+ Enum for the supported environments
+ """
+
+ LOCAL = "local"
+ DEV = "dev"
+ STAGING = "staging"
+ PROD = "prod"
+
+
+ENV_REPORT_MAPPING = {
+ Environment.PROD: "https://guidellm.neuralmagic.com/local-report/index.html",
+ Environment.STAGING: "https://staging.guidellm.neuralmagic.com/local-report/index.html",
+ Environment.DEV: "https://dev.guidellm.neuralmagic.com/local-report/index.html",
+ Environment.LOCAL: "tests/dummy/report.html",
+}
+
+
+class LoggingSettings(BaseModel):
+ """
+ Logging settings for the application
+ """
+
+ disabled: bool = False
+ clear_loggers: bool = True
+ console_log_level: str = "WARNING"
+ log_file: Optional[str] = None
+ log_file_level: Optional[str] = None
+
+
+class DatasetSettings(BaseModel):
+ """
+ Dataset settings for the application
+ """
+
+ preferred_data_columns: List[str] = Field(
+ default_factory=lambda: [
+ "prompt",
+ "instruction",
+ "input",
+ "inputs",
+ "question",
+ "context",
+ "text",
+ "content",
+ "body",
+ "data",
+ ]
+ )
+ preferred_data_splits: List[str] = Field(
+ default_factory=lambda: ["test", "tst", "validation", "val", "train"]
+ )
+
+
+class EmulatedDataSettings(BaseModel):
+ """
+ Emulated data settings for the application to use
+ """
+
+ source: str = "http://localhost:666/aimages/1342-0.txt"
+ filter_start: str = "It is a truth universally acknowledged, that a"
+ filter_end: str = "CHISWICK PRESS:--CHARLES WHITTINGHAM AND CO."
+ clean_text_args: Dict[str, bool] = Field(
+ default_factory=lambda: {
+ "fix_encoding": True,
+ "clean_whitespace": True,
+ "remove_empty_lines": True,
+ "force_new_line_punctuation": True,
+ }
+ )
+    image_source: List[str] = Field(
+        default_factory=lambda: ["http://localhost:666/aimages/pg1-images.html"]
+    )
+
+
+class OpenAISettings(BaseModel):
+ """
+ OpenAI settings for the application to connect to the API
+ for OpenAI server based pathways
+ """
+
+ # OpenAI API key.
+ api_key: str = "invalid_token"
+
+ # OpenAI-compatible server URL
+    # NOTE: The default points at a locally hosted OpenAI-compatible server
+    # (e.g. vLLM's default port 8000).
+ base_url: str = "http://localhost:8000/v1"
+
+ max_gen_tokens: int = 4096
+
+
+class AiohttpSettings(OpenAISettings):
+ pass
+
+
+class ReportGenerationSettings(BaseModel):
+ """
+ Report generation settings for the application
+ """
+
+ source: str = ""
+ report_html_match: str = "window.report_data = {};"
+ report_html_placeholder: str = "{}"
+
+
+class Settings(BaseSettings):
+ """
+    All settings are powered by pydantic_settings and can be populated
+    from environment variables or a .env file.
+
+    The expected format is:
+
+ ```sh
+ export GUIDELLM__LOGGING__DISABLED=true
+ export GUIDELLM__OPENAI__API_KEY=******
+ ```
+ """
+
+ model_config = SettingsConfigDict(
+ env_prefix="GUIDELLM__",
+ env_nested_delimiter="__",
+ extra="ignore",
+ validate_default=True,
+ env_file=".env",
+ )
+
+ # general settings
+ env: Environment = Environment.PROD
+ request_timeout: int = 30
+ max_concurrency: int = 512
+ num_sweep_profiles: int = 9
+ logging: LoggingSettings = LoggingSettings()
+
+ # Data settings
+ dataset: DatasetSettings = DatasetSettings()
+ emulated_data: EmulatedDataSettings = EmulatedDataSettings()
+
+ # Request settings
+ openai: OpenAISettings = OpenAISettings()
+ aiohttp: AiohttpSettings = AiohttpSettings()
+
+ # Report settings
+ report_generation: ReportGenerationSettings = ReportGenerationSettings()
+
+ @model_validator(mode="after")
+ @classmethod
+ def set_default_source(cls, values):
+ if not values.report_generation.source:
+ values.report_generation.source = ENV_REPORT_MAPPING.get(values.env)
+
+ return values
+
+ def generate_env_file(self) -> str:
+ """
+ Generate the .env file from the current settings
+ """
+ return Settings._recursive_generate_env(
+ self,
+ self.model_config["env_prefix"], # type: ignore # noqa: PGH003
+ self.model_config["env_nested_delimiter"], # type: ignore # noqa: PGH003
+ )
+
+ @staticmethod
+ def _recursive_generate_env(model: BaseModel, prefix: str, delimiter: str) -> str:
+ env_file = ""
+ add_models = []
+ for key, value in model.model_dump().items():
+ if isinstance(value, BaseModel):
+ # add nested properties to be processed after the current level
+ add_models.append((key, value))
+ continue
+
+ dict_values = (
+ {
+ f"{prefix}{key.upper()}{delimiter}{sub_key.upper()}": sub_value
+ for sub_key, sub_value in value.items()
+ }
+ if isinstance(value, dict)
+ else {f"{prefix}{key.upper()}": value}
+ )
+
+ for tag, sub_value in dict_values.items():
+ if isinstance(sub_value, Sequence) and not isinstance(sub_value, str):
+ value_str = ",".join(f'"{item}"' for item in sub_value)
+ env_file += f"{tag}=[{value_str}]\n"
+ elif isinstance(sub_value, Dict):
+ value_str = json.dumps(sub_value)
+ env_file += f"{tag}={value_str}\n"
+ elif not sub_value:
+ env_file += f"{tag}=\n"
+ else:
+ env_file += f'{tag}="{sub_value}"\n'
+
+ for key, value in add_models:
+ env_file += Settings._recursive_generate_env(
+ value, f"{prefix}{key.upper()}{delimiter}", delimiter
+ )
+ return env_file
+
+
+settings = Settings()
+
+
+def reload_settings():
+ """
+ Reload the settings from the environment variables
+ """
+ new_settings = Settings()
+ settings.__dict__.update(new_settings.__dict__)
+
+
+def print_config():
+ """
+ Print the current configuration settings
+ """
+ print(f"Settings: \n{settings.generate_env_file()}") # noqa: T201
+
+
+if __name__ == "__main__":
+ print_config()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e738aa769737a158dccb697a36d61697488d6b55
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py
@@ -0,0 +1,24 @@
+from .distribution import Distribution
+from .report import GuidanceReport
+from .request import TextGenerationRequest
+from .result import (
+ RequestConcurrencyMeasurement,
+ TextGenerationBenchmark,
+ TextGenerationBenchmarkReport,
+ TextGenerationError,
+ TextGenerationResult,
+)
+from .serializable import Serializable, SerializableFileType
+
+__all__ = [
+ "Distribution",
+ "GuidanceReport",
+ "RequestConcurrencyMeasurement",
+ "Serializable",
+ "SerializableFileType",
+ "TextGenerationBenchmark",
+ "TextGenerationBenchmarkReport",
+ "TextGenerationError",
+ "TextGenerationRequest",
+ "TextGenerationResult",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f770528c3bcc1a1ba0049797ee59067f816d9a0
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py
@@ -0,0 +1,190 @@
+from typing import List, Sequence
+
+import numpy as np
+from loguru import logger
+from pydantic import Field
+
+from guidellm.core.serializable import Serializable
+
+__all__ = ["Distribution"]
+
+
+class Distribution(Serializable):
+ """
+ A class to represent a statistical distribution and perform various
+ statistical analyses.
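+
+    Example (illustrative sketch with made-up latency values):
+
+    ```python
+    from guidellm.core import Distribution
+
+    latencies = Distribution(data=[0.12, 0.18, 0.25, 0.31])
+    print(latencies.mean, latencies.percentile(95))
+    latencies.add_data([0.40])
+    ```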
+ """
+
+ data: Sequence[float] = Field(
+ default_factory=list,
+ description="The data points of the distribution.",
+ )
+
+ def __str__(self):
+ return f"Distribution({self.describe()})"
+
+ def __len__(self):
+ return len(self.data)
+
+ @property
+ def mean(self) -> float:
+ """
+ Calculate and return the mean of the distribution.
+ :return: The mean of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate mean.")
+ return 0.0
+
+ mean_value = np.mean(self.data).item()
+ logger.debug(f"Calculated mean: {mean_value}")
+ return mean_value
+
+ @property
+ def median(self) -> float:
+ """
+ Calculate and return the median of the distribution.
+ :return: The median of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate median.")
+ return 0.0
+
+ median_value = np.median(self.data).item()
+ logger.debug(f"Calculated median: {median_value}")
+ return median_value
+
+ @property
+ def variance(self) -> float:
+ """
+ Calculate and return the variance of the distribution.
+ :return: The variance of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate variance.")
+ return 0.0
+
+ variance_value = np.var(self.data).item()
+ logger.debug(f"Calculated variance: {variance_value}")
+ return variance_value
+
+ @property
+ def std_deviation(self) -> float:
+ """
+ Calculate and return the standard deviation of the distribution.
+ :return: The standard deviation of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate standard deviation.")
+ return 0.0
+
+ std_deviation_value = np.std(self.data).item()
+ logger.debug(f"Calculated standard deviation: {std_deviation_value}")
+ return std_deviation_value
+
+ def percentile(self, percentile: float) -> float:
+ """
+ Calculate and return the specified percentile of the distribution.
+ :param percentile: The desired percentile to calculate (0-100).
+ :return: The specified percentile of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate percentile.")
+ return 0.0
+
+ percentile_value = np.percentile(self.data, percentile).item()
+ logger.debug(f"Calculated {percentile}th percentile: {percentile_value}")
+ return percentile_value
+
+ def percentiles(self, percentiles: List[float]) -> List[float]:
+ """
+ Calculate and return the specified percentiles of the distribution.
+ :param percentiles: A list of desired percentiles to calculate (0-100).
+ :return: A list of the specified percentiles of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate percentiles.")
+ return [0.0] * len(percentiles)
+
+ percentiles_values: List[float] = np.percentile(self.data, percentiles).tolist() # type: ignore # noqa: PGH003
+ logger.debug(f"Calculated percentiles {percentiles}: {percentiles_values}")
+ return percentiles_values
+
+ @property
+ def min(self) -> float:
+ """
+ Return the minimum value of the distribution.
+ :return: The minimum value of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate minimum.")
+ return 0.0
+
+ min_value: float = np.min(self.data).item() # type: ignore # noqa: PGH003
+ logger.debug(f"Calculated min: {min_value}")
+ return min_value
+
+ @property
+ def max(self) -> float:
+ """
+ Return the maximum value of the distribution.
+ :return: The maximum value of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate maximum.")
+ return 0.0
+
+ max_value: float = np.max(self.data).item() # type: ignore # noqa: PGH003
+ logger.debug(f"Calculated max: {max_value}")
+ return max_value
+
+ @property
+ def range(self) -> float:
+ """
+ Calculate and return the range of the distribution (max - min).
+ :return: The range of the distribution.
+ """
+ if not self.data:
+ logger.warning("No data points available to calculate range.")
+ return 0.0
+
+ range_value = self.max - self.min
+ logger.debug(f"Calculated range: {range_value}")
+ return range_value
+
+ def describe(self) -> dict:
+ """
+ Return a dictionary describing various statistics of the distribution.
+ :return: A dictionary with statistical summaries of the distribution.
+ """
+ description = {
+ "mean": self.mean,
+ "median": self.median,
+ "variance": self.variance,
+ "std_deviation": self.std_deviation,
+ "percentile_indices": [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99],
+ "percentile_values": self.percentiles(
+ [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99],
+ ),
+ "min": self.min,
+ "max": self.max,
+ "range": self.range,
+ }
+ logger.debug(f"Generated description: {description}")
+ return description
+
+ def add_data(self, new_data: Sequence[float]):
+ """
+ Add new data points to the distribution.
+ :param new_data: A list of new numerical data points to add.
+ """
+ self.data = list(self.data) + list(new_data)
+ logger.debug(f"Added new data: {new_data}")
+
+ def remove_data(self, remove_data: Sequence[float]):
+ """
+ Remove specified data points from the distribution.
+ :param remove_data: A list of numerical data points to remove.
+ """
+ self.data = [item for item in self.data if item not in remove_data]
+ logger.debug(f"Removed data: {remove_data}")
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py
new file mode 100644
index 0000000000000000000000000000000000000000..c48eed561d4eaad4a84dc934264ed4b68d17830a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py
@@ -0,0 +1,311 @@
+import time
+from datetime import datetime
+from typing import List, Optional
+
+from loguru import logger
+from pydantic import Field
+from rich.console import Console, Group
+from rich.live import Live
+from rich.panel import Panel
+from rich.table import Table
+
+from guidellm.core.result import TextGenerationBenchmark, TextGenerationBenchmarkReport
+from guidellm.core.serializable import Serializable
+
+__all__ = ["GuidanceReport"]
+
+
+def _create_benchmark_report_details(report: TextGenerationBenchmarkReport) -> str:
+ """
+ Create a detailed string representation of a benchmark report.
+
+ :param report: The benchmark report to generate details for.
+ :type report: TextGenerationBenchmarkReport
+ :return: A string containing the backend, data, rate, and limits of
+ the benchmark report.
+ :rtype: str
+ """
+ backend = (
+ f"Backend(type={report.args.get('backend_type', 'N/A')}, "
+ f"target={report.args.get('target', 'N/A')}, "
+ f"model={report.args.get('model', 'N/A')})"
+ )
+ data = (
+ f"Data(type={report.args.get('data_type', 'N/A')}, "
+ f"source={report.args.get('data', 'N/A')}, "
+ f"tokenizer={report.args.get('tokenizer', 'N/A')})"
+ )
+ rate = (
+ f"Rate(type={report.args.get('mode', 'N/A')}, "
+ f"rate={report.args.get('rate', 'N/A')})"
+ )
+ limits = (
+ f"Limits(max_number={report.args.get('max_number', 'N/A')} requests, "
+ f"max_duration={report.args.get('max_duration', 'N/A')} sec)"
+ )
+
+ logger.debug(
+ "Created benchmark report details for backend={}, data={}, rate={}, limits={}",
+ backend,
+ data,
+ rate,
+ limits,
+ )
+
+ return backend + "\n" + data + "\n" + rate + "\n" + limits + "\n"
+
+
+def _benchmark_rate_id(benchmark: TextGenerationBenchmark) -> str:
+ """
+ Generate a string identifier for a benchmark rate.
+
+ :param benchmark: The benchmark for which to generate the rate ID.
+ :type benchmark: TextGenerationBenchmark
+ :return: A string representing the benchmark rate ID.
+ :rtype: str
+ """
+ rate_id = (
+ f"{benchmark.mode}@{benchmark.rate:.2f} req/sec"
+ if benchmark.rate
+ else f"{benchmark.mode}"
+ )
+ logger.debug("Generated benchmark rate ID: {}", rate_id)
+ return rate_id
+
+
+def _create_benchmark_report_requests_summary(
+ report: TextGenerationBenchmarkReport,
+) -> Table:
+ """
+ Create a table summarizing the requests of a benchmark report.
+
+ :param report: The benchmark report to summarize.
+ :type report: TextGenerationBenchmarkReport
+ :return: A rich Table object summarizing the requests.
+ :rtype: Table
+ """
+ table = Table(
+ "Benchmark",
+ "Requests Completed",
+ "Request Failed",
+ "Duration",
+ "Start Time",
+ "End Time",
+ title="[magenta]Requests Data by Benchmark[/magenta]",
+ title_style="bold",
+ title_justify="left",
+ show_header=True,
+ )
+
+ for benchmark in report.benchmarks_sorted:
+ start_time_str = (
+ datetime.fromtimestamp(benchmark.start_time).strftime("%H:%M:%S")
+ if benchmark.start_time
+ else "N/A"
+ )
+ end_time_str = (
+ datetime.fromtimestamp(benchmark.end_time).strftime("%H:%M:%S")
+ if benchmark.end_time
+ else "N/A"
+ )
+
+ table.add_row(
+ _benchmark_rate_id(benchmark),
+ f"{benchmark.request_count}/{benchmark.total_count}",
+ f"{benchmark.error_count}/{benchmark.total_count}",
+ f"{benchmark.duration:.2f} sec",
+ f"{start_time_str}",
+ f"{end_time_str}",
+ )
+ logger.debug("Created requests summary table for the report.")
+ return table
+
+
+def _create_benchmark_report_data_tokens_summary(
+ report: TextGenerationBenchmarkReport,
+) -> Table:
+ """
+ Create a table summarizing data tokens of a benchmark report.
+
+ :param report: The benchmark report to summarize.
+ :type report: TextGenerationBenchmarkReport
+ :return: A rich Table object summarizing the data tokens.
+ :rtype: Table
+ """
+ table = Table(
+ "Benchmark",
+ "Prompt",
+ "Prompt (1%, 5%, 50%, 95%, 99%)",
+ "Output",
+ "Output (1%, 5%, 50%, 95%, 99%)",
+ title="[magenta]Tokens Data by Benchmark[/magenta]",
+ title_style="bold",
+ title_justify="left",
+ show_header=True,
+ )
+
+ for benchmark in report.benchmarks_sorted:
+ table.add_row(
+ _benchmark_rate_id(benchmark),
+ f"{benchmark.prompt_token:.2f}",
+ ", ".join(
+ f"{percentile:.1f}"
+ for percentile in benchmark.prompt_token_percentiles
+ ),
+ f"{benchmark.output_token:.2f}",
+ ", ".join(
+ f"{percentile:.1f}"
+ for percentile in benchmark.output_token_percentiles
+ ),
+ )
+ logger.debug("Created data tokens summary table for the report.")
+ return table
+
+
+def _create_benchmark_report_dist_perf_summary(
+ report: TextGenerationBenchmarkReport,
+) -> Table:
+ """
+ Create a table summarizing distribution performance of a benchmark report.
+
+ :param report: The benchmark report to summarize.
+ :type report: TextGenerationBenchmarkReport
+ :return: A rich Table object summarizing the performance statistics.
+ :rtype: Table
+ """
+ table = Table(
+ "Benchmark",
+ "Request Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (sec)",
+ "Time to First Token [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)",
+ "Inter Token Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)",
+ title="[magenta]Performance Stats by Benchmark[/magenta]",
+ title_style="bold",
+ title_justify="left",
+ show_header=True,
+ )
+
+ for benchmark in report.benchmarks_sorted:
+ table.add_row(
+ _benchmark_rate_id(benchmark),
+ ", ".join(
+ f"{percentile:.2f}"
+ for percentile in benchmark.request_latency_percentiles
+ ),
+ ", ".join(
+ f"{percentile * 1000:.1f}"
+ for percentile in benchmark.time_to_first_token_percentiles
+ ),
+ ", ".join(
+ f"{percentile * 1000:.1f}"
+ for percentile in benchmark.inter_token_latency_percentiles
+ ),
+ )
+ logger.debug("Created distribution performance summary table for the report.")
+ return table
+
+
+def _create_benchmark_report_summary(report: TextGenerationBenchmarkReport) -> Table:
+ """
+ Create a summary table for a benchmark report.
+
+ :param report: The benchmark report to summarize.
+ :type report: TextGenerationBenchmarkReport
+ :return: A rich Table object summarizing overall performance.
+ :rtype: Table
+ """
+ table = Table(
+ "Benchmark",
+ "Requests per Second",
+ "Request Latency",
+ "Time to First Token",
+ "Inter Token Latency",
+ "Output Token Throughput",
+ title="[magenta]Performance Summary by Benchmark[/magenta]",
+ title_style="bold",
+ title_justify="left",
+ show_header=True,
+ )
+
+ for benchmark in report.benchmarks_sorted:
+ table.add_row(
+ _benchmark_rate_id(benchmark),
+ f"{benchmark.completed_request_rate:.2f} req/sec",
+ f"{benchmark.request_latency:.2f} sec",
+ f"{benchmark.time_to_first_token:.2f} ms",
+ f"{benchmark.inter_token_latency:.2f} ms",
+ f"{benchmark.output_token_throughput:.2f} tokens/sec",
+ )
+ logger.debug("Created overall performance summary table for the report.")
+ return table
+
+
+class GuidanceReport(Serializable):
+ """
+ A class to manage the guidance reports that include the benchmarking details,
+ potentially across multiple runs, for saving and loading from disk.
+
+ :param benchmarks: The list of benchmarking reports.
+ :type benchmarks: List[TextGenerationBenchmarkReport]
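+
+    Example (illustrative sketch; the file name is an assumption):
+
+    ```python
+    report = GuidanceReport.load_file("guidance_report.yaml")
+    report.print(save_path="guidance_report.yaml")
+    ```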
+ """
+
+ benchmarks: List[TextGenerationBenchmarkReport] = Field(
+ default_factory=list, description="The list of benchmark reports."
+ )
+
+ def print(
+ self, save_path: Optional[str] = None, continual_refresh: bool = False
+ ) -> None:
+ """
+ Print the guidance report to the console.
+
+ :param save_path: Optional path to save the report to disk.
+ :type save_path: Optional[str]
+ :param continual_refresh: Whether to continually refresh the report.
+ :type continual_refresh: bool
+ :return: None
+ """
+ logger.info("Printing guidance report to console with save_path={}", save_path)
+ report_viz = Panel(
+ Group(
+ *[
+ Panel(
+ Group(
+ _create_benchmark_report_details(benchmark),
+ "",
+ _create_benchmark_report_requests_summary(benchmark),
+ "",
+ _create_benchmark_report_data_tokens_summary(benchmark),
+ "",
+ _create_benchmark_report_dist_perf_summary(benchmark),
+ "",
+ _create_benchmark_report_summary(benchmark),
+ ),
+ title=(
+ f"[bold magenta]Benchmark Report "
+ f"{index + 1}[/bold magenta]"
+ ),
+ expand=True,
+ title_align="left",
+ )
+ for index, benchmark in enumerate(self.benchmarks)
+ ],
+ ),
+ title=(
+ "[bold cyan]GuideLLM Benchmarks Report[/bold cyan] [italic]"
+ f"({save_path})[/italic]"
+ ),
+ expand=True,
+ title_align="left",
+ )
+ console = Console()
+
+ if continual_refresh:
+ logger.info("Starting live report with continual refresh.")
+ with Live(report_viz, refresh_per_second=1, console=console) as live:
+ while True:
+ live.update(report_viz)
+ time.sleep(1)
+ else:
+ console.print(report_viz)
+
+ logger.info("Guidance report printing completed.")
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d0f37c8640e637591e199722567dacbf04102b
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py
@@ -0,0 +1,65 @@
+import uuid
+from typing import Any, Dict, List, Optional, Tuple
+
+from pydantic import Field
+
+from guidellm.core.serializable import Serializable
+from guidellm.utils import ImageDescriptor
+
+
+class TextGenerationRequest(Serializable):
+ """
+ A class to represent a text generation request for generative AI workloads.
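+
+    Example (illustrative sketch; the prompt text is arbitrary):
+
+    ```python
+    from guidellm.core import TextGenerationRequest
+
+    request = TextGenerationRequest(
+        prompt="Describe the attached image in one sentence.",
+        output_token_count=128,
+    )
+    print(request.number_images)  # 0 when no images are attached
+    ```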
+ """
+
+ id: str = Field(
+ default_factory=lambda: str(uuid.uuid4()),
+ description="The unique identifier for the request.",
+ )
+ prompt: str = Field(description="The input prompt for the text generation.")
+ images: Optional[List[ImageDescriptor]] = Field(
+ default=None,
+ description="Input images.",
+ )
+ prompt_token_count: Optional[int] = Field(
+ default=None,
+ description="The number of tokens in the input prompt.",
+ )
+ output_token_count: Optional[int] = Field(
+ default=None,
+ description="The number of tokens to generate.",
+ )
+ params: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="The parameters for the text generation request.",
+ )
+
+ @property
+ def number_images(self) -> int:
+ if self.images is None:
+ return 0
+ else:
+ return len(self.images)
+
+ @property
+    def image_resolution(self) -> Optional[List[Tuple[int, int]]]:
+        if self.images is None:
+            return None
+        return [im.size for im in self.images]
+
+ def __str__(self) -> str:
+ prompt_short = (
+ self.prompt[:32] + "..."
+ if self.prompt and len(self.prompt) > 32 # noqa: PLR2004
+ else self.prompt
+ )
+
+        return (
+            f"TextGenerationRequest(id={self.id}, "
+            f"prompt={prompt_short}, prompt_token_count={self.prompt_token_count}, "
+            f"output_token_count={self.output_token_count}, "
+            f"params={self.params}, "
+            f"image_resolution={self.image_resolution})"
+        )
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..aebd1763728192228e7115c5842c4a8cec7fc0fe
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py
@@ -0,0 +1,637 @@
+from time import time
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from loguru import logger
+from pydantic import Field, computed_field
+
+from guidellm.core.distribution import Distribution
+from guidellm.core.request import TextGenerationRequest
+from guidellm.core.serializable import Serializable
+
+__all__ = [
+ "RequestConcurrencyMeasurement",
+ "TextGenerationBenchmark",
+ "TextGenerationBenchmarkReport",
+ "TextGenerationError",
+ "TextGenerationResult",
+]
+
+
+class TextGenerationResult(Serializable):
+ """
+ A class to represent the result of a text generation request
+ for generative AI workloads.
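+
+    Example of the expected call sequence (illustrative sketch):
+
+    ```python
+    result = TextGenerationResult(request=TextGenerationRequest(prompt="Hi"))
+    result.start("Hi")
+    result.output_token("Hello")
+    result.output_token(" world")
+    result.end(output_token_count=2)
+    ```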
+ """
+
+ request: TextGenerationRequest = Field(
+ description="The text generation request used to generate the result.",
+ )
+ prompt: str = Field(
+ default_factory=str,
+ description="The input prompt for the text generation.",
+ )
+ prompt_word_count: int = Field(
+ default=0,
+ description="The number of words in the input prompt.",
+ )
+ prompt_token_count: int = Field(
+ default=0,
+ description="The number of tokens in the input prompt.",
+ )
+ output: str = Field(
+ default_factory=str,
+ description="The generated output for the text generation.",
+ )
+ output_word_count: int = Field(
+ default=0,
+ description="The number of words in the output.",
+ )
+ output_token_count: int = Field(
+ default=0,
+ description="The number of tokens in the output.",
+ )
+ last_time: Optional[float] = Field(
+ default=None,
+ description="The last time recorded.",
+ )
+ first_token_set: bool = Field(
+ default=False,
+ description="Whether the first token time is set.",
+ )
+ start_time: Optional[float] = Field(
+ default=None,
+ description="The start time of the text generation.",
+ )
+ end_time: Optional[float] = Field(
+ default=None,
+ description="The end time of the text generation.",
+ )
+ first_token_time: Optional[float] = Field(
+ default=None,
+ description="The time taken to decode the first token.",
+ )
+ decode_times: Distribution = Field(
+ default_factory=Distribution,
+ description="The distribution of decode times.",
+ )
+
+ def start(self, prompt: str):
+ """
+ Start the text generation by recording the prompt and start time.
+
+ :param prompt: The input prompt for the text generation.
+ :type prompt: str
+ """
+ self.prompt = prompt
+ self.prompt_word_count = len(prompt.split())
+        self.prompt_token_count = len(prompt)  # placeholder; replaced with the real count in end()
+ self.start_time = time()
+ self.last_time = time()
+ self.first_token_set = False
+
+ logger.info("Text generation started with prompt: '{}'", prompt)
+
+ def output_token(self, token: str):
+ """
+ Add a token to the output and record the decode time.
+
+ :param token: The decoded token.
+ :type token: str
+ """
+ self._check_recording_started()
+
+ if self.last_time is None:
+ raise ValueError(
+ "last time is not specified. "
+ "Did you call `text_generation_benchmark.start()`?"
+ )
+
+ current_counter = time()
+
+ if not self.first_token_set:
+ self.first_token_time = current_counter - self.last_time
+ self.first_token_set = True
+ logger.debug(f"First token decode time: {self.first_token_time}")
+ else:
+ decode_time = current_counter - self.last_time
+ self.decode_times.add_data([decode_time])
+ logger.debug(f"Token '{token}' decoded in {decode_time} seconds")
+
+ self.last_time = current_counter
+ self.output += token
+ logger.debug("Added token {} to output", token)
+
+ def end(
+ self,
+ output: Optional[str] = None,
+ prompt_token_count: Optional[int] = None,
+ output_token_count: Optional[int] = None,
+ ):
+ """
+ End the text generation by recording the output and end time.
+
+ :param output: The generated output for the text generation.
+ :type output: str
+ :param prompt_token_count: Optional token count for the prompt,
+ defaults to word count.
+ :type prompt_token_count: Optional[int]
+ :param output_token_count: Optional token count for the output,
+ defaults to word count.
+ :type output_token_count: Optional[int]
+ """
+ self._check_recording_started()
+ self.end_time = time()
+
+ if output:
+ self.output = output
+
+ self.output_word_count = len(self.output.split())
+ self.output_token_count = output_token_count or self.output_word_count
+ self.prompt_token_count = prompt_token_count or self.prompt_word_count
+
+ logger.info(f"Text generation ended with output: '{self.output}'")
+
+ def _check_recording_started(
+ self,
+ ):
+ if self.start_time is None:
+ raise ValueError(
+ "start time is not specified. "
+ "Did you make the `text_generation_benchmark.start()`?",
+ )
+
+
+class TextGenerationError(Serializable):
+ """
+ A class to represent an error that occurred during a text generation request
+ for generative AI workloads.
+ """
+
+ request: TextGenerationRequest = Field(
+ description="The text generation request that resulted in an error.",
+ )
+ message: str = Field(
+ description="The error message that occurred during text generation.",
+ )
+
+
+class RequestConcurrencyMeasurement(Serializable):
+ """
+ A dataclass to represent the concurrency measurement of a request.
+ """
+
+ time: float = Field(description="The time of the measurement.")
+ completed: int = Field(description="The number of completed requests.")
+ errored: int = Field(description="The number of errored requests.")
+ processing: int = Field(description="The number of processing requests.")
+
+
+class TextGenerationBenchmark(Serializable):
+ """
+ A class to represent a report of text generation requests
+ (results and errors) for generative AI workloads.
+ This is a set of results and errors for a specific mode and rate.
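+
+    Example of recording a single request (illustrative sketch; ``result`` is
+    assumed to be a completed TextGenerationResult):
+
+    ```python
+    benchmark = TextGenerationBenchmark(mode="synchronous")
+    benchmark.request_started()
+    benchmark.request_completed(result)
+    print(benchmark.completed_request_rate)
+    ```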
+ """
+
+ mode: Literal["asynchronous", "synchronous", "throughput"] = Field(
+ description="The generation mode, one of 'async', 'sync', or 'throughput'."
+ )
+ rate: Optional[float] = Field(
+ default=None,
+ description="The requested rate of requests per second.",
+ )
+ results: List[TextGenerationResult] = Field(
+ default_factory=list,
+ description="The results of the text generation requests.",
+ )
+ errors: List[TextGenerationError] = Field(
+ default_factory=list,
+ description="The errors of the text generation requests.",
+ )
+ concurrencies: List[RequestConcurrencyMeasurement] = Field(
+ default_factory=list,
+ description="The concurrency measurements of the requests.",
+ )
+
+ def __iter__(self):
+ """
+ Provide an iterator interface to iterate over the results.
+
+ :return: An iterator over the results.
+ """
+ return iter(self.results)
+
+ @computed_field # type: ignore[misc]
+ @property
+ def request_count(self) -> int:
+ """
+ Get the number of requests in the result.
+
+ :return: The number of requests.
+ :rtype: int
+ """
+ return len(self.results)
+
+ @computed_field # type: ignore[misc]
+ @property
+ def error_count(self) -> int:
+ """
+ Get the number of errors in the result.
+
+ :return: The number of errors.
+ :rtype: int
+ """
+ return len(self.errors)
+
+ @computed_field # type: ignore[misc]
+ @property
+ def total_count(self) -> int:
+ """
+ Get the total number of requests in the result.
+
+ :return: The total number of requests.
+ :rtype: int
+ """
+ return self.request_count + self.error_count
+
+ @computed_field # type: ignore[misc]
+ @property
+ def start_time(self) -> Optional[float]:
+ """
+ Get the start time of the first request in the result.
+
+ :return: The start time of the first request.
+ :rtype: Optional[float]
+ """
+ if not self.results:
+ return None
+
+ return self.results[0].start_time
+
+ @computed_field # type: ignore[misc]
+ @property
+ def end_time(self) -> Optional[float]:
+ """
+ Get the end time of the last request in the result.
+
+ :return: The end time of the last request.
+ :rtype: Optional[float]
+ """
+ if not self.results:
+ return None
+
+ return self.results[-1].end_time
+
+ @computed_field # type: ignore[misc]
+ @property
+ def duration(self) -> float:
+ """
+ Get the duration of the result in seconds.
+
+ :return: The duration of the result.
+ :rtype: float
+ """
+ if not self.results or not self.start_time or not self.end_time:
+ return 0.0
+
+ return self.end_time - self.start_time
+
+ @computed_field # type: ignore[misc]
+ @property
+ def completed_request_rate(self) -> float:
+ """
+ Get the rate of requests per second in the result.
+
+ :return: The rate of requests per second.
+ :rtype: float
+ """
+ if not self.results or not self.duration:
+ return 0.0
+
+ return len(self.results) / self.duration
+
+ @computed_field # type: ignore[misc]
+ @property
+ def request_latency(self) -> float:
+ """
+ Get the average request latency in seconds.
+
+ :return: The average request latency in seconds.
+ :rtype: float
+ """
+ if not self.results:
+ return 0.0
+
+ return self.request_latency_distribution.mean
+
+ @property
+ def request_latency_distribution(self) -> Distribution:
+ """
+ Get the distribution of request latencies.
+
+ :return: The distribution of request latencies.
+ :rtype: Distribution
+ """
+ return Distribution(
+ data=[
+ result.end_time - result.start_time
+ for result in self.results
+ if result.end_time is not None and result.start_time is not None
+ ]
+ )
+
+ @computed_field # type: ignore[misc]
+ @property
+ def request_latency_percentiles(self) -> List[float]:
+ """
+ Get standard percentiles of request latency in seconds.
+
+ :return: List of percentile request latency in seconds
+ :rtype: List[float]
+ """
+ return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99])
+
+ @computed_field # type: ignore[misc]
+ @property
+ def time_to_first_token(self) -> float:
+ """
+ Get the time taken to decode the first token in milliseconds.
+
+ :return: The time taken to decode the first token in milliseconds.
+ :rtype: float
+ """
+ if not self.results:
+ return 0.0
+
+ return 1000 * self.ttft_distribution.mean
+
+ @property
+ def ttft_distribution(self) -> Distribution:
+ """
+ Get the distribution of time taken to decode the first token.
+
+ :return: The distribution of time taken to decode the first token.
+ :rtype: Distribution
+ """
+ return Distribution(
+ data=[
+ result.first_token_time
+ for result in self.results
+ if result.first_token_time is not None
+ ]
+ )
+
+ @computed_field # type: ignore[misc]
+ @property
+ def time_to_first_token_percentiles(self) -> List[float]:
+ """
+ Get standard percentiles for time taken to decode the first token
+ in milliseconds.
+
+ :return: List of percentile time taken to decode the first token
+ in milliseconds.
+ :rtype: List[float]
+ """
+ return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99])
+
+ @computed_field # type: ignore[misc]
+ @property
+ def inter_token_latency(self) -> float:
+ """
+ Get the average time between tokens in milliseconds.
+
+ :return: The average time between tokens.
+ :rtype: float
+ """
+ if not self.results:
+ return 0.0
+
+ return 1000 * self.itl_distribution.mean
+
+ @property
+ def itl_distribution(self) -> Distribution:
+ """
+ Get the distribution of time between tokens.
+
+ :return: The distribution of time between tokens.
+ :rtype: Distribution
+ """
+ return Distribution(
+ data=[
+ decode for result in self.results for decode in result.decode_times.data
+ ]
+ )
+
+ @computed_field # type: ignore[misc]
+ @property
+ def inter_token_latency_percentiles(self) -> List[float]:
+ """
+ Get standard percentiles for the time between tokens in milliseconds.
+
+ :return: List of percentiles for the average time between tokens.
+ :rtype: List[float]
+ """
+ return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99])
+
+ @computed_field # type: ignore[misc]
+ @property
+ def output_token_throughput(self) -> float:
+ """
+ Get the average token throughput in tokens per second.
+
+ :return: The average token throughput.
+ :rtype: float
+ """
+ if not self.results or not self.duration:
+ return 0.0
+
+ total_tokens = sum(result.output_token_count for result in self.results)
+
+ return total_tokens / self.duration
+
+ @computed_field # type: ignore[misc]
+ @property
+ def prompt_token(self) -> float:
+ """
+ Get the average number of prompt tokens.
+
+ :return: The average number of prompt tokens.
+ :rtype: float
+ """
+ return self.prompt_token_distribution.mean
+
+ @property
+ def prompt_token_distribution(self) -> Distribution:
+ """
+ Get the distribution of prompt token counts.
+
+ :return: The distribution of prompt token counts.
+ :rtype: Distribution
+ """
+ return Distribution(data=[result.prompt_token_count for result in self.results])
+
+ @computed_field # type: ignore[misc]
+ @property
+ def prompt_token_percentiles(self) -> List[float]:
+ """
+ Get standard percentiles for number of prompt tokens.
+
+ :return: List of percentiles of number of prompt tokens.
+ :rtype: List[float]
+ """
+ return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99])
+
+ @computed_field # type: ignore[misc]
+ @property
+ def output_token(self) -> float:
+ """
+ Get the average number of output tokens.
+
+ :return: The average number of output tokens.
+ :rtype: float
+ """
+ return self.output_token_distribution.mean
+
+ @property
+ def output_token_distribution(self) -> Distribution:
+ """
+ Get the distribution of output token counts.
+
+ :return: The distribution of output token counts.
+ :rtype: Distribution
+ """
+ return Distribution(data=[result.output_token_count for result in self.results])
+
+ @computed_field # type: ignore[misc]
+ @property
+ def output_token_percentiles(self) -> List[float]:
+ """
+ Get standard percentiles for number of output tokens.
+
+ :return: List of percentiles of number of output tokens.
+ :rtype: List[float]
+ """
+ return self.output_token_distribution.percentiles([1, 5, 50, 95, 99])
+
+ @computed_field # type: ignore[misc]
+ @property
+ def overloaded(self) -> bool:
+ if (
+ self.rate is None
+ or not self.results
+ or not self.concurrencies
+ or len(self.concurrencies) < 2 # noqa: PLR2004
+ ):
+            # if rate was not set (synchronous mode is assumed) or we have
+            # fewer than 2 data points, we cannot be overloaded by definition
+ return False
+
+ # if the calculated rate is less than 75% of the requested rate,
+ # safe to assume the system is overloaded
+ return self.completed_request_rate < 0.75 * self.rate
+
+ def request_started(self):
+ """
+ Record the start of a generation request.
+ """
+ if not self.concurrencies:
+ self.concurrencies = [
+ RequestConcurrencyMeasurement(
+ time=time(),
+ completed=0,
+ errored=0,
+ processing=1,
+ ),
+ ]
+ else:
+ last = self.concurrencies[-1]
+ self.concurrencies.append(
+ RequestConcurrencyMeasurement(
+ time=time(),
+ completed=last.completed,
+ errored=last.errored,
+ processing=last.processing + 1,
+ ),
+ )
+
+ logger.info("Text generation request started")
+
+ def request_completed(
+ self,
+ result: Union[TextGenerationResult, TextGenerationError],
+ ):
+ """
+ Record the completion of a text generation request.
+
+ :param result: The completed result or error.
+ :type result: Union[TextGenerationResult, TextGenerationError]
+ """
+ if not self.concurrencies:
+ raise ValueError("Request completed without starting")
+
+ if isinstance(result, TextGenerationError):
+ is_error = True
+ self.errors.append(result)
+ logger.info(
+ "Text generation request resulted in error: {}",
+ result.message,
+ )
+ else:
+ if not result.start_time or not result.end_time:
+ raise ValueError("Start time and End time are not defined")
+
+ is_error = False
+ self.results.append(result)
+ logger.info("Text generation request completed successfully: {}", result)
+
+ last = self.concurrencies[-1]
+ self.concurrencies.append(
+ RequestConcurrencyMeasurement(
+ time=time(),
+ completed=last.completed + (not is_error),
+ errored=last.errored + is_error,
+ processing=last.processing - 1,
+ )
+ )
+
+
+class TextGenerationBenchmarkReport(Serializable):
+ """
+ A class to represent a report of text generation benchmarks
+ for generative AI workloads.
+ This is a collection of benchmarks for different modes and rates.
+ """
+
+ benchmarks: List[TextGenerationBenchmark] = Field(
+ default_factory=list,
+ description="The benchmarks of text generation requests.",
+ )
+ args: Dict[str, Any] = Field(
+ default_factory=dict,
+ description="The arguments used for the benchmarks.",
+ )
+
+ def __iter__(self):
+ return iter(self.benchmarks)
+
+ @property
+ def benchmarks_sorted(self) -> List[TextGenerationBenchmark]:
+ """
+ Get the list of benchmarks sorted by request rate.
+
+ :return: The sorted list of benchmarks.
+ :rtype: List[TextGenerationBenchmark]
+ """
+ return sorted(self.benchmarks, key=lambda x: x.completed_request_rate)
+
+ def add_benchmark(self, benchmark: TextGenerationBenchmark):
+ """
+ Add a result to the report.
+
+ :param benchmark: The result to add.
+ :type benchmark: TextGenerationBenchmark
+ """
+ self.benchmarks.append(benchmark)
+ logger.debug("Added result: {}", benchmark)
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e6b2944ebe0877ed813f3bc5a41147b91b60092
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py
@@ -0,0 +1,169 @@
+from pathlib import Path
+from typing import Any, Literal, Union, get_args
+
+import yaml
+from loguru import logger
+from pydantic import BaseModel, ConfigDict
+
+__all__ = ["Serializable", "SerializableFileType"]
+
+
+SerializableFileType = Literal["yaml", "json"]
+
+
+class Serializable(BaseModel):
+ """
+ A base class for models that require YAML and JSON serialization and
+ deserialization.
+ """
+
+ model_config = ConfigDict(
+ extra="forbid",
+ use_enum_values=True,
+ validate_assignment=True,
+ from_attributes=True,
+ )
+
+ def __init__(self, /, **data: Any) -> None:
+ super().__init__(**data)
+ logger.debug(
+ "Initialized new instance of {} with data: {}",
+ self.__class__.__name__,
+ data,
+ )
+
+ def to_yaml(self) -> str:
+ """
+ Serialize the model to a YAML string.
+
+ :return: YAML string representation of the model.
+ """
+ logger.debug("Serializing to YAML... {}", self)
+
+ return yaml.dump(self.model_dump())
+
+ @classmethod
+ def from_yaml(cls, data: str):
+ """
+ Deserialize a YAML string to a model instance.
+
+ :param data: YAML string to deserialize.
+ :return: An instance of the model.
+ """
+ logger.debug("Deserializing from YAML... {}", data)
+
+ return cls.model_validate(yaml.safe_load(data))
+
+ def to_json(self) -> str:
+ """
+ Serialize the model to a JSON string.
+
+ :return: JSON string representation of the model.
+ """
+ logger.debug("Serializing to JSON... {}", self)
+
+ return self.model_dump_json()
+
+ @classmethod
+ def from_json(cls, data: str):
+ """
+ Deserialize a JSON string to a model instance.
+
+ :param data: JSON string to deserialize.
+ :return: An instance of the model.
+ """
+ logger.debug("Deserializing from JSON... {}", data)
+
+ return cls.model_validate_json(data)
+
+ def save_file(
+ self,
+ path: Union[str, Path],
+ type_: SerializableFileType = "yaml",
+ ) -> str:
+ """
+ Save the model to a file in either YAML or JSON format.
+
+ :param path: Path to the exact file or the containing directory.
+ If it is a directory, the file name will be inferred from the class name.
+ :param type_: Optional type to save ('yaml' or 'json').
+ If not provided and the path has an extension,
+ it will be inferred to save in that format.
+ If not provided and the path does not have an extension,
+ it will save in YAML format.
+ :return: The path to the saved file.
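+
+        Example (illustrative sketch; the paths are assumptions and ``report``
+        is any Serializable instance):
+
+        ```python
+        # a directory path infers "<classname>.yaml" as the file name
+        report.save_file("/tmp/guidellm")
+        # an explicit .json suffix switches the output format to JSON
+        report.save_file("/tmp/guidellm/report.json")
+        ```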
+ """
+ logger.debug("Saving to file... {} with format: {}", path, type_)
+
+ if isinstance(path, str):
+ path = Path(path)
+
+ if path.suffix:
+ # is a file
+ ext = path.suffix[1:].lower()
+            if ext not in get_args(SerializableFileType):
+                raise ValueError(
+                    f"Unsupported file extension: {ext}. "
+                    f"Expected one of {SerializableFileType} "
+                    f"for {path}"
+                )
+            type_ = ext  # type: ignore # noqa: PGH003
+ else:
+ # is a directory
+ file_name = f"{self.__class__.__name__.lower()}.{type_}"
+ path = path / file_name
+
+ path.parent.mkdir(parents=True, exist_ok=True)
+
+ with path.open("w") as file:
+ if type_ == "yaml":
+ file.write(self.to_yaml())
+ elif type_ == "json":
+ file.write(self.to_json())
+ else:
+ raise ValueError(
+ f"Unsupported file extension: {type_}"
+ f"Expected one of {SerializableFileType} "
+ f"for {path}"
+ )
+
+ logger.info("Successfully saved {} to {}", self.__class__.__name__, path)
+
+ return str(path)
+
+ @classmethod
+ def load_file(cls, path: Union[str, Path]):
+ """
+ Load a model from a file in either YAML or JSON format.
+
+ :param path: Path to the file.
+ :return: An instance of the model.
+ """
+ logger.debug("Loading from file... {}", path)
+
+ if isinstance(path, str):
+ path = Path(path)
+
+ if not path.exists():
+ raise FileNotFoundError(f"File not found: {path}")
+
+ if not path.is_file():
+ raise ValueError(f"Path is not a file: {path}")
+
+ extension = path.suffix[1:].lower()
+
+ with path.open() as file:
+ data = file.read()
+
+ if extension == "yaml":
+ obj = cls.from_yaml(data)
+ elif extension == "json":
+ obj = cls.from_json(data)
+ else:
+ raise ValueError(
+ f"Unsupported file extension: {extension}"
+ f"Expected one of {SerializableFileType} "
+ f"for {path}"
+ )
+
+ return obj
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5858d072bfd8ae7e1092259ccba537f69e65743
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py
@@ -0,0 +1,10 @@
+from .base import Executor, ExecutorResult
+from .profile_generator import Profile, ProfileGenerationMode, ProfileGenerator
+
+__all__ = [
+ "Executor",
+ "ExecutorResult",
+ "Profile",
+ "ProfileGenerationMode",
+ "ProfileGenerator",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..865ab30de412797485b49d2175ac7f94ac3900ba
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py
@@ -0,0 +1,213 @@
+from dataclasses import dataclass
+from typing import AsyncGenerator, Optional, Sequence, Union
+
+from loguru import logger
+
+from guidellm.backend import Backend
+from guidellm.core import TextGenerationBenchmarkReport
+from guidellm.executor.profile_generator import (
+ Profile,
+ ProfileGenerationMode,
+ ProfileGenerator,
+)
+from guidellm.request import RequestGenerator
+from guidellm.scheduler import Scheduler, SchedulerResult
+
+__all__ = ["Executor", "ExecutorResult"]
+
+
+@dataclass
+class ExecutorResult:
+ """
+ Data class representing the result of executing tasks in the Executor.
+
+ :param completed: Indicates whether all tasks have completed.
+ :type completed: bool
+ :param count_total: Total number of profiles.
+ :type count_total: int
+ :param count_completed: Number of completed profiles.
+ :type count_completed: int
+    :param report: The benchmark report for the text generation run.
+ :type report: TextGenerationBenchmarkReport
+ :param scheduler_result: Optional scheduler result for the last task.
+ :type scheduler_result: Optional[SchedulerResult]
+ """
+
+ completed: bool
+ count_total: int
+ count_completed: int
+ generation_modes: Sequence[ProfileGenerationMode]
+ report: TextGenerationBenchmarkReport
+ scheduler_result: Optional[SchedulerResult] = None
+ current_index: Optional[int] = None
+ current_profile: Optional[Profile] = None
+
+
+class Executor:
+ """
+ The Executor class manages the execution of tasks based on a given profile
+ generation mode and rate. It orchestrates the interaction between the backend,
+ request generator, and profile generator, and runs benchmarks accordingly.
+
+ :param backend: The backend to run tasks against.
+ :type backend: Backend
+ :param request_generator: The generator that creates requests for execution.
+ :type request_generator: RequestGenerator
+ :param mode: The mode for profile generation (e.g., sweep, synchronous).
+ :type mode: ProfileGenerationMode
+    :param rate: The rate (or list of rates) for load generation, or None.
+    :type rate: Optional[Union[float, Sequence[float]]]
+ :param max_number: Maximum number of requests to generate for the scheduler
+ (a single report run), or None.
+ :type max_number: Optional[int]
+    :param max_duration: Maximum duration for generating requests for the scheduler
+        (a single report run), or None.
+ :type max_duration: Optional[float]
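+
+    Example wiring (illustrative sketch; ``backend`` and ``generator`` are
+    assumed to be an initialized Backend and RequestGenerator):
+
+    ```python
+    executor = Executor(
+        backend=backend,
+        request_generator=generator,
+        mode="constant",
+        rate=[1.0, 2.0],
+        max_number=100,
+    )
+    async for result in executor.run():
+        if result.completed:
+            print(result.report)
+    ```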
+ """
+
+ def __init__(
+ self,
+ backend: Backend,
+ request_generator: RequestGenerator,
+ mode: ProfileGenerationMode = "sweep",
+ rate: Optional[Union[float, Sequence[float]]] = None,
+ max_number: Optional[int] = None,
+ max_duration: Optional[float] = None,
+ ):
+ self._backend = backend
+ self._generator = request_generator
+ self._max_number = max_number
+ self._max_duration = max_duration
+ self._profile_generator = ProfileGenerator(mode=mode, rate=rate)
+ logger.info("Executor initialized with mode: {}, rate: {}", mode, rate)
+
+ @property
+ def backend(self) -> Backend:
+ """
+ Returns the backend being used by the Executor.
+
+ :return: Backend
+ :rtype: Backend
+ """
+ return self._backend
+
+ @property
+ def request_generator(self) -> RequestGenerator:
+ """
+ Returns the request generator used by the Executor.
+
+ :return: RequestGenerator
+ :rtype: RequestGenerator
+ """
+ return self._generator
+
+ @property
+ def profile_generator(self) -> ProfileGenerator:
+ """
+ Returns the profile generator for generating profiles during execution.
+
+ :return: ProfileGenerator
+ :rtype: ProfileGenerator
+ """
+ return self._profile_generator
+
+ @property
+ def max_number(self) -> Optional[int]:
+ """
+ Returns the maximum number of requests to generate.
+
+ :return: Maximum number of requests or None.
+ :rtype: Optional[int]
+ """
+ return self._max_number
+
+ @property
+ def max_duration(self) -> Optional[float]:
+ """
+ Returns the maximum duration for generating requests.
+
+ :return: Maximum duration in seconds or None.
+ :rtype: Optional[float]
+ """
+ return self._max_duration
+
+ async def run(self) -> AsyncGenerator[ExecutorResult, None]:
+ """
+ Runs the Executor, generating and scheduling tasks based on the profile
+ generation mode. Yields results incrementally.
+
+ :rtype: AsyncGenerator[ExecutorResult, None]
+ """
+ report = TextGenerationBenchmarkReport()
+ report.args = {
+ # backend args
+ "backend_type": self.backend.type_,
+ "target": self.backend.target,
+ "model": self.backend.model,
+ # data args
+ "data_type": self.request_generator.type_,
+ "data": self.request_generator.source,
+ "tokenizer": self.request_generator.tokenizer.name_or_path,
+ # rate args
+ "mode": self.profile_generator.mode,
+ "rate": self.profile_generator.rates,
+ # limits args
+ "max_number": self.max_number,
+ "max_duration": self.max_duration,
+ }
+ profile_index = -1
+ logger.info("Starting Executor run")
+
+ yield ExecutorResult(
+ completed=False,
+ count_total=len(self.profile_generator),
+ count_completed=0,
+ generation_modes=self.profile_generator.profile_generation_modes,
+ report=report,
+ )
+
+ while profile := self.profile_generator.next(report):
+ logger.debug("Generated profile: {}", profile)
+ scheduler = Scheduler(
+ generator=self.request_generator,
+ worker=self.backend,
+ mode=profile.load_gen_mode,
+ rate=profile.load_gen_rate,
+ max_number=self.max_number or profile.args.get("max_number", None),
+ max_duration=self.max_duration,
+ )
+ profile_index += 1
+
+ logger.info(
+ "Scheduling tasks with mode: {}, rate: {}",
+ profile.load_gen_mode,
+ profile.load_gen_rate,
+ )
+
+ async for scheduler_result in scheduler.run():
+ if scheduler_result.completed:
+ report.add_benchmark(scheduler_result.benchmark)
+ logger.debug(
+ "Benchmark added for scheduler result: {}",
+ scheduler_result.benchmark,
+ )
+
+ yield ExecutorResult(
+ completed=False,
+ count_total=len(self.profile_generator),
+ count_completed=len(report.benchmarks),
+ generation_modes=self.profile_generator.profile_generation_modes,
+ report=report,
+ scheduler_result=scheduler_result,
+ current_index=profile_index,
+ current_profile=profile,
+ )
+
+ logger.info("Executor run completed")
+ yield ExecutorResult(
+ completed=True,
+ count_total=len(self.profile_generator),
+ count_completed=len(report.benchmarks),
+ generation_modes=self.profile_generator.profile_generation_modes,
+ report=report,
+ )
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..757646cf668ed1be6983a955ead5d81460c6d71e
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py
@@ -0,0 +1,350 @@
+from typing import Any, Dict, List, Literal, Optional, Sequence, Union, get_args
+
+import numpy as np
+from loguru import logger
+from numpy._typing import NDArray
+from pydantic import Field
+
+from guidellm.config import settings
+from guidellm.core import TextGenerationBenchmark, TextGenerationBenchmarkReport
+from guidellm.core.serializable import Serializable
+from guidellm.scheduler import LoadGenerationMode
+
+__all__ = [
+ "Profile",
+ "ProfileGenerationMode",
+ "ProfileGenerator",
+]
+
+ProfileGenerationMode = Literal[
+ "sweep", "synchronous", "throughput", "constant", "poisson"
+]
+
+
+class Profile(Serializable):
+ """
+ A data class representing a profile for load generation.
+
+ :param load_gen_mode: The mode of load generation (e.g., constant, poisson).
+ :type load_gen_mode: LoadGenerationMode
+ :param load_gen_rate: The rate of load generation, if applicable.
+ :type load_gen_rate: Optional[float]
+ :param args: Additional arguments for the profile.
+ :type args: Optional[Dict[str, Any]]
+ """
+
+ load_gen_mode: LoadGenerationMode
+ load_gen_rate: Optional[float] = None
+ args: Dict[str, Any] = Field(default_factory=dict)
+
+
+class ProfileGenerator:
+ """
+ Generates profiles based on different load generation modes.
+
+ :param mode: The mode for profile generation (e.g., sweep, synchronous).
+ :type mode: ProfileGenerationMode
+ :param rate: The rate(s) for load generation; could be a float or list of floats.
+ :type rate: Optional[Union[float, Sequence[float]]]
+ """
+
+ def __init__(
+ self,
+ mode: ProfileGenerationMode,
+ rate: Optional[Union[float, Sequence[float]]] = None,
+ ):
+ if mode not in get_args(ProfileGenerationMode):
+ err = ValueError(
+ f"{mode} is not a valid Profile Generation Mode. "
+ f"Valid options are {get_args(ProfileGenerationMode)}"
+ )
+ logger.error(err)
+ raise err
+
+ self._mode = mode
+
+ if self._mode in ("sweep", "throughput", "synchronous"):
+ if rate is not None:
+ err = ValueError(f"Rates are not applicable for {self._mode} mode")
+ logger.error(err)
+ raise err
+ self._rates = None
+ else:
+ if not rate:
+ err = ValueError(f"Rates are required for {self._mode} mode")
+ logger.error(err)
+ raise err
+ self._rates = rate if isinstance(rate, Sequence) else [rate]
+
+ for rt in self._rates:
+ if rt <= 0:
+ err = ValueError(
+ f"Rate must be > 0 for mode: {self._mode}. Given: {rt}"
+ )
+ logger.error(err)
+ raise err
+
+ self._generated_count = 0
+
+ def __len__(self) -> int:
+ """
+ Returns the number of profiles to generate based on the mode and rates.
+
+ :return: The number of profiles.
+ :rtype: int
+ """
+ if self._mode == "sweep":
+ return settings.num_sweep_profiles + 2
+
+ if self._mode in ("throughput", "synchronous"):
+ return 1
+
+ if not self._rates:
+ raise ValueError(f"Rates are required for {self._mode} mode")
+
+ return len(self._rates)
+
+ @property
+ def mode(self) -> ProfileGenerationMode:
+ """
+ Returns the current mode of profile generation.
+
+ :return: The profile generation mode.
+ :rtype: ProfileGenerationMode
+ """
+ return self._mode
+
+ @property
+ def rates(self) -> Optional[Sequence[float]]:
+ """
+ Returns the list of rates for load generation, if any.
+
+ :return: Sequence of rates or None if not applicable.
+ :rtype: Optional[Sequence[float]]
+ """
+ return self._rates
+
+ @property
+ def generated_count(self) -> int:
+ """
+ Returns the current count of generated profiles.
+
+ :return: The current count of generated profiles.
+ :rtype: int
+ """
+ return self._generated_count
+
+ @property
+ def profile_generation_modes(self) -> Sequence[ProfileGenerationMode]:
+ """
+ Return the list of profile modes to be run in the report.
+
+ :return: Sequence of profile modes to be run in the report.
+ :rtype: Sequence[ProfileGenerationMode]
+ """
+ if self._mode == "sweep":
+ return ["synchronous", "throughput"] + ["constant"] * ( # type: ignore # noqa: PGH003
+ settings.num_sweep_profiles
+ )
+
+ if self._mode in ["throughput", "synchronous"]:
+ return [self._mode]
+
+ if self._rates is None:
+ raise ValueError(f"Rates are required for {self._mode} mode")
+
+ if self._mode in ["constant", "poisson"]:
+ return [self._mode] * len(self._rates)
+
+ raise ValueError(f"Invalid mode: {self._mode}")
+
+ def next(self, current_report: TextGenerationBenchmarkReport) -> Optional[Profile]:
+ """
+ Generates the next profile based on the current mode and report.
+
+        :param current_report: The current benchmark report.
+ :type current_report: TextGenerationBenchmarkReport
+ :return: The generated profile or None if no more profiles.
+ :rtype: Optional[Profile]
+ """
+ logger.debug(
+ "Generating the next profile with mode: {}, current report: {}",
+ self.mode,
+ current_report,
+ )
+
+ if self.mode in ["constant", "poisson"]:
+ if not self.rates:
+ err = ValueError(f"Rates are required for {self.mode} mode")
+ logger.error(err)
+ raise err
+
+ profile = self.create_fixed_rate_profile(
+ self.generated_count,
+ self.mode,
+ self.rates,
+ )
+ elif self.mode == "synchronous":
+ profile = self.create_synchronous_profile(self.generated_count)
+ elif self.mode == "throughput":
+ profile = self.create_throughput_profile(self.generated_count)
+ elif self.mode == "sweep":
+ profile = self.create_sweep_profile(
+ self.generated_count,
+ sync_benchmark=(
+ current_report.benchmarks[0] if current_report.benchmarks else None
+ ),
+ throughput_benchmark=(
+ current_report.benchmarks[1]
+ if len(current_report.benchmarks) > 1
+ else None
+ ),
+ )
+ else:
+ err = ValueError(f"Invalid mode: {self.mode}")
+ logger.error(err)
+ raise err
+
+ self._generated_count += 1
+ logger.info(
+ "Generated profile: {}, total generated count: {}",
+ profile,
+ self._generated_count,
+ )
+ return profile
+
+ @staticmethod
+ def create_fixed_rate_profile(
+ index: int, mode: ProfileGenerationMode, rates: Sequence[float]
+ ) -> Optional[Profile]:
+ """
+ Creates a profile with a fixed rate.
+
+ :param index: The index of the rate in the list.
+ :type index: int
+ :param mode: The mode for profile generation (e.g., constant, poisson).
+ :type mode: ProfileGenerationMode
+ :param rates: The list of rates for load generation.
+ :type rates: Sequence[float]
+ :return: The generated profile or None if index is out of range.
+ :rtype: Optional[Profile]
+ """
+ modes_map: Dict[str, LoadGenerationMode] = {
+ "constant": "constant",
+ "poisson": "poisson",
+ }
+
+ if mode not in modes_map:
+ err = ValueError(f"Invalid mode: {mode}")
+ logger.error(err)
+ raise err
+
+ profile = (
+ Profile(
+ load_gen_mode=modes_map[mode],
+ load_gen_rate=rates[index],
+ )
+ if index < len(rates)
+ else None
+ )
+ logger.debug("Created fixed rate profile: {}", profile)
+ return profile
+
+ @staticmethod
+ def create_synchronous_profile(index: int) -> Optional[Profile]:
+ """
+ Creates a profile with synchronous mode.
+
+ :param index: The index of the profile to create.
+ :type index: int
+ :return: The generated profile or None if index is out of range.
+ :rtype: Optional[Profile]
+ """
+ profile = (
+ Profile(
+ load_gen_mode="synchronous",
+ load_gen_rate=None,
+ )
+ if index < 1
+ else None
+ )
+ logger.debug("Created synchronous profile: {}", profile)
+ return profile
+
+ @staticmethod
+ def create_throughput_profile(index: int) -> Optional[Profile]:
+ """
+ Creates a profile with throughput mode.
+
+ :param index: The index of the profile to create.
+ :type index: int
+ :return: The generated profile or None if index is out of range.
+ :rtype: Optional[Profile]
+ """
+ profile = (
+ Profile(
+ load_gen_mode="throughput",
+ load_gen_rate=None,
+ )
+ if index < 1
+ else None
+ )
+ logger.debug("Created throughput profile: {}", profile)
+ return profile
+
+ @staticmethod
+ def create_sweep_profile(
+ index: int,
+ sync_benchmark: Optional[TextGenerationBenchmark],
+ throughput_benchmark: Optional[TextGenerationBenchmark],
+ ) -> Optional[Profile]:
+ """
+ Creates a profile with sweep mode, generating profiles between
+ synchronous and throughput benchmarks.
+
+ :param index: The index of the profile to create.
+ :type index: int
+        :param sync_benchmark: The synchronous benchmark data.
+        :type sync_benchmark: Optional[TextGenerationBenchmark]
+        :param throughput_benchmark: The throughput benchmark data.
+        :type throughput_benchmark: Optional[TextGenerationBenchmark]
+ :return: The generated profile or None if index is out of range.
+ :rtype: Optional[Profile]
+ """
+ if index < 0 or index >= settings.num_sweep_profiles + 2:
+ return None
+
+ if index == 0:
+ return ProfileGenerator.create_synchronous_profile(0)
+
+ if not sync_benchmark:
+ err = ValueError("Synchronous report is required for sweep mode")
+ logger.error(err)
+ raise err
+
+ if index == 1:
+ throughput_profile: Profile = ProfileGenerator.create_throughput_profile(0) # type: ignore # noqa: PGH003
+            # cap the throughput run at 5 times the synchronous request count,
+            # in case max_number is not set, to limit the requests for the sweep
+ throughput_profile.args = {"max_number": sync_benchmark.request_count * 5}
+ return throughput_profile
+
+ if not throughput_benchmark:
+ err = ValueError("Throughput report is required for sweep mode")
+ logger.error(err)
+ raise err
+
+ min_rate = sync_benchmark.completed_request_rate
+ max_rate = throughput_benchmark.completed_request_rate
+ intermediate_rates: List[NDArray] = list(
+ np.linspace(min_rate, max_rate, settings.num_sweep_profiles + 1)
+ )[1:]
+
+ return Profile(
+ load_gen_mode="constant",
+ load_gen_rate=(
+ float(load_gen_rate)
+ if (load_gen_rate := intermediate_rates[index - 2])
+ else 1.0 # the fallback value
+ ),
+ )
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..f26966c029ac8e173031822233e971ec7512144b
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py
@@ -0,0 +1,83 @@
+"""
+Logger configuration for GuideLLM.
+
+This module provides a flexible logging configuration using the loguru library.
+It supports console and file logging with options to configure via environment
+variables or direct function calls.
+
+Environment Variables:
+ - GUIDELLM__LOGGING__DISABLED: Disable logging (default: false).
+ - GUIDELLM__LOGGING__CLEAR_LOGGERS: Clear existing loggers
+ from loguru (default: true).
+ - GUIDELLM__LOGGING__LOG_LEVEL: Log level for console logging
+ (default: none, options: DEBUG, INFO, WARNING, ERROR, CRITICAL).
+ - GUIDELLM__LOGGING__FILE: Path to the log file for file logging
+ (default: guidellm.log if log file level set else none)
+ - GUIDELLM__LOGGING__FILE_LEVEL: Log level for file logging
+ (default: INFO if log file set else none).
+
+Usage:
+    from guidellm.logger import configure_logger, logger
+    from guidellm.config import LoggingSettings
+
+    # Configure the logger with explicit settings
+    configure_logger(
+        config=LoggingSettings(
+            disabled=False,
+            clear_loggers=True,
+            console_log_level="DEBUG",
+            log_file=None,
+            log_file_level=None,
+        )
+    )
+
+ logger.debug("This is a debug message")
+ logger.info("This is an info message")
+"""
+
+import sys
+
+from loguru import logger
+
+from guidellm.config import LoggingSettings, settings
+
+__all__ = ["configure_logger", "logger"]
+
+
+def configure_logger(config: LoggingSettings = settings.logging):
+ """
+    Configure the logger for GuideLLM.
+ This function sets up the console and file logging
+ as per the specified or default parameters.
+
+ Note: Environment variables take precedence over the function parameters.
+
+ :param config: The configuration for the logger to use.
+    :type config: LoggingSettings
+ """
+
+ if config.disabled:
+ logger.disable("guidellm")
+ return
+
+ logger.enable("guidellm")
+
+ if config.clear_loggers:
+ logger.remove()
+
+ # log as a human readable string with the time, function, level, and message
+ logger.add(
+ sys.stdout,
+ level=config.console_log_level.upper(),
+ format="{time} | {function} | {level} - {message}",
+ )
+
+ if config.log_file or config.log_file_level:
+ log_file = config.log_file or "guidellm.log"
+ log_file_level = config.log_file_level or "INFO"
+ # log as json to the file for easier parsing
+ logger.add(log_file, level=log_file_level.upper(), serialize=True)
+
+
+# invoke logger setup on import with default values
+# enabling console logging with INFO and disabling file logging
+configure_logger()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..4748b12d92126698ad18e55388db5e6491293cb6
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py
@@ -0,0 +1,341 @@
+import asyncio
+from typing import Literal, Optional, Union, get_args
+
+import click
+from loguru import logger
+
+from guidellm.backend import Backend, BackendEnginePublic
+from guidellm.core import GuidanceReport, TextGenerationBenchmarkReport
+from guidellm.executor import Executor, ProfileGenerationMode
+from guidellm.request import (
+ EmulatedRequestGenerator,
+ FileRequestGenerator,
+ TransformersDatasetRequestGenerator,
+)
+from guidellm.request.base import RequestGenerator
+from guidellm.utils import BenchmarkReportProgress, cli_params
+
+__all__ = ["generate_benchmark_report"]
+
+
+@click.command()
+@click.option(
+ "--target",
+ type=str,
+ required=True,
+ help=(
+ "The target path or url for the backend to evaluate. "
+ "Ex: 'http://localhost:8000/v1'"
+ ),
+)
+@click.option(
+ "--backend",
+ type=click.Choice(get_args(BackendEnginePublic)),
+ default="openai_server",
+ help=(
+ "The backend to use for benchmarking. "
+ "The default is OpenAI Server enabling compatability with any server that "
+ "follows the OpenAI spec including vLLM."
+ ),
+)
+@click.option(
+ "--model",
+ type=str,
+ default=None,
+ help=(
+ "The Model to use for benchmarking. If not provided, it will use "
+ "the first available model provided the backend supports listing models."
+ ),
+)
+@click.option(
+ "--data",
+ type=str,
+ required=True,
+ help=(
+ "The data source to use for benchmarking. "
+ "Depending on the data-type, it should be a "
+ "path to a data file containing prompts to run (ex: data.txt), "
+ "a HuggingFace dataset name (ex: 'neuralmagic/LLM_compression_calibration'), "
+ "or a configuration for emulated data "
+ "(ex: 'prompt_tokens=128,generated_tokens=128')."
+ ),
+)
+@click.option(
+ "--data-type",
+ type=click.Choice(["emulated", "file", "transformers"]),
+ required=True,
+ help=(
+ "The type of data to use for benchmarking. "
+ "Use 'emulated' for synthetic data, 'file' for a file, or 'transformers' "
+ "for a HuggingFace dataset. Specify the data source with the --data flag."
+ ),
+)
+@click.option(
+ "--tokenizer",
+ type=str,
+ default=None,
+ help=(
+ "The tokenizer to use for calculating the number of prompt tokens. "
+ "This should match the tokenizer used by the model."
+ "By default, it will use the --model flag to determine the tokenizer. "
+ "If not provided and the model is not available, will raise an error. "
+ "Ex: 'neuralmagic/Meta-Llama-3.1-8B-quantized.w8a8'"
+ ),
+)
+@click.option(
+ "--rate-type",
+ type=click.Choice(get_args(ProfileGenerationMode)),
+ default="sweep",
+ help=(
+ "The type of request rate to use for benchmarking. "
+ "Use sweep to run a full range from synchronous to throughput (default), "
+ "synchronous for sending requests one after the other, "
+ "throughput to send requests as fast as possible, "
+ "constant for a fixed request rate, "
+ "or poisson for a real-world variable request rate."
+ ),
+)
+@click.option(
+ "--rate",
+ type=float,
+ default=None,
+ help=(
+ "The request rate to use for constant and poisson rate types. "
+ "To run multiple, provide the flag multiple times. "
+ ),
+ multiple=True,
+)
+@click.option(
+ "--max-seconds",
+ type=int,
+ default=120,
+ help=(
+ "The maximum number of seconds for each benchmark run. "
+ "Either max-seconds, max-requests, or both must be set. "
+ "The default is 120 seconds. "
+ "Note, this is the maximum time for each rate supplied, not the total time. "
+ "This value should be large enough to allow for "
+ "the server's performance to stabilize."
+ ),
+)
+@click.option(
+ "--max-requests",
+ type=cli_params.MAX_REQUESTS,
+ default=None,
+ help=(
+ "The maximum number of requests for each benchmark run. "
+ "Either max-seconds, max-requests, or both must be set. "
+ "Note, this is the maximum number of requests for each rate supplied, "
+ "not the total number of requests. "
+ "This value should be large enough to allow for "
+ "the server's performance to stabilize."
+ ),
+)
+@click.option(
+ "--output-path",
+ type=str,
+ default=None,
+ help=(
+ "The output path to save the output report to for loading later. "
+ "Ex: guidance_report.json. "
+ "The default is None, meaning no output is saved and results are only "
+ "printed to the console."
+ ),
+)
+@click.option(
+ "--enable-continuous-refresh",
+ is_flag=True,
+ default=False,
+ help=(
+ "Enable continual refreshing of the output table in the CLI "
+ "until the user exits. "
+ ),
+)
+def generate_benchmark_report_cli(
+ target: str,
+ backend: BackendEnginePublic,
+ model: Optional[str],
+ data: Optional[str],
+ data_type: Literal["emulated", "file", "transformers"],
+ tokenizer: Optional[str],
+ rate_type: ProfileGenerationMode,
+ rate: Optional[float],
+ max_seconds: Optional[int],
+ max_requests: Union[Literal["dataset"], int, None],
+ output_path: str,
+ enable_continuous_refresh: bool,
+):
+ """
+ Generate a benchmark report for a specified backend and dataset.
+ """
+ generate_benchmark_report(
+ target=target,
+ backend=backend,
+ model=model,
+ data=data,
+ data_type=data_type,
+ tokenizer=tokenizer,
+ rate_type=rate_type,
+ rate=rate,
+ max_seconds=max_seconds,
+ max_requests=max_requests,
+ output_path=output_path,
+ cont_refresh_table=enable_continuous_refresh,
+ )
+
+
+def generate_benchmark_report(
+ target: str,
+ data: Optional[str],
+ data_type: Literal["emulated", "file", "transformers"],
+ backend: BackendEnginePublic="openai_server",
+ model: Optional[str]=None,
+ tokenizer: Optional[str]=None,
+ rate_type: ProfileGenerationMode="sweep",
+ rate: Optional[float]=None,
+ max_seconds: Optional[int]=120,
+ max_requests: Union[Literal["dataset"], int, None]=None,
+ output_path: str=None,
+ cont_refresh_table: bool=False,
+) -> GuidanceReport:
+ """
+ Generate a benchmark report for a specified backend and dataset.
+
+ :param target: The target URL or path for the backend to evaluate.
+ :param backend: The backend type to use for benchmarking.
+ :param model: The model to benchmark;
+ defaults to the first available if not specified.
+ :param data: The data source for benchmarking,
+ which may be a path, dataset name, or config.
+ :param data_type: The type of data to use,
+ such as 'emulated', 'file', or 'transformers'.
+ :param tokenizer: The tokenizer to use for token counting,
+        defaulting to the model's tokenizer if not provided.
+ :param rate_type: The rate type for requests during benchmarking.
+ :param rate: The specific request rate for constant and poisson rate types.
+ :param max_seconds: Maximum duration for each benchmark run in seconds.
+ :param max_requests: Maximum number of requests per benchmark run.
+ :param output_path: Path to save the output report file.
+ :param cont_refresh_table: Continually refresh the table in the CLI
+ until the user exits.
+ """
+ logger.info(
+ "Generating benchmark report with target: {}, backend: {}", target, backend
+ )
+
+ # Create backend
+ backend_inst = Backend.create(
+ backend_type=backend,
+ target=target,
+ model=model,
+ )
+
+ request_generator: RequestGenerator
+
+ # Create tokenizer and request generator
+ tokenizer_inst = tokenizer
+ if not tokenizer_inst:
+ try:
+ tokenizer_inst = backend_inst.model_tokenizer()
+ except Exception as err:
+ raise ValueError(
+ "Could not load model's tokenizer, "
+ "--tokenizer must be provided for request generation"
+ ) from err
+
+ if data_type == "emulated":
+ request_generator = EmulatedRequestGenerator(
+ config=data, tokenizer=tokenizer_inst
+ )
+ elif data_type == "file":
+ request_generator = FileRequestGenerator(path=data, tokenizer=tokenizer_inst)
+ elif data_type == "transformers":
+ request_generator = TransformersDatasetRequestGenerator(
+ dataset=data, tokenizer=tokenizer_inst
+ )
+ else:
+ raise ValueError(f"Unknown data type: {data_type}")
+
+ if data_type == "emulated" and max_requests == "dataset":
+ raise ValueError("Cannot use 'dataset' for emulated data")
+
+ # Create executor
+ executor = Executor(
+ backend=backend_inst,
+ request_generator=request_generator,
+ mode=rate_type,
+ rate=rate if rate_type in ("constant", "poisson") else None,
+ max_number=(
+ len(request_generator) if max_requests == "dataset" else max_requests
+ ),
+ max_duration=max_seconds,
+ )
+
+ # Run executor
+ logger.debug(
+ "Running executor with args: {}",
+ {
+ "backend": backend,
+ "request_generator": request_generator,
+ "mode": rate_type,
+ "rate": rate,
+ "max_number": max_requests,
+ "max_duration": max_seconds,
+ },
+ )
+ report = asyncio.run(_run_executor_for_result(executor))
+
+ # Save and print report
+ guidance_report = GuidanceReport()
+ guidance_report.benchmarks.append(report)
+
+ if output_path:
+ guidance_report.save_file(output_path)
+
+ guidance_report.print(
+ save_path=output_path if output_path is not None else "stdout",
+ continual_refresh=cont_refresh_table,
+ )
+
+ return guidance_report
+
+
+async def _run_executor_for_result(executor: Executor) -> TextGenerationBenchmarkReport:
+ report = None
+ progress = BenchmarkReportProgress()
+ started = False
+
+ async for result in executor.run():
+ if not started:
+ progress.start(result.generation_modes) # type: ignore # noqa: PGH003
+ started = True
+
+ if result.current_index is not None:
+ description = f"{result.current_profile.load_gen_mode}" # type: ignore # noqa: PGH003
+ if result.current_profile.load_gen_mode in ("constant", "poisson"): # type: ignore # noqa: PGH003
+ description += f"@{result.current_profile.load_gen_rate:.2f} req/s" # type: ignore # noqa: PGH003
+
+ progress.update_benchmark(
+ index=result.current_index,
+ description=description,
+ completed=result.scheduler_result.completed, # type: ignore # noqa: PGH003
+ completed_count=result.scheduler_result.count_completed, # type: ignore # noqa: PGH003
+ completed_total=result.scheduler_result.count_total, # type: ignore # noqa: PGH003
+ start_time=result.scheduler_result.benchmark.start_time, # type: ignore # noqa: PGH003
+ req_per_sec=result.scheduler_result.benchmark.completed_request_rate, # type: ignore # noqa: PGH003
+ )
+
+ if result.completed:
+ report = result.report
+ break
+
+ progress.finish()
+
+ if not report:
+ raise ValueError("No report generated by executor")
+
+ return report
+
+
+if __name__ == "__main__":
+ generate_benchmark_report_cli()
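+
+
+# Example invocation (a sketch; the target URL and emulated data config are
+# illustrative, assuming the package is importable as `guidellm`):
+#   python -m guidellm.main --target "http://localhost:8000/v1" \
+#     --data-type emulated --data "prompt_tokens=128,generated_tokens=128" \
+#     --rate-type sweep --max-seconds 120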
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4feca91cdbbe9a137bd8ad404394116a50868360
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py
@@ -0,0 +1,13 @@
+from .base import GenerationMode, RequestGenerator
+from .emulated import EmulatedConfig, EmulatedRequestGenerator
+from .file import FileRequestGenerator
+from .transformers import TransformersDatasetRequestGenerator
+
+__all__ = [
+ "EmulatedConfig",
+ "EmulatedRequestGenerator",
+ "FileRequestGenerator",
+ "GenerationMode",
+ "RequestGenerator",
+ "TransformersDatasetRequestGenerator",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9fd303e605f7043408c7751733c75e7429caa726
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py
@@ -0,0 +1,200 @@
+import contextlib
+import threading
+import time
+from abc import ABC, abstractmethod
+from queue import Empty, Full, Queue
+from typing import Iterator, Literal, Union
+
+from loguru import logger
+from transformers import ( # type: ignore # noqa: PGH003
+ AutoTokenizer,
+ PreTrainedTokenizer,
+)
+
+from guidellm.core.request import TextGenerationRequest
+
+__all__ = ["GenerationMode", "RequestGenerator"]
+
+
+GenerationMode = Literal["async", "sync"]
+
+
+class RequestGenerator(ABC):
+ """
+ A base class for request generators that generate result requests.
+
+ :param type_: The type of the request generator.
+ :type type_: str
+ :param source: The data source for the request generator.
+ :type source: str
+ :param tokenizer: The tokenizer instance or the name/config to use
+ for tokenizing prompts.
+ :type tokenizer: Union[str, PreTrainedTokenizer]
+ :param mode: The generation mode, either 'async' or 'sync'.
+ :type mode: GenerationMode
+ :param async_queue_size: The size of the request queue.
+ :type async_queue_size: int
+ """
+
+ def __init__(
+ self,
+ type_: str,
+ source: str,
+ tokenizer: Union[str, PreTrainedTokenizer],
+ mode: GenerationMode = "async",
+ async_queue_size: int = 50,
+ ):
+ self._type = type_
+ self._source = source
+ self._async_queue_size: int = async_queue_size
+ self._mode: str = mode
+ self._queue: Queue = Queue(maxsize=async_queue_size)
+ self._stop_event: threading.Event = threading.Event()
+
+ if not tokenizer:
+ err = "Tokenizer must be provided for request generation"
+ logger.error(err)
+ raise ValueError(err)
+
+ self._tokenizer = (
+ AutoTokenizer.from_pretrained(tokenizer)
+ if isinstance(tokenizer, str)
+ else tokenizer
+ )
+ logger.info("Tokenizer initialized for request generation: {}", self._tokenizer)
+
+ if self._mode == "async":
+ self._thread = threading.Thread(target=self._populate_queue, daemon=True)
+ self._thread.start()
+ logger.info(
+ "RequestGenerator started in async mode with queue size: {}",
+ self._async_queue_size,
+ )
+
+ def __repr__(self) -> str:
+ """
+ Return a string representation of the RequestGenerator.
+
+ :return: String representation of the RequestGenerator.
+ :rtype: str
+ """
+ return (
+ f"RequestGenerator("
+ f"mode={self._mode}, "
+ f"async_queue_size={self._async_queue_size}, "
+ f"tokenizer={self._tokenizer})"
+ )
+
+ def __iter__(self) -> Iterator[TextGenerationRequest]:
+ """
+ Provide an iterator interface to generate new requests.
+
+ :return: An iterator over result requests.
+ :rtype: Iterator[TextGenerationRequest]
+ """
+ if self.mode == "async":
+ while not self._stop_event.is_set():
+ try:
+ item = self._queue.get_nowait()
+ self._queue.task_done()
+ yield item
+ except Empty:
+ time.sleep(0.01)
+ continue
+ else:
+ while not self._stop_event.is_set():
+ yield self.create_item()
+
+ @abstractmethod
+ def __len__(self) -> int:
+ """
+ Abstract method to get the length of the collection to be generated.
+ """
+
+ @abstractmethod
+ def create_item(self) -> TextGenerationRequest:
+ """
+ Abstract method to create a new result request item.
+
+ :return: A new result request.
+ :rtype: TextGenerationRequest
+ """
+
+ @property
+ def type_(self) -> str:
+ """
+ Get the type of the request generator.
+
+ :return: The type of the request generator.
+ :rtype: str
+ """
+ return self._type
+
+ @property
+ def source(self) -> str:
+ """
+ Get the data source for the request generator.
+
+ :return: The data source.
+ :rtype: str
+ """
+ return self._source
+
+ @property
+ def tokenizer(self) -> PreTrainedTokenizer:
+ """
+ Get the tokenizer instance.
+
+ :return: The tokenizer instance.
+ :rtype: PreTrainedTokenizer
+ """
+ return self._tokenizer
+
+ @property
+ def mode(self) -> str:
+ """
+ Get the generation mode.
+
+ :return: The generation mode.
+ :rtype: str
+ """
+ return self._mode
+
+ @property
+ def async_queue_size(self) -> int:
+ """
+ Get the size of the request queue.
+
+ :return: The size of the request queue.
+ :rtype: int
+ """
+ return self._async_queue_size
+
+ def stop(self):
+ """
+ Stop the background task that populates the queue.
+ """
+ logger.info("Stopping RequestGenerator...")
+ self._stop_event.set()
+ if self._mode == "async":
+ self._thread.join()
+ logger.info("RequestGenerator stopped")
+
+ def _populate_queue(self):
+ """
+ Populate the request queue in the background.
+ """
+
+ while not self._stop_event.is_set():
+ with contextlib.suppress(Full):
+ if self._queue.qsize() < self._async_queue_size:
+ item = self.create_item()
+ self._queue.put(item, timeout=0.1)
+ logger.debug(
+ "Item added to queue. Current queue size: {}",
+ self._queue.qsize(),
+ )
+ else:
+ time.sleep(0.1)
+
+ logger.info("RequestGenerator stopped populating queue")
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py
new file mode 100644
index 0000000000000000000000000000000000000000..02f564a1ceecd9e977ce0b8d5c37a0394adeb69a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py
@@ -0,0 +1,416 @@
+import json
+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple, Union
+
+import numpy as np
+from loguru import logger
+from transformers import PreTrainedTokenizer # type: ignore # noqa: PGH003
+
+from guidellm.config import settings
+from guidellm.core.request import TextGenerationRequest
+from guidellm.request.base import GenerationMode, RequestGenerator
+from guidellm.utils import clean_text, filter_text, load_images, load_text, split_text
+
+__all__ = ["EmulatedConfig", "EmulatedRequestGenerator", "EndlessTokens"]
+
+
+@dataclass
+class EmulatedConfig:
+ """
+ Configuration for emulated text generation requests.
+
+ Args:
+ prompt_tokens (int): Number of prompt tokens.
+ prompt_tokens_variance (Optional[int]): Variance for prompt tokens.
+ prompt_tokens_min (Optional[int]): Minimum number of prompt tokens.
+ prompt_tokens_max (Optional[int]): Maximum number of prompt tokens.
+ generated_tokens (Optional[int]): Number of generated tokens.
+ generated_tokens_variance (Optional[int]): Variance for generated tokens.
+ generated_tokens_min (Optional[int]): Minimum number of generated tokens.
+ generated_tokens_max (Optional[int]): Maximum number of generated tokens.
+ images (Optional[int]): Number of images.
+ width (Optional[int]): Width of images.
+ height (Optional[int]): Height of images.
+ """
+
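+    # A sketch of the accepted config formats (values are illustrative):
+    #   EmulatedConfig.create_config("prompt_tokens=128,generated_tokens=256")
+    #   EmulatedConfig.create_config({"prompt_tokens": 128, "generated_tokens": 256})
+    #   EmulatedConfig.create_config('{"prompt_tokens": 128, "generated_tokens": 256}')
+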
+ @staticmethod
+ def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig":
+ """
+ Create an EmulatedConfig instance from a configuration source.
+
+ :param config: Configuration source, can be a dictionary, JSON string,
+ key=value string, or file path.
+ :type config: Union[str, Path, Dict]
+ :return: An instance of EmulatedConfig.
+ :rtype: EmulatedConfig
+ :raises FileNotFoundError: If the configuration file is not found.
+ :raises ValueError: If the configuration format is invalid.
+ """
+ if not config:
+ logger.debug("Creating default configuration")
+ return EmulatedConfig(prompt_tokens=1024, generated_tokens=256, images=0)
+
+ if isinstance(config, dict):
+ logger.debug("Loading configuration from dict: {}", config)
+ return EmulatedConfig(**config)
+
+ if isinstance(config, Path) or (
+ isinstance(config, str) and (config.endswith(".json") or "{" in config)
+ ):
+ logger.debug("Loading configuration from json: {}", config)
+
+ if isinstance(config, str) and "{" in config:
+ json_text = config.strip()
+ else:
+ if isinstance(config, str):
+ config = Path(config)
+
+ if not config.exists():
+ raise FileNotFoundError(f"Configuration file not found: {config}")
+
+ json_text = config.read_text(encoding="utf-8")
+
+ json_dict = json.loads(json_text)
+
+ return EmulatedConfig(**json_dict)
+
+ if isinstance(config, str) and "=" in config:
+ logger.debug("Loading configuration from csv string: {}", config)
+ items = config.split(",")
+ config_dict = {}
+ for item in items:
+ key_value = item.strip().split("=")
+ if len(key_value) != 2: # noqa: PLR2004
+ raise ValueError(f"Unexpected format for item: {item}")
+ key = key_value[0].strip()
+ value = (
+ int(key_value[1].strip())
+ if key_value[1].isnumeric()
+ else key_value[1]
+ )
+ config_dict[key] = value
+
+ return EmulatedConfig(**config_dict) # type: ignore # noqa: PGH003
+
+ raise ValueError(
+ f"Invalid configuration given for creation of EmulatedConfig: {config}"
+ )
+
+ prompt_tokens: int
+ prompt_tokens_variance: Optional[int] = None
+ prompt_tokens_min: Optional[int] = None
+ prompt_tokens_max: Optional[int] = None
+
+ generated_tokens: Optional[int] = None
+ generated_tokens_variance: Optional[int] = None
+ generated_tokens_min: Optional[int] = None
+ generated_tokens_max: Optional[int] = None
+
+ images: int = 0
+    width: Optional[int] = None
+    height: Optional[int] = None
+
+ @property
+ def prompt_tokens_range(self) -> Tuple[int, int]:
+ """
+ Get the range (min, max) of prompt tokens to generate.
+
+ :return: The range of prompt tokens.
+ :rtype: Tuple[int, int]
+ """
+ return self._token_range(
+ self.prompt_tokens,
+ self.prompt_tokens_variance,
+ self.prompt_tokens_min,
+ self.prompt_tokens_max,
+ )
+
+ @property
+ def output_tokens_range(self) -> Tuple[int, int]:
+ """
+ Get the range (min, max) of output tokens to generate.
+
+ :return: The range of generated tokens.
+ :rtype: Tuple[int, int]
+ """
+ if not self.generated_tokens:
+ return 0, 0
+
+ return self._token_range(
+ self.generated_tokens,
+ self.generated_tokens_variance,
+ self.generated_tokens_min,
+ self.generated_tokens_max,
+ )
+
+ def sample_prompt_tokens(self, rng: np.random.Generator) -> int:
+ """
+ Sample the number of prompt tokens to generate.
+
+ :param rng: The random number generator to use.
+ :type rng: np.random.Generator
+ :return: The number of prompt tokens to create.
+ :rtype: int
+ """
+ return self._sample_tokens(
+ self.prompt_tokens,
+ self.prompt_tokens_variance,
+ self.prompt_tokens_min,
+ self.prompt_tokens_max,
+ rng,
+ )
+
+ def sample_output_tokens(self, rng: np.random.Generator) -> Optional[int]:
+ """
+ Sample the number of output tokens to generate.
+
+ :param rng: The random number generator to use.
+ :type rng: np.random.Generator
+ :return: The number of output tokens to generate.
+ :rtype: Optional[int]
+ """
+ if not self.generated_tokens:
+ return None
+
+ return self._sample_tokens(
+ self.generated_tokens,
+ self.generated_tokens_variance,
+ self.generated_tokens_min,
+ self.generated_tokens_max,
+ rng,
+ )
+
+ @staticmethod
+ def _sample_tokens(
+ base: int,
+ variance: Optional[int],
+ min_tokens: Optional[int],
+ max_tokens: Optional[int],
+ rng: np.random.Generator,
+ ) -> int:
+ min_tokens, max_tokens = EmulatedConfig._token_range(
+ base, variance, min_tokens, max_tokens
+ )
+
+ if min_tokens == max_tokens:
+ return min_tokens
+
+ if not variance:
+ return rng.integers(min_tokens, max_tokens + 1)
+
+ rand = rng.normal(base, math.sqrt(variance))
+
+ return int(min(max(rand, min_tokens), max_tokens))
+
+ @staticmethod
+ def _token_range(
+ base: int,
+ variance: Optional[int],
+ min_tokens: Optional[int],
+ max_tokens: Optional[int],
+ ) -> Tuple[int, int]:
+ if not variance:
+ return (
+ min_tokens or base,
+ max_tokens or base,
+ )
+
+ min_tokens = min_tokens if min_tokens and min_tokens > 0 else 1
+ max_tokens = (
+ max_tokens if max_tokens and max_tokens > base else base + 5 * variance
+ )
+
+ return min_tokens, max_tokens
+
+
+class EndlessTokens(List[str]):
+ """
+ A list subclass that allows for endless data generation.
+ """
+
+ def __init__(
+ self,
+ data: Union[str, Path],
+ filter_start: Optional[Union[str, int]] = None,
+ filter_end: Optional[Union[str, int]] = None,
+ clean_text_args: Optional[Dict[str, bool]] = None,
+ ):
+ """
+ Initialize EndlessDataWords with data.
+
+ :param data: Source text data.
+ :type data: str
+ """
+ logger.debug("Loading data from: {}", data)
+ data = load_text(data)
+ data = filter_text(data, filter_start, filter_end)
+ data = (
+ clean_text(data)
+ if not clean_text_args
+ else clean_text(data, **clean_text_args)
+ )
+ self._tokens, self._token_separators, self._line_indices = split_text(data)
+
+ super().__init__(self._tokens)
+
+ @property
+ def line_indices(self) -> List[int]:
+ """
+ Get the list of start indices for lines.
+
+ :return: List of start indices.
+ :rtype: List[int]
+ """
+ return self._line_indices
+
+ def create_text(self, start: int, length: int) -> str:
+ """
+ Create a text snippet from the specified range.
+
+ :param start: Start index.
+ :type start: int
+ :param length: Length of the snippet.
+ :type length: int
+ :return: Text snippet.
+ :rtype: str
+ """
+ start = start % len(self)
+ text = ""
+ buff_token_sep = ""
+
+ for counter in range(length):
+ index = (start + counter) % len(self)
+ text += buff_token_sep + self[index]
+ buff_token_sep = self._token_separators[index]
+
+ return text
+
+
+class EmulatedRequestGenerator(RequestGenerator):
+ """
+ A request generator that generates emulated requests based on a configuration.
+
+ :param config: The configuration string, file path, or dictionary.
+ :type config: Union[str, Dict, Path]
+ :param random_seed: The random seed to use for generating requests.
+ :type random_seed: Optional[int]
+ :param tokenizer: The tokenizer instance or the name/config to use
+ for tokenizing prompts.
+ :type tokenizer: Optional[Union[str, PreTrainedTokenizer]]
+ :param mode: The generation mode, either 'async' or 'sync'.
+ :type mode: GenerationMode
+ :param async_queue_size: The size of the request queue.
+ :type async_queue_size: int
+ """
+
+ def __init__(
+ self,
+ config: Optional[Union[str, Path, Dict]],
+ random_seed: Optional[int] = None,
+ tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+ mode: GenerationMode = "async",
+ async_queue_size: int = 50,
+ ):
+ """
+ Initialize EmulatedRequestGenerator with configuration and tokenizer.
+
+ :param config: Configuration source, can be a dictionary,
+ JSON string, or file path.
+ :type config: Optional[Union[str, Path, Dict]]
+ :param random_seed: Optional seed for random number generator.
+ :type random_seed: Optional[int]
+ :param tokenizer: Tokenizer instance or configuration for tokenizing prompts.
+ :type tokenizer: Optional[Union[str, PreTrainedTokenizer]]
+ :param mode: Mode of request generation, either 'async' or 'sync'.
+ :type mode: str
+ :param async_queue_size: Size of the asynchronous queue.
+ :type async_queue_size: int
+ """
+ self._config = EmulatedConfig.create_config(config)
+ self._tokens = EndlessTokens(
+ settings.emulated_data.source,
+ settings.emulated_data.filter_start,
+ settings.emulated_data.filter_end,
+ )
+ if self._config.images > 0:
+            self._images = load_images(
+                settings.emulated_data.image_source,
+                [self._config.width, self._config.height],
+            )
+ self._rng = np.random.default_rng(random_seed)
+
+ # NOTE: Must be after all the parameters since the queue population
+ # function requires attributes above
+ super().__init__(
+ type_="emulated",
+ source=str(config),
+ tokenizer=tokenizer,
+ mode=mode,
+ async_queue_size=async_queue_size,
+ )
+
+ def __len__(self) -> int:
+ raise NotImplementedError(
+ "Can't get the length of the emulated dataset. "
+ "Check the `--data-type` CLI parameter."
+ )
+
+ def create_item(self) -> TextGenerationRequest:
+ """
+ Create a new text generation request item from the data.
+
+ :return: A new text generation request.
+ :rtype: TextGenerationRequest
+ """
+ logger.debug("Creating new text generation request")
+ target_prompt_token_count = self._config.sample_prompt_tokens(self._rng)
+ prompt = self.sample_prompt(target_prompt_token_count)
+ images = self.sample_images()
+ prompt_token_count = len(self.tokenizer.tokenize(prompt))
+ output_token_count = self._config.sample_output_tokens(self._rng)
+ logger.debug("Generated prompt: {}", prompt)
+
+ return TextGenerationRequest(
+ prompt=prompt,
+ prompt_token_count=prompt_token_count,
+ output_token_count=output_token_count,
+ images=images,
+ )
+
+ def sample_prompt(self, tokens: int) -> str:
+ """
+ Sample a prompt with the specified number of tokens.
+
+ :param tokens: Number of tokens for the prompt.
+ :type tokens: int
+ :return: Sampled prompt text.
+ :rtype: str
+ """
+ start_line_index = self._rng.integers(0, len(self._tokens.line_indices))
+
+ # binary search to find the proper number of tokens for the prompt
+ # this is because tokenizers differ in tokenization behavior
+ left = 0
+ right = left + 5 * tokens
+
+ while left < right:
+ mid = (left + right) // 2
+ prompt = self._tokens.create_text(start_line_index, mid)
+ token_count = len(self.tokenizer.tokenize(prompt))
+
+ if token_count == tokens:
+ return prompt
+
+ if token_count < tokens:
+ left = mid + 1
+ else:
+ right = mid
+
+ return self._tokens.create_text(start_line_index, left)
+
+    def sample_images(self):
+        # no images configured: the image list was never loaded, so return empty
+        if not self._config.images:
+            return []
+
+        image_indices = self._rng.choice(
+            len(self._images), size=self._config.images, replace=False,
+        )
+
+        return [self._images[i] for i in image_indices]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py
new file mode 100644
index 0000000000000000000000000000000000000000..b187f7b46b343311daa32fe465d60fff163beff2
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py
@@ -0,0 +1,83 @@
+from pathlib import Path
+from typing import Optional, Union
+
+from loguru import logger
+from transformers import PreTrainedTokenizer # type: ignore # noqa: PGH003
+
+from guidellm.config import settings
+from guidellm.core.request import TextGenerationRequest
+from guidellm.request.base import GenerationMode, RequestGenerator
+from guidellm.utils import load_text_lines
+
+__all__ = ["FileRequestGenerator"]
+
+
+class FileRequestGenerator(RequestGenerator):
+ """
+ A request generator implementation for files.
+
+ :param path: The path to the file containing the data.
+ :type path: Optional[Union[str, Path]]
+ :param tokenizer: The tokenizer instance or the name/config to use
+ for tokenizing prompts.
+ :type tokenizer: Union[str, PreTrainedTokenizer]
+ :param mode: The generation mode, either 'async' or 'sync'.
+ :type mode: str
+ :param async_queue_size: The size of the request queue.
+ :type async_queue_size: int
+ """
+
+ def __init__(
+ self,
+ path: Optional[Union[str, Path]],
+ tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+ mode: GenerationMode = "async",
+ async_queue_size: int = 50,
+ ):
+ if not path:
+ raise ValueError("File path must be provided for FileRequestGenerator")
+
+ self._path = path
+ self._data = load_text_lines(
+ path,
+ filters=settings.dataset.preferred_data_columns,
+ )
+ self._iterator = iter(self._data)
+
+ # NOTE: Must be after all the parameters since the queue population
+ # function requires attributes above
+ super().__init__(
+ type_="file",
+ source=str(path),
+ tokenizer=tokenizer,
+ mode=mode,
+ async_queue_size=async_queue_size,
+ )
+
+ def __len__(self) -> int:
+ """
+ Return the number of text lines.
+ """
+
+ return len(self._data)
+
+ def create_item(self) -> TextGenerationRequest:
+ """
+ Create a new result request item from the data.
+
+ :return: A new result request.
+ :rtype: TextGenerationRequest
+ """
+ logger.debug("Creating new request item from file data")
+
+ try:
+ data = next(self._iterator)
+ except StopIteration:
+ self._iterator = iter(self._data)
+ data = next(self._iterator)
+
+ token_count = len(self.tokenizer.tokenize(data))
+ request = TextGenerationRequest(prompt=data, prompt_token_count=token_count)
+ logger.debug("Created new TextGenerationRequest: {}", request)
+
+ return request
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fd24040d3e59a95a69a2b829552bbca83bc5338
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py
@@ -0,0 +1,103 @@
+from pathlib import Path
+from typing import Optional, Union
+
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
+from loguru import logger
+from transformers import PreTrainedTokenizer # type: ignore # noqa: PGH003
+
+from guidellm.core.request import TextGenerationRequest
+from guidellm.request.base import GenerationMode, RequestGenerator
+from guidellm.utils import (
+ load_transformers_dataset,
+ resolve_transformers_dataset_column,
+)
+
+__all__ = ["TransformersDatasetRequestGenerator"]
+
+
+class TransformersDatasetRequestGenerator(RequestGenerator):
+ """
+ A request generator implementation for Hugging Face datasets.
+
+    :param dataset: The name of the Hugging Face dataset to use or the path
+        to a local dataset.
+    :type dataset: Union[str, Path, DatasetDict, Dataset, IterableDatasetDict,
+        IterableDataset]
+ :param split: The split of the dataset to use (e.g., 'train', 'test').
+ :type split: str
+ :param column: The column/field to use for generating requests.
+ :type column: str
+ :param tokenizer: The tokenizer instance or the name/config to use
+ for tokenizing prompts.
+ :type tokenizer: Union[str, PreTrainedTokenizer]
+ :param mode: The generation mode, either 'async' or 'sync'.
+ :type mode: str
+ :param async_queue_size: The size of the request queue.
+ :type async_queue_size: int
+ """
+
+ def __init__(
+ self,
+ dataset: Union[
+ str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset
+ ],
+ split: Optional[str] = None,
+ column: Optional[str] = None,
+ tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+ mode: GenerationMode = "async",
+ async_queue_size: int = 50,
+ **kwargs,
+ ):
+ self._dataset = dataset
+ self._split = split
+ self._column = column
+ self._kwargs = kwargs
+
+ self._hf_dataset: Union[Dataset, IterableDataset] = load_transformers_dataset(
+ dataset, split=split, **kwargs
+ )
+ self._hf_column = resolve_transformers_dataset_column(
+ self._hf_dataset, column=column
+ )
+ self._hf_dataset_iterator = iter(self._hf_dataset)
+
+ # NOTE: Must be after all the parameters since the queue population
+ # function requires attributes above
+ super().__init__(
+ type_="transformers_dataset",
+ source=str(dataset),
+ tokenizer=tokenizer,
+ mode=mode,
+ async_queue_size=async_queue_size,
+ )
+
+ def __len__(self) -> int:
+ if not isinstance(self._hf_dataset, Dataset):
+ raise ValueError("Can't get dataset size for IterableDataset object")
+
+        return len(self._hf_dataset)
+
+ def create_item(self) -> TextGenerationRequest:
+ """
+ Create a new result request item from the dataset.
+
+ :return: A new result request.
+ :rtype: TextGenerationRequest
+ """
+
+ logger.debug("Creating new request item from dataset")
+
+ try:
+ data = next(self._hf_dataset_iterator)
+ except StopIteration:
+ self._hf_dataset_iterator = iter(self._hf_dataset)
+ data = next(self._hf_dataset_iterator)
+
+ prompt = data[self._hf_column]
+ token_count = len(self.tokenizer.tokenize(prompt))
+ request = TextGenerationRequest(
+ prompt=prompt,
+ prompt_token_count=token_count,
+ )
+ logger.debug(f"Created new TextGenerationRequest: {request}")
+
+ return request
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3b4ac50c647ad1adea9ad4368ac58727c11bc19
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py
@@ -0,0 +1,4 @@
+from .base import Scheduler, SchedulerResult
+from .load_generator import LoadGenerationMode, LoadGenerator
+
+__all__ = ["LoadGenerationMode", "LoadGenerator", "Scheduler", "SchedulerResult"]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..602166b01a88d5e9525f3277b2b39602e3a82fd6
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py
@@ -0,0 +1,374 @@
+import asyncio
+import math
+import time
+from dataclasses import dataclass
+from typing import AsyncGenerator, Literal, Optional, Union, get_args
+
+from loguru import logger
+
+from guidellm.backend import Backend
+from guidellm.config import settings
+from guidellm.core import (
+ TextGenerationBenchmark,
+ TextGenerationError,
+ TextGenerationRequest,
+ TextGenerationResult,
+)
+from guidellm.request import RequestGenerator
+from guidellm.scheduler.load_generator import LoadGenerationMode, LoadGenerator
+
+__all__ = ["Scheduler", "SchedulerResult"]
+
+
+@dataclass
+class SchedulerResult:
+ """
+ Represents the result of a single task execution within the Scheduler.
+
+ :param completed: Indicates if the task is completed.
+ :type completed: bool
+ :param count_total: Total number of tasks to be executed.
+ :type count_total: int
+ :param count_completed: Number of tasks that have been completed so far.
+ :type count_completed: int
+    :param benchmark: Benchmark data for the task execution.
+    :type benchmark: TextGenerationBenchmark
+ :param current_result: The result of the current request, if any.
+ :type current_result: Optional[Union[TextGenerationResult, Exception]]
+ """
+
+ completed: bool
+ count_total: int
+ count_completed: int
+ benchmark: TextGenerationBenchmark
+ current_result: Optional[Union[TextGenerationResult, TextGenerationError]] = None
+
+
+class Scheduler:
+ """
+ Schedules and manages the execution of tasks for text generation requests.
+
+ :param generator: The request generator that produces text generation requests.
+ :type generator: RequestGenerator
+ :param worker: The backend worker that processes the requests.
+ :type worker: Backend
+ :param mode: The mode of load generation (e.g., synchronous, asynchronous).
+ :type mode: LoadGenerationMode
+ :param rate: The rate at which requests are generated, if applicable.
+ :type rate: Optional[float]
+ :param max_number: Maximum number of requests to be processed.
+ :type max_number: Optional[int]
+ :param max_duration: Maximum duration in seconds for which requests
+ should be processed.
+ :type max_duration: Optional[float]
+
+ :raises ValueError: If neither max_number nor max_duration is specified or
+ if they are not positive.
+ """
+
+ def __init__(
+ self,
+ generator: RequestGenerator,
+ worker: Backend,
+ mode: LoadGenerationMode = "synchronous",
+ rate: Optional[float] = None,
+ max_number: Optional[int] = None,
+ max_duration: Optional[float] = None,
+ ):
+ logger.info(
+ "Scheduler initialized with params: generator={}, worker={}, mode={}, "
+ "rate={}, max_number={}, max_duration={}",
+ generator,
+ worker,
+ mode,
+ rate,
+ max_number,
+ max_duration,
+ )
+
+ if mode not in get_args(LoadGenerationMode):
+ err = ValueError(
+ f"{mode} is not a valid Load Generation Mode. "
+ f"Valid options are {get_args(LoadGenerationMode)}"
+ )
+ logger.error(err)
+ raise err
+
+ if not max_number and not max_duration:
+ err = ValueError("Either max_number or max_duration must be specified")
+ logger.error(err)
+ raise err
+
+ if max_number and max_number <= 0:
+ err = ValueError(f"max_number must be > 0, given: {max_number}")
+ logger.error(err)
+ raise err
+
+ if max_duration and max_duration <= 0:
+ err = ValueError(f"max_duration must be > 0, given: {max_duration}")
+ logger.error(err)
+ raise err
+
+ if mode in ["constant", "poisson"] and not rate:
+ err = ValueError(f"Rate must be > 0 for mode: {mode}. Given: {rate}")
+ logger.error(err)
+ raise err
+
+ self._generator = generator
+ self._worker = worker
+ self._mode = mode
+ self._rate = rate
+ self._max_number = max_number
+ self._max_duration = max_duration
+
+ self._load_generator = LoadGenerator(mode, rate)
+
+ @property
+ def generator(self) -> RequestGenerator:
+ """
+ The request generator that produces text generation requests.
+
+ :return: The request generator instance.
+ :rtype: RequestGenerator
+ """
+ return self._generator
+
+ @property
+ def worker(self) -> Backend:
+ """
+ The backend worker that processes the requests.
+
+ :return: The backend worker instance.
+ :rtype: Backend
+ """
+ return self._worker
+
+ @property
+ def mode(self) -> LoadGenerationMode:
+ """
+ The mode of load generation (e.g., synchronous, asynchronous).
+
+ :return: The load generation mode.
+ :rtype: LoadGenerationMode
+ """
+ return self._mode
+
+ @property
+ def rate(self) -> Optional[float]:
+ """
+ The rate at which requests are generated, if applicable.
+
+ :return: The rate of request generation.
+ :rtype: Optional[float]
+ """
+ return self._rate
+
+ @property
+ def max_number(self) -> Optional[int]:
+ """
+ Maximum number of requests to be processed.
+
+ :return: The maximum number of requests.
+ :rtype: Optional[int]
+ """
+ return self._max_number
+
+ @property
+ def max_duration(self) -> Optional[float]:
+ """
+ Maximum duration in seconds for which requests should be processed.
+
+ :return: The maximum duration in seconds.
+ :rtype: Optional[float]
+ """
+ return self._max_duration
+
+ @property
+ def load_generator(self) -> LoadGenerator:
+ """
+ The load generator responsible for generating load based on mode and rate.
+
+ :return: The load generator instance.
+ :rtype: LoadGenerator
+ """
+ return self._load_generator
+
+ @property
+ def benchmark_mode(self) -> Literal["asynchronous", "synchronous", "throughput"]:
+ """
+        The benchmark mode for the scheduler, derived from the load generation mode.
+
+        :return: The benchmark mode.
+ :rtype: Literal["asynchronous", "synchronous", "throughput"]
+ """
+ if self._mode == "synchronous":
+ return "synchronous"
+
+ if self._mode == "throughput":
+ return "throughput"
+
+ return "asynchronous"
+
+ async def run(self) -> AsyncGenerator[SchedulerResult, None]:
+ """
+ Run the scheduler to process requests based on the configured mode, rate,
+ maximum number, and maximum duration.
+
+ :yield: The result of each task executed by the scheduler.
+ :rtype: AsyncGenerator[SchedulerResult, None]
+ """
+ logger.info("Starting Scheduler run")
+
+ benchmark = TextGenerationBenchmark(mode=self.benchmark_mode, rate=self.rate)
+ start_time = time.time()
+ end_time = start_time + self.max_duration if self.max_duration else math.inf
+ max_number = float(self.max_number) if self.max_number else math.inf
+ runner = self._run_sync if self._mode == "synchronous" else self._run_async
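+ # count_total drives progress reporting: it is the request budget when max_number is set, otherwise the configured duration in seconds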
+ count_total = (
+ self.max_number
+ if self.max_number
+ else round(self.max_duration)
+ if self.max_duration
+ else 0
+ )
+
+ # yield initial result for progress tracking
+ yield SchedulerResult(
+ completed=False,
+ count_total=count_total,
+ count_completed=0,
+ benchmark=benchmark,
+ )
+
+ run_count = 0
+ async for res in runner(benchmark, end_time, max_number):
+ run_count += 1
+ count_completed = (
+ min(run_count, self.max_number)
+ if self.max_number
+ else round(time.time() - start_time)
+ if self.max_duration
+ else 0
+ )
+
+ yield SchedulerResult(
+ completed=False,
+ count_total=count_total,
+ count_completed=count_completed,
+ benchmark=benchmark,
+ current_result=res,
+ )
+
+ logger.info("Scheduler run completed")
+
+ yield SchedulerResult(
+ completed=True,
+ count_total=count_total,
+ count_completed=(
+ benchmark.request_count + benchmark.error_count
+ if self.max_number
+ else round(time.time() - start_time)
+ if self.max_duration
+ else 0
+ ),
+ benchmark=benchmark,
+ )
+
+ async def _run_sync(
+ self, benchmark: TextGenerationBenchmark, end_time: float, max_number: float
+ ) -> AsyncGenerator[Union[TextGenerationResult, TextGenerationError], None]:
+ for index, (request, submit_at) in enumerate(
+ zip(self.generator, self.load_generator.times())
+ ):
+ if index >= max_number or time.time() >= end_time:
+ break
+
+ logger.debug(
+ "Running synchronous request={} at submit_at={}",
+ request,
+ submit_at,
+ )
+ benchmark.request_started()
+ result = await self._submit_task_coroutine(request, submit_at, end_time)
+ if result is not None:
+ benchmark.request_completed(result)
+ logger.debug("Request completed with output: {}", result)
+ yield result
+
+ async def _run_async(
+ self, benchmark: TextGenerationBenchmark, end_time: float, max_number: float
+ ) -> AsyncGenerator[Union[TextGenerationResult, TextGenerationError], None]:
+ tasks = []
+ completed = 0
+
+ for index, (request, submit_at) in enumerate(
+ zip(self.generator, self.load_generator.times())
+ ):
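+ # throttle submissions: wait until the number of in-flight requests drops below the configured concurrency limit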
+ while (index + 1 - completed) >= settings.max_concurrency:
+ await asyncio.sleep(0.1)
+
+ if index >= max_number or time.time() >= end_time or submit_at >= end_time:
+ break
+
+ logger.debug(
+ "Running asynchronous request={} at submit_at={}",
+ request,
+ submit_at,
+ )
+
+ def _completed(_task: asyncio.Task) -> None:
+ nonlocal completed
+ completed += 1
+ _res = _task.result()
+
+ if _res:
+ benchmark.request_completed(_res)
+ logger.debug("Request completed: {}", _res)
+
+ benchmark.request_started()
+ task = asyncio.create_task(
+ self._submit_task_coroutine(request, submit_at, end_time)
+ )
+ task.add_done_callback(_completed)
+ tasks.append(task)
+
+ # release control to the event loop for other tasks
+ await asyncio.sleep(0.001)
+
+ for compl_task in asyncio.as_completed(tasks):
+ task_res = await compl_task
+ if task_res is not None:
+ yield task_res
+
+ async def _submit_task_coroutine(
+ self, request: TextGenerationRequest, submit_at: float, end_time: float
+ ) -> Optional[Union[TextGenerationResult, TextGenerationError]]:
+ try:
+ if submit_at > end_time:
+ logger.info(
+ "Request {} submission time {} is greater than end time {}",
+ request,
+ submit_at,
+ end_time,
+ )
+ raise asyncio.TimeoutError(
+ f"Request submission time {submit_at} "
+ f"is greater than end time {end_time}"
+ )
+
+ if submit_at > time.time():
+ await asyncio.sleep(submit_at - time.time())
+
+ timeout = (
+ end_time - time.time() if end_time and end_time < math.inf else None
+ )
+
+ return await asyncio.wait_for(self._worker.submit(request), timeout=timeout)
+ except asyncio.TimeoutError as exc:
+ logger.info("Request {} timed out: {}", request, exc)
+
+ return None
+ except Exception as exc: # noqa: BLE001
+ logger.warning("Request {} failed: {}", request, exc)
+
+ return TextGenerationError(request=request, message=str(exc))
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..f629752ab2e1b961615ece6ba1e90f48274e89a2
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py
@@ -0,0 +1,196 @@
+import time
+from typing import Generator, Literal, Optional, get_args
+
+import numpy as np
+from loguru import logger
+
+__all__ = ["LoadGenerationMode", "LoadGenerator"]
+
+LoadGenerationMode = Literal["synchronous", "constant", "poisson", "throughput"]
+
+
+class LoadGenerator:
+ """
+ Load Generator class that generates timestamps for load generation.
+
+ This class supports multiple load generation modes: "constant", "poisson",
+ "throughput", and "synchronous". Each mode has its own method for generating
+ timestamps based on the rate provided during initialization.
+
+ :param mode: The mode of load generation. Valid options are "constant",
+ "poisson", "throughput", and "synchronous".
+ :type mode: LoadGenerationMode
+ :param rate: The rate at which to generate timestamps. This value is
+ interpreted differently depending on the mode.
+ :type rate: float
+
+ :raises ValueError: If an invalid mode is provided.
+ """
+
+ def __init__(self, mode: LoadGenerationMode, rate: Optional[float] = None):
+ """
+ Initialize the Load Generator with the mode and rate.
+
+ :param mode: The mode of load generation ("constant", "poisson", "throughput",
+ or "synchronous").
+ :type mode: LoadGenerationMode
+ :param rate: The rate at which to generate timestamps. In the "constant"
+ mode, this represents the frequency of events. In the "poisson" mode,
+ it represents the average frequency.
+ :type rate: Optional[float]
+ """
+ if mode not in get_args(LoadGenerationMode):
+ error = ValueError(
+ f"{mode} is not a valid Load Generation Mode. "
+ f"Valid options are {get_args(LoadGenerationMode)}"
+ )
+ logger.error(error)
+ raise error
+
+ if mode not in ["synchronous", "throughput"] and (rate is None or rate <= 0):
+ error = ValueError(f"Rate must be > 0 for mode: {mode}. Given: {rate}")
+ logger.error(error)
+ raise error
+
+ self._mode = mode
+ self._rate = rate
+ logger.debug(
+ "Initialized LoadGenerator with mode: {mode}, rate: {rate}",
+ mode=mode,
+ rate=rate,
+ )
+
+ @property
+ def mode(self) -> LoadGenerationMode:
+ """
+ Get the mode of load generation.
+
+ :return: The mode of load generation.
+ :rtype: LoadGenerationMode
+ """
+ return self._mode
+
+ @property
+ def rate(self) -> Optional[float]:
+ """
+ Get the rate of load generation.
+
+ :return: The rate of load generation.
+ :rtype: Optional[float]
+ """
+ return self._rate
+
+ def times(self) -> Generator[float, None, None]:
+ """
+ Generate timestamps for load generation based on the selected mode.
+
+ :return: A generator that yields timestamps at which each load
+ should be initiated.
+ :rtype: Generator[float, None, None]
+
+ :raises ValueError: If the mode is invalid.
+ """
+ logger.debug(f"Generating timestamps using mode: {self._mode}")
+
+ if self._mode == "throughput":
+ yield from self.throughput_times()
+ elif self._mode == "constant":
+ yield from self.constant_times()
+ elif self._mode == "poisson":
+ yield from self.poisson_times()
+ elif self._mode == "synchronous":
+ yield from self.synchronous_times()
+ else:
+ logger.error(f"Invalid mode encountered: {self._mode}")
+ raise ValueError(f"Invalid mode: {self._mode}")
+
+ def synchronous_times(self) -> Generator[float, None, None]:
+ """
+ Generate invalid timestamps for the "synchronous" mode.
+
+ :return: A generator that yields a constant invalid timestamp (-1.0).
+ :rtype: Generator[float, None, None]
+ """
+ logger.debug("Generating invalid timestamps for synchronous mode")
+ while True:
+ yield -1.0
+
+ def throughput_times(self) -> Generator[float, None, None]:
+ """
+ Generate timestamps at the maximum rate possible, returning the current time.
+
+ :return: A generator that yields the current time in seconds.
+ :rtype: Generator[float, None, None]
+ """
+ logger.debug("Generating timestamps at throughput rate")
+ while True:
+ yield time.time()
+
+ def constant_times(self) -> Generator[float, None, None]:
+ """
+ Generate timestamps at a constant rate based on the specified rate.
+
+ :return: A generator that yields timestamps incremented by 1/rate seconds.
+ :rtype: Generator[float, None, None]
+ """
+ logger.debug("Generating constant rate timestamps with rate: {}", self._rate)
+
+ if self._rate is None or self._rate <= 0:
+ raise ValueError(
+ f"Rate must be > 0 for constant mode, given: {self._rate}"
+ )
+
+ start_time = time.time()
+ time_increment = 1.0 / self._rate
+ counter = 0
+
+ while True:
+ yield_time = start_time + time_increment * counter
+ logger.debug(f"Yielding timestamp: {yield_time}")
+ yield yield_time
+ counter += 1
+
+ def poisson_times(self) -> Generator[float, None, None]:
+ """
+ Generate timestamps based on a Poisson process, where the number
+ of requests to be sent per second is drawn from a Poisson distribution.
+ The inter-arrival time between requests is exponentially distributed.
+
+ :return: A generator that yields timestamps based on a Poisson distribution.
+ :rtype: Generator[float, None, None]
+ """
+ logger.debug("Generating Poisson rate timestamps with rate: {}", self._rate)
+
+ if self._rate is None or self._rate <= 0:
+ raise ValueError(f"Rate must be > 0 for poisson mode, given: {self._rate}")
+
+ time_tracker = time.time()
+ rng = np.random.default_rng()
+ time_increment = 1.0
+
+ while True:
+ num_requests = rng.poisson(self._rate)
+
+ if num_requests == 0:
+ yield time_tracker + time_increment
+ else:
+ inter_arrival_times = rng.exponential(1.0 / self._rate, num_requests)
+ logger.debug(
+ "Calculated new inter-arrival times for poisson process: {}",
+ inter_arrival_times,
+ )
+ arrival_time_tracker = time_tracker
+
+ for arrival_time in inter_arrival_times:
+ arrival_time_tracker += arrival_time
+
+ if arrival_time_tracker > time_tracker + time_increment:
+ logger.debug(
+ "Arrival time tracker: {} is greater than current time",
+ arrival_time_tracker,
+ )
+ break
+
+ yield arrival_time_tracker
+
+ time_tracker += time_increment # Move on to the next time period
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb4931bdabcddce94d443a809876950424c803f5
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py
@@ -0,0 +1,43 @@
+from .images import ImageDescriptor, load_images
+from .injector import create_report, inject_data
+from .progress import BenchmarkReportProgress
+from .text import (
+ clean_text,
+ filter_text,
+ is_path,
+ is_path_like,
+ is_url,
+ load_text,
+ load_text_lines,
+ parse_text_objects,
+ split_lines_by_punctuation,
+ split_text,
+)
+from .transformers import (
+ load_transformers_dataset,
+ resolve_transformers_dataset,
+ resolve_transformers_dataset_column,
+ resolve_transformers_dataset_split,
+)
+
+__all__ = [
+ "BenchmarkReportProgress",
+ "clean_text",
+ "create_report",
+ "filter_text",
+ "inject_data",
+ "is_path",
+ "is_path_like",
+ "is_url",
+ "load_text",
+ "load_text_lines",
+ "load_transformers_dataset",
+ "parse_text_objects",
+ "resolve_transformers_dataset",
+ "resolve_transformers_dataset_column",
+ "resolve_transformers_dataset_split",
+ "split_lines_by_punctuation",
+ "split_text",
+ "ImageDescriptor",
+ "load_images",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e8800d2abf8df387de691bda21073c643f9129b
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py
@@ -0,0 +1,34 @@
+"""
+This module includes custom CLI parameters for the `click` package.
+"""
+
+from typing import Any, Optional
+
+from click import Context, Parameter, ParamType
+
+__all__ = ["MAX_REQUESTS"]
+
+
+class MaxRequestsType(ParamType):
+ """
+ Catch the `dataset` string parameter to determine the behavior of the Scheduler.
+ """
+
+ name = "max_requests"
+
+ def convert(
+ self, value: Any, param: Optional[Parameter], ctx: Optional[Context]
+ ) -> Any:
+ if isinstance(value, int):
+ return value
+
+ try:
+ return int(value)
+ except ValueError:
+ if value == "dataset":
+ return value
+ else:
+ self.fail(f"{value} is not a valid integer or 'dataset'", param, ctx)
+
+
+MAX_REQUESTS = MaxRequestsType()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb66d4321309c39f92b6e1cf3ce737f7bf5c2f4c
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py
@@ -0,0 +1,80 @@
+from io import BytesIO
+from typing import List, Optional, Tuple
+from urllib.parse import urljoin
+
+import requests
+from bs4 import BeautifulSoup
+from loguru import logger
+from PIL import Image
+from pydantic import ConfigDict, Field, computed_field
+
+from guidellm.config import settings
+from guidellm.core.serializable import Serializable
+
+__all__ = ["load_images", "ImageDescriptor"]
+
+class ImageDescriptor(Serializable):
+ """
+ A class to represent image data in serializable format.
+ """
+ model_config = ConfigDict(arbitrary_types_allowed=True)
+
+ url: Optional[str] = Field(description="url address for image.")
+ image: Image.Image = Field(description="PIL image", exclude=True)
+ filename: Optional[str] = Field(
+ default=None,
+ description="Image filename.",
+ )
+
+ @computed_field # type: ignore[misc]
+ @property
+ def image_resolution(self) -> Optional[Tuple[int, int]]:
+ if self.image is None:
+ return None
+ else:
+ return self.image.size
+
+
+def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]:
+ """
+ Load the images referenced by an HTML page given its path or URL
+
+ :param data: the path or URL of the HTML page to load images from
+ :type data: str
+ :param image_resolution: optional (width, height) to resize each image to
+ :type image_resolution: Optional[List[int]]
+ :return: descriptors containing each image url and its data as a PIL.Image.Image
+ :rtype: List[ImageDescriptor]
+ """
+
+ images: List[ImageDescriptor] = []
+ if not data:
+ return images
+ if isinstance(data, str) and data.startswith("http"):
+ response = requests.get(data, timeout=settings.request_timeout)
+ response.raise_for_status()
+
+ soup = BeautifulSoup(response.text, "html.parser")
+ for img_tag in soup.find_all("img"):
+ img_url = img_tag.get("src")
+
+ if img_url:
+ # Handle relative URLs
+ img_url = urljoin(data, img_url)
+
+ # Download the image
+ logger.debug("Loading image: {}", img_url)
+ img_response = requests.get(img_url, timeout=settings.request_timeout)
+ img_response.raise_for_status()
+ image = Image.open(BytesIO(img_response.content))
+
+ if image_resolution is not None:
+ image = image.resize(image_resolution)
+
+ # Load image into Pillow
+ images.append(
+ ImageDescriptor(
+ url=img_url,
+ image=image,
+ )
+ )
+
+ return images
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb5216aa65fe83328015af1517e049fadd344677
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py
@@ -0,0 +1,70 @@
+from pathlib import Path
+from typing import Union
+
+from pydantic import BaseModel
+
+from guidellm.config import settings
+from guidellm.utils.text import load_text
+
+__all__ = ["create_report", "inject_data"]
+
+
+def create_report(model: BaseModel, output_path: Union[str, Path]) -> Path:
+ """
+ Creates a report from the model and saves it to the output path.
+
+ :param model: the model to serialize and inject
+ :type model: BaseModel
+ :param output_path: the path, either a file or a directory,
+ to save the report to. If a directory, the report will be saved
+ as "report.html" inside of the directory.
+ :type output_path: str
+ :return: the path to the saved report
+ :rtype: str
+ """
+ if not isinstance(output_path, Path):
+ output_path = Path(output_path)
+
+ html_content = load_text(settings.report_generation.source)
+ report_content = inject_data(
+ model,
+ html_content,
+ settings.report_generation.report_html_match,
+ settings.report_generation.report_html_placeholder,
+ )
+
+ if not output_path.suffix:
+ # assume directory, save as report.html
+ output_path = output_path / "report.html"
+
+ output_path.parent.mkdir(parents=True, exist_ok=True)
+ output_path.write_text(report_content)
+
+ return output_path
+
+
+def inject_data(
+ model: BaseModel,
+ html: str,
+ match: str,
+ placeholder: str,
+) -> str:
+ """
+ Injects the data from the model into the HTML while replacing the placeholder.
+
+ :param model: the model to serialize and inject
+ :type model: BaseModel
+ :param html: the html to inject the data into
+ :type html: str
+ :param match: the string to match in the html to find the placeholder
+ :type match: str
+ :param placeholder: the placeholder to replace with the model data
+ inside of the placeholder
+ :type placeholder: str
+ :return: the html with the model data injected
+ :rtype: str
+ """
+ model_str = model.json()
+ inject_str = match.replace(placeholder, model_str)
+
+ return html.replace(match, inject_str)
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1e1e7987e2aaa226e5c38dfbf9a9445aac60b43
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py
@@ -0,0 +1,199 @@
+from datetime import datetime
+from typing import List
+
+from loguru import logger
+from rich.console import Group
+from rich.live import Live
+from rich.panel import Panel
+from rich.progress import (
+ BarColumn,
+ Progress,
+ SpinnerColumn,
+ TaskID,
+ TaskProgressColumn,
+ TextColumn,
+ TimeElapsedColumn,
+ TimeRemainingColumn,
+)
+
+__all__ = ["BenchmarkReportProgress"]
+
+
+class BenchmarkReportProgress:
+ """
+ Manages the progress display for benchmarks and report generation using Rich.
+
+ This class provides a visual representation of the benchmarking process
+ and report generation using Rich's progress bars and panels.
+ """
+
+ def __init__(self):
+ """
+ Initialize the BenchmarkReportProgress with default settings.
+
+ This method sets up the progress displays for both individual benchmarks
+ and the overall report, as well as initializing internal task management
+ structures.
+ """
+ logger.info("Initializing BenchmarkReportProgress instance")
+
+ self.benchmarks_progress = Progress(
+ TextColumn("[{task.fields[start_time_str]}]"),
+ SpinnerColumn(),
+ TaskProgressColumn(),
+ TextColumn("{task.description}"),
+ TextColumn(" "),
+ TextColumn(
+ "[bold cyan]({task.fields[req_per_sec]} req/sec avg)[/bold cyan]"
+ ),
+ )
+ self.benchmarks_panel = Panel(
+ self.benchmarks_progress,
+ title="Benchmarks",
+ title_align="left",
+ expand=True,
+ )
+ self.report_progress = Progress(
+ SpinnerColumn(),
+ TextColumn("Generating report..."),
+ BarColumn(bar_width=None),
+ TextColumn(
+ "({task.fields[completed_benchmarks]}/{task.fields[total_benchmarks]})"
+ ),
+ TextColumn("["),
+ TimeElapsedColumn(),
+ TextColumn("<"),
+ TimeRemainingColumn(),
+ TextColumn("]"),
+ )
+ self.render_group = Group(self.benchmarks_panel, self.report_progress)
+ self.live = Live(self.render_group, redirect_stdout=True, redirect_stderr=True)
+
+ self.report_task: TaskID = None # type: ignore # noqa: PGH003
+ self.benchmark_tasks: List[TaskID] = []
+ self.benchmark_tasks_started: List[bool] = []
+ self.benchmark_tasks_completed: List[bool] = []
+ self.benchmark_tasks_progress: List[float] = []
+
+ def start(self, task_descriptions: List[str]) -> None:
+ """
+ Starts the live progress display and initializes benchmark tasks.
+
+ :param task_descriptions: List of descriptions for each benchmark task.
+ :type task_descriptions: List[str]
+ """
+ logger.info(
+ "Starting BenchmarkReportProgress with task descriptions: {}",
+ task_descriptions,
+ )
+ self.live.start()
+
+ for task_description in task_descriptions:
+ logger.debug("Adding task with description: {}", task_description)
+ task_id = self.benchmarks_progress.add_task(
+ task_description,
+ start=False,
+ total=None,
+ start_time_str="--:--:--",
+ req_per_sec="#.##",
+ )
+ self.benchmark_tasks.append(task_id)
+ self.benchmark_tasks_started.append(False)
+ self.benchmark_tasks_completed.append(False)
+ self.benchmark_tasks_progress.append(0)
+
+ self.report_task = self.report_progress.add_task(
+ "",
+ total=len(self.benchmark_tasks) * 100, # 100 points per report
+ completed_benchmarks=0,
+ total_benchmarks=len(task_descriptions),
+ )
+ logger.info("Initialized {} benchmark tasks", len(task_descriptions))
+
+ def update_benchmark(
+ self,
+ index: int,
+ description: str,
+ completed: bool,
+ completed_count: int,
+ completed_total: int,
+ start_time: float,
+ req_per_sec: float,
+ ) -> None:
+ """
+ Updates the progress of a specific benchmark task.
+
+ :param index: Index of the benchmark task to update.
+ :type index: int
+ :param description: Description of the current benchmark task.
+ :type description: str
+ :param completed: Flag indicating if the benchmark is completed.
+ :type completed: bool
+ :param completed_count: Number of completed operations for the task.
+ :type completed_count: int
+ :param completed_total: Total number of operations for the task.
+ :type completed_total: int
+ :param start_time: Start time of the benchmark in timestamp format.
+ :type start_time: float
+ :param req_per_sec: Average requests per second.
+ :type req_per_sec: float
+ :raises ValueError: If trying to update a completed benchmark.
+ """
+
+ if self.benchmark_tasks_completed[index]:
+ err = ValueError(f"Benchmark {index} already completed")
+ logger.error("Error updating benchmark: {}", err)
+ raise err
+
+ if not self.benchmark_tasks_started[index]:
+ self.benchmark_tasks_started[index] = True
+ self.benchmarks_progress.start_task(self.benchmark_tasks[index])
+ logger.info("Starting benchmark task at index {}", index)
+
+ if completed:
+ self.benchmark_tasks_completed[index] = True
+ self.benchmark_tasks_progress[index] = 100
+ self.benchmarks_progress.stop_task(self.benchmark_tasks[index])
+ logger.info("Completed benchmark task at index {}", index)
+
+ self.benchmark_tasks_progress[index] = completed_count / completed_total * 100
+ self.benchmarks_progress.update(
+ self.benchmark_tasks[index],
+ description=description,
+ total=completed_total,
+ completed=completed_count if not completed else completed_total,
+ req_per_sec=(f"{req_per_sec:.2f}" if req_per_sec else "#.##"),
+ start_time_str=(
+ datetime.fromtimestamp(start_time).strftime("%H:%M:%S")
+ if start_time
+ else "--:--:--"
+ ),
+ )
+ logger.debug(
+ "Updated benchmark task at index {}: {}% complete",
+ index,
+ self.benchmark_tasks_progress[index],
+ )
+ self.report_progress.update(
+ self.report_task,
+ total=len(self.benchmark_tasks) * 100,
+ completed=sum(self.benchmark_tasks_progress),
+ completed_benchmarks=sum(self.benchmark_tasks_completed),
+ total_benchmarks=len(self.benchmark_tasks),
+ )
+
+ def finish(self) -> None:
+ """
+ Marks the overall report task as finished and stops the live display.
+ """
+ logger.info("Finishing BenchmarkReportProgress")
+ self.report_progress.update(
+ self.report_task,
+ total=len(self.benchmark_tasks) * 100,
+ completed=len(self.benchmark_tasks) * 100,
+ completed_benchmarks=len(self.benchmark_tasks),
+ total_benchmarks=len(self.benchmark_tasks),
+ )
+ self.report_progress.stop_task(self.report_task)
+ self.live.stop()
+ logger.info("BenchmarkReportProgress finished and live display stopped")
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8c5038c2e8235f02acca0503d773dbce9814e76
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py
@@ -0,0 +1,455 @@
+import csv
+import json
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib.parse import urlparse
+
+import ftfy
+import requests
+import yaml
+from loguru import logger
+
+from guidellm.config import settings
+
+__all__ = [
+ "clean_text",
+ "filter_text",
+ "is_path",
+ "is_path_like",
+ "is_url",
+ "load_text",
+ "load_text_lines",
+ "parse_text_objects",
+ "split_lines_by_punctuation",
+ "split_text",
+]
+
+
+NAME_TITLES = [
+ "Mr.",
+ "Mrs.",
+ "Ms.",
+ "Dr.",
+ "Prof.",
+ "Jr.",
+ "Sr.",
+ "St.",
+ "Lt.",
+ "Col.",
+ "Gen.",
+ "Rep.",
+ "Sen.",
+ "Gov.",
+ "Pres.",
+]
+SENTENCE_REGEX = r'[^.!?]*[.!?]["\']?\s*(?=[A-Z])'
+MAX_EXTENSION_LENGTH = 8
+MAX_PATH_LENGTH = 4096
+EXTENSION_TYPES = {
+ "csv": "csv",
+ "jsonl": "jsonl",
+ "json": "json",
+ "yaml": "yaml",
+ "yml": "yaml",
+ "txt": "txt",
+ "text": "txt",
+}
+
+
+def filter_text(
+ text: str,
+ filter_start: Optional[Union[str, int]] = None,
+ filter_end: Optional[Union[str, int]] = None,
+) -> str:
+ """
+ Filter text by start and end strings or indices
+
+ :param text: the text to filter
+ :param filter_start: the start string or index to filter from
+ :param filter_end: the end string or index to filter to
+ :return: the filtered text
+ """
+ filter_start_index = -1
+ filter_end_index = -1
+
+ if filter_start and isinstance(filter_start, str):
+ filter_start_index = text.index(filter_start)
+ elif filter_start:
+ if not isinstance(filter_start, int):
+ raise ValueError(f"Invalid filter start index: {filter_start}")
+ filter_start_index = filter_start
+
+ if filter_end and isinstance(filter_end, str):
+ filter_end_index = text.index(filter_end)
+ elif filter_end:
+ if not isinstance(filter_end, int):
+ raise ValueError(f"Invalid filter end index: {filter_end}")
+ filter_end_index = filter_end
+
+ if filter_start_index > -1:
+ text = text[filter_start_index:]
+ if filter_end_index > -1:
+ text = text[:filter_end_index]
+
+ return text
+
+
+def clean_text(
+ text: str,
+ fix_encoding: bool = True,
+ clean_whitespace: bool = False,
+ remove_empty_lines: bool = False,
+ force_new_line_punctuation: bool = False,
+) -> str:
+ """
+ Clean text by fixing encoding, cleaning whitespace, removing empty lines,
+ and forcing new line punctuation
+
+ :param text: the text to clean
+ :param fix_encoding: True to fix the encoding of the text, False to leave as is
+ :param clean_whitespace: True to clean the whitespace in the text
+ (remove extra spaces, tabs, etc), False to leave as is
+ :param remove_empty_lines: True to remove empty lines from the text
+ (lines with only whitespace), False to leave as is
+ :param force_new_line_punctuation: True to force new lines at punctuation
+ (line ends in a period, exclamation point, or question mark),
+ False to leave as is
+ :return: The cleaned text
+ """
+
+ if fix_encoding:
+ text = ftfy.fix_text(text)
+
+ if clean_whitespace:
+ text = "\n".join(
+ [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()]
+ )
+
+ if remove_empty_lines:
+ text = "\n".join([line for line in text.splitlines() if line.strip()])
+
+ if force_new_line_punctuation:
+ # first remove any existing new lines
+ text = " ".join(line for line in text.splitlines() if line.strip())
+ lines = split_lines_by_punctuation(text)
+ text = "\n".join(lines)
+
+ return text
+
+
+def split_lines_by_punctuation(text: str) -> List[str]:
+ """
+ Split text into lines based on punctuation
+
+ :param text: the text to split
+ :return: the list of lines
+ """
+
+ lines = []
+ current_line = ""
+ skip_next = False
+
+ for index, char in enumerate(text):
+ if skip_next:
+ skip_next = False
+ continue
+
+ current_line += char
+
+ if char not in [".", "!", "?"]:
+ # must match end of sentence punctuation
+ continue
+
+ # if this is the character for a title, don't split
+ if any(current_line.endswith(title) for title in NAME_TITLES):
+ continue
+
+ char_next_1 = text[index + 1] if index + 1 < len(text) else None
+ char_next_2 = text[index + 2] if index + 2 < len(text) else None
+ char_next_3 = text[index + 3] if index + 3 < len(text) else None
+
+ next_is_space = char_next_1 and char_next_1.isspace()
+ next_is_quote_and_space = char_next_1 in ["'", '"'] and char_next_2 == " "
+
+ # next character must be a space or a quote, otherwise skip
+ if not next_is_space and not next_is_quote_and_space:
+ continue
+
+ # after this, next character must be an upper case letter
+ upper_char = char_next_3 if next_is_quote_and_space else char_next_2
+ next_is_upper = upper_char and (
+ upper_char.isupper() or upper_char in ["'", '"']
+ )
+
+ if not next_is_upper:
+ continue
+
+ # if next char is a quote, add it and skip next
+ if next_is_quote_and_space:
+ current_line += text[index + 1]
+ skip_next = True
+
+ lines.append(current_line.strip())
+ current_line = ""
+
+ if current_line:
+ lines.append(current_line.strip())
+
+ return lines
+
+
+def is_url(url: str) -> bool:
+ """
+ Check if a string is a URL
+
+ :param url: the string to check
+ :return: True if the string is a URL, False if not
+ """
+ try:
+ result = urlparse(url)
+ return all([result.scheme, result.netloc])
+ except Exception: # noqa: BLE001
+ return False
+
+
+def is_path(path: Any) -> bool:
+ """
+ Check if a string is a path
+
+ :param path: the string to check
+ :return: True if the string is a path, False if not
+ """
+ if not isinstance(path, (str, Path)):
+ return False
+
+ if isinstance(path, str):
+ path = Path(path)
+
+ return path.exists()
+
+
+def is_path_like(path: Any, enforce_file: bool = False) -> bool:
+ """
+ Check if a string has a path like structure where it doesn't need to exist
+
+ :param path: the string to check
+ :param enforce_file: True if the path should be a file, False if not
+ :return: True if the string is path like, False if not
+ """
+ # if path isn't a str or Path, it's not a path
+ if not isinstance(path, (str, Path)):
+ return False
+
+ if isinstance(path, Path):
+ path = str(path)
+
+ # if text is too long, it's not a path (4096 for most linux setups)
+ if len(path) > MAX_PATH_LENGTH:
+ return False
+
+ # if it starts with a URL scheme, it's not a path
+ if path.startswith(("http", "ftp")):
+ return False
+
+ test_path = Path(path)
+
+ # if it's supposed to be a file and there's no extension or
+ # the extension is too long, it's not a path
+ return not enforce_file or (
+ bool(test_path.suffix) and len(test_path.suffix) <= MAX_EXTENSION_LENGTH
+ )
+
+
+def split_text(text: str) -> Tuple[List[str], List[str], List[int]]:
+ """
+ Split text into words / tokens, the white space separators between words,
+ and the indices for each new line
+
+ :param text: the text to split
+ :return: the words, the white space separators, and the new line indices
+ """
+ if not text or not text.strip():
+ return [], [], []
+
+ text = text.strip()
+ tokens = [] # type: List[str]
+ separators = [] # type: List[str]
+ new_lines = [0]
+ buffer = text[0]
+ is_token = not text[0].isspace()
+
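+ # walk the remaining characters, alternating between a token buffer and a whitespace separator buffer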
+ for char in text[1:]:
+ char_whitespace = char.isspace()
+
+ if char == "\n":
+ new_lines.append(len(tokens) + 1)
+
+ if char_whitespace and is_token:
+ tokens.append(buffer)
+ buffer = char
+ is_token = False
+ elif char_whitespace:
+ buffer += char
+ elif not char_whitespace and not is_token:
+ separators.append(buffer)
+ buffer = char
+ is_token = True
+ else:
+ buffer += char
+
+ if buffer and is_token:
+ tokens.append(buffer)
+ separators.append(" ")
+ elif buffer:
+ separators.append(buffer)
+
+ return tokens, separators, new_lines
+
+
+def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str:
+ """
+ Load text from a URL, a local file path, or a raw string
+
+ :param data: the URL, file path, or raw text to load
+ :type data: Union[str, Path]
+ :param encoding: the encoding to use when reading the file
+ :type encoding: str
+ :return: the loaded text content
+ :rtype: str
+ """
+ logger.debug("Loading text: {}", data)
+
+ if not data:
+ return ""
+
+ # check URLs
+ if isinstance(data, str) and data.startswith("http"):
+ response = requests.get(data, timeout=settings.request_timeout)
+ response.raise_for_status()
+ return response.text
+
+ # check raw text
+ if isinstance(data, str) and not is_path_like(data, enforce_file=True):
+ return data
+
+ # assume local file
+ if not isinstance(data, Path):
+ data = Path(data)
+
+ if not data.exists():
+ raise FileNotFoundError(f"File not found: {data}")
+
+ if not data.is_file():
+ raise IsADirectoryError(f"Path is a directory: {data}")
+
+ return data.read_text(encoding=encoding)
+
+
+def parse_text_objects(data: str, format_: str = "txt") -> List[Dict]:
+ """
+ Parse text data into a list of dictionaries based on the format given
+ (csv, jsonl, json, yaml, txt).
+
+ :param data: the text data to parse
+ :param format_: the format of the data to parse:
+ 'csv', 'jsonl', 'json', 'yaml', 'txt'
+ :return: the list of dictionaries parsed from the data, if text
+ then each line is a dictionary with a single key 'text'
+ """
+ if not isinstance(data, str):
+ raise ValueError(f"Unsupported data given of type: {type(data)}")
+
+ if format_ == "csv":
+ reader = csv.DictReader(data.splitlines())
+ columns = reader.fieldnames
+ return [{col: row[col] for col in columns} for row in reader] # type: ignore # noqa: PGH003
+
+ if format_ == "jsonl":
+ return [json.loads(line) for line in data.splitlines() if line]
+
+ if format_ in ("json", "yaml"):
+ data = json.loads(data) if format_ == "json" else yaml.safe_load(data)
+
+ if not data:
+ return []
+
+ if isinstance(data, dict) and len(data) == 1:
+ logger.debug("Getting first value from JSON/YAML object: {}", data)
+ data = list(data.values())[0]
+ elif isinstance(data, dict):
+ logger.debug("Converting JSON/YAML object to list: {}", data)
+ data = list(data.values())
+
+ if not isinstance(data, list) or not isinstance(data[0], dict):
+ raise ValueError(f"Unsupported data structure given: {data}")
+
+ return data
+
+ if format_ == "txt":
+ return [{"text": line} for line in data.splitlines() if line]
+
+ raise ValueError(f"Unsupported format given: {format_}")
+
+
+def load_text_lines(
+ data: Union[str, Path, List[Dict]],
+ format_: Optional[str] = None,
+ filters: Optional[List[str]] = None,
+ encoding: Optional[str] = None,
+) -> List[str]:
+ """
+ Load text lines from a file or data object with optional filtering and formatting.
+
+ :param data: the data to load the text lines from
+ :param format_: the format of the data to load, if not provided will be inferred.
+ Supported formats: 'csv', 'jsonl', 'json', 'yaml', 'txt'
+ :param filters: the keys to filter the data by when loading in order of preference.
+ If not provided, will use the first key in the data object.
+ :param encoding: the encoding to use when reading the file
+ :return: the list of text lines
+ """
+ logger.debug(
+ "Loading text lines with format {}, filters {}, encoding {} for data: {}",
+ format_,
+ filters,
+ encoding,
+ data,
+ )
+
+ if not data:
+ return []
+
+ if not format_ and isinstance(data, (str, Path)) and "." in str(data):
+ extension = str(data).split(".")[-1]
+ format_ = EXTENSION_TYPES.get(extension, "txt")
+ elif not format_:
+ format_ = "txt"
+
+ # load the data if it's a path or URL
+ if isinstance(data, (Path, str)):
+ data = load_text(data, encoding=encoding)
+ data = clean_text(data)
+
+ # parse the data into a list of dictionaries based on the format
+ if isinstance(data, str):
+ data = parse_text_objects(data, format_)
+
+ if not isinstance(data, list):
+ raise ValueError(f"Unsupported data given of type: {type(data)}")
+
+ if not isinstance(data[0], dict):
+ raise ValueError(f"Unsupported data item type given: {type(data[0])}")
+
+ # grab the first available filter key, honoring the preference order if provided
+ filter_ = list(data[0].keys())[0]
+ for filt in filters or []:
+ if filt not in data[0]:
+ continue
+
+ filter_ = filt
+ break
+
+ # extract the lines from the data
+ return [row[filter_] for row in data] if filter_ else [str(row) for row in data]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..540572994eb692ddcaeced0055feb6a1c932f7f2
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py
@@ -0,0 +1,151 @@
+from pathlib import Path
+from typing import List, Optional, Union
+
+from datasets import ( # type: ignore # noqa: PGH003
+ Dataset,
+ DatasetDict,
+ IterableDataset,
+ IterableDatasetDict,
+ load_dataset,
+)
+from loguru import logger
+
+from guidellm.config import settings
+
+__all__ = [
+ "load_transformers_dataset",
+ "resolve_transformers_dataset",
+ "resolve_transformers_dataset_column",
+ "resolve_transformers_dataset_split",
+]
+
+
+def load_transformers_dataset(
+ dataset: Union[
+ str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset
+ ],
+ split: Optional[str] = None,
+ preferred_splits: Optional[List[str]] = settings.dataset.preferred_data_splits,
+ **kwargs,
+) -> Union[Dataset, IterableDataset]:
+ """
+ Load a dataset from a file or a script and resolve the preferred split.
+
+ :param dataset: the dataset file or script to load
+ :param split: the dataset split to use
+ (overrides preferred_splits, must be in dataset)
+ :param preferred_splits: the preferred dataset splits to use
+ :param kwargs: additional keyword arguments to pass to the dataset loader
+ :return: the loaded dataset
+ """
+ dataset = resolve_transformers_dataset(dataset, **kwargs)
+
+ return resolve_transformers_dataset_split(dataset, split, preferred_splits)
+
+
+def resolve_transformers_dataset(
+ dataset: Union[
+ str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset
+ ],
+ **kwargs,
+) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]:
+ """
+ Resolve the dataset from a file (csv, json, script) or a dataset name.
+
+ :param dataset: the dataset file or script to load
+ :param kwargs: additional keyword arguments to pass to the dataset loader
+ :return: the loaded dataset
+ """
+ if isinstance(
+ dataset, (DatasetDict, Dataset, IterableDatasetDict, IterableDataset)
+ ):
+ return dataset
+
+ if not isinstance(dataset, (str, Path)):
+ raise ValueError(f"Invalid dataset type: {type(dataset)}")
+
+ dataset = str(dataset)
+
+ if dataset.endswith((".csv", ".json")):
+ logger.debug("Loading dataset from local path: {}", dataset)
+ extension = dataset.split(".")[-1]
+
+ return load_dataset(extension, data_files=dataset, **kwargs)
+
+ if dataset.endswith(".py"):
+ logger.debug("Loading dataset from local script: {}", dataset)
+
+ return load_dataset(dataset, **kwargs)
+
+ logger.debug("Loading dataset: {}", dataset)
+
+ return load_dataset(dataset, **kwargs)
+
+
+def resolve_transformers_dataset_split(
+ dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset],
+ split: Optional[str] = None,
+ preferred_splits: Optional[List[str]] = settings.dataset.preferred_data_splits,
+) -> Union[Dataset, IterableDataset]:
+ """
+ Resolve the preferred split from a dataset dictionary.
+
+ :param dataset: the dataset to resolve the split from
+ :param split: the dataset split to use
+ (overrides preferred_splits, must be in dataset)
+ :param preferred_splits: the preferred dataset splits to use
+ :return: the resolved dataset split
+ """
+ if not isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+ logger.debug("Dataset is not a dictionary, using default split")
+ return dataset
+
+ if split:
+ if split not in dataset:
+ raise ValueError(f"Split '{split}' not found in dataset")
+
+ return dataset[split]
+
+ if preferred_splits:
+ for spl in preferred_splits:
+ if spl not in dataset:
+ continue
+ return dataset[spl]
+
+ return list(dataset.values())[0]
+
+
+def resolve_transformers_dataset_column(
+ dataset: Union[Dataset, IterableDataset],
+ column: Optional[str] = None,
+ preferred_columns: Optional[List[str]] = settings.dataset.preferred_data_columns,
+) -> str:
+ """
+ Resolve the preferred column from a dataset.
+
+ :param dataset: the dataset to resolve the column from
+ :param column: the dataset column to use
+ (overrides preferred_columns, must be in dataset)
+ :param preferred_columns: the preferred dataset columns to use
+ :return: the resolved dataset column
+ """
+ column_names = dataset.column_names
+
+ if not column_names:
+ # grab from the first item
+ first_item = next(iter(dataset))
+ column_names = list(first_item.keys())
+
+ if column:
+ if column not in column_names:
+ raise ValueError(f"Column '{column}' not found in dataset")
+
+ return column
+
+ if preferred_columns:
+ for col in preferred_columns:
+ if col not in column_names:
+ continue
+ return col
+
+ return list(column_names)[0]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py
new file mode 100644
index 0000000000000000000000000000000000000000..74000dd8d3acdde3a539c1efb01f1de9b640f9db
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py
@@ -0,0 +1,79 @@
+import os
+import re
+from datetime import datetime
+from pathlib import Path
+
+import toml
+from loguru import logger
+
+
+def get_build_type():
+ return os.getenv("GUIDELLM_BUILD_TYPE", "dev")
+
+
+def get_build_number():
+ return os.getenv("GUIDELLM_BUILD_NUMBER", "0")
+
+
+def construct_project_name_and_version(build_type, build_number, current_version):
+ if not re.match(r"^\d+\.\d+\.\d+$", current_version):
+ raise ValueError(
+ f"Version '{current_version}' does not match the "
+ f"semantic versioning pattern '#.#.#'",
+ )
+
+ if build_type == "dev":
+ project_name = "guidellm_dev"
+ version = f"{current_version}.dev{build_number}"
+ elif build_type == "nightly":
+ project_name = "guidellm_nightly"
+ date_str = datetime.now().strftime("%Y%m%d")
+ version = f"{current_version}.{date_str}"
+ elif build_type == "release":
+ project_name = "guidellm"
+ version = current_version
+ else:
+ raise ValueError(f"Unknown build type: {build_type}")
+
+ return project_name, version
+
+
+def update_pyproject_toml(project_name, version):
+ try:
+ with Path("pyproject.toml").open() as file:
+ data = toml.load(file)
+
+ data["project"]["name"] = project_name
+ data["project"]["version"] = version
+
+ with Path("pyproject.toml").open("w") as file:
+ toml.dump(data, file)
+
+ logger.info(
+ f"Updated project name to: {project_name} and version to: {version}",
+ )
+ except (FileNotFoundError, toml.TomlDecodeError) as e:
+ logger.error(f"Error reading or writing pyproject.toml: {e}")
+ raise
+
+
+def main():
+ build_type = get_build_type()
+ build_number = get_build_number()
+
+ with Path("pyproject.toml").open() as file:
+ pyproject_data = toml.load(file)
+
+ current_version = pyproject_data["project"]["version"]
+ project_name, version = construct_project_name_and_version(
+ build_type,
+ build_number,
+ current_version,
+ )
+
+ if build_type != "release":
+ update_pyproject_toml(project_name, version)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/models/multimodal/vision_language_model/step3/vllm/README.md b/models/multimodal/vision_language_model/step3/vllm/README.md
index ce1df9df1d573642f834fbb7e3a0c1732d34e627..11266ddc300f18172ed7b6027ac787e5fce02820 100644
--- a/models/multimodal/vision_language_model/step3/vllm/README.md
+++ b/models/multimodal/vision_language_model/step3/vllm/README.md
@@ -9,6 +9,7 @@ Step3 is cutting-edge multimodal reasoning model—built on a Mixture-of-Experts
| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
| :----: | :----: | :----: |
| MR-V100 | dev-only | 25.12 |
+| MR-V100 | 4.4.0 | 26.03 |
## Model Preparation
@@ -33,6 +34,51 @@ pip3 install -r requirements.txt
## Model Inference
+### Inference with W4A8
+
+#### Performance Test
+
+1. Set environment variables:
+```bash
+export VLLM_W8A8_MOE_USE_W4A8=1
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+2. Start server:
+```bash
+vllm serve /path/to/model --limit-mm-per-prompt '{"image":5}' --gpu-memory-utilization 0.92 --port 12347 --trust-remote-code --disable-cascade-attn --no-enable-prefix-caching --max-model-len 65536 --tensor-parallel-size 4 --pipeline-parallel-size 4 --max-num-seqs 1024 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+3. Run client (Input1024, Output1024, BS10):
+```bash
+vllm bench serve --num-prompts 4*[max-concurrency] --model /path/to/model --dataset-name random --random-input-len 1024 --random-output-len 1024 --max-concurrency 10 --host 0.0.0.0 --port 12347 --disable-tqdm --ignore-eos
+```
+
+#### Accuracy Test
+
+4. The evaluation scripts are already included in this directory:
+```bash
+# eval_dataset.py and eval_dataset_w8a8.py are in the current directory
+pip install fire
+```
+
+5. Set environment variables:
+```bash
+export VLLM_W8A8_MOE_USE_W4A8=1
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+6. Start server:
+```bash
+vllm serve /path/to/model --limit-mm-per-prompt '{"image":5}' --gpu-memory-utilization 0.92 --port 12347 --trust-remote-code --disable-cascade-attn --no-enable-prefix-caching --max-model-len 65536 --tensor-parallel-size 4 --pipeline-parallel-size 4 --max-num-seqs 1024 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+7. Run client (MMMU dataset):
+```bash
+pip install fire
+python3 eval_dataset.py --dataset_name MMMU_BETA --model /path/to/model --ip 127.0.0.1 --port 12347 --num_workers 8
+```
+
### Inference with w8a8
#### Starting w8a8 server
```bash
diff --git a/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md b/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md
index b59e7d8aa673153fe16a725047904f7c035453e6..8b8a596f303b60acd61cbcc50dce7e1ecfd2cd78 100644
--- a/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md
+++ b/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md
@@ -10,6 +10,7 @@ based on Qwen2.5 and Llama3 series to the community.
| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
| MR-V100 | 4.3.0 | 25.09 |
| MR-V100 | 4.2.0 | 25.03 |
@@ -49,6 +50,31 @@ python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-7B --max-to
vllm serve data/DeepSeek-R1-Distill-Qwen-7B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code
```
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server (DeepSeek-R1-Distill-Qwen-7B BF16):
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=1 --max-model-len 20480 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input2048, Output1024, BS8):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 8 --random-input 2048 --max-concurrency 8 --tokenize-prompt --random-range-ratio 1 --random-output 1024
+```
+
## Model Results
### Benchmarking vLLM
diff --git a/models/nlp/llm/deepseek-v3.1/vllm/README.md b/models/nlp/llm/deepseek-v3.1/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..17d6d265bab98fefed187c0bbd99c19671058566
--- /dev/null
+++ b/models/nlp/llm/deepseek-v3.1/vllm/README.md
@@ -0,0 +1,79 @@
+# DeepSeek-V3.1 (vLLM)
+
+## Model Description
+
+DeepSeek-V3 is a powerful Mixture-of-Experts (MoE) language model with 671B total parameters and 37B activated parameters. It achieves excellent performance on math, code, and reasoning tasks, comparable to leading models like GPT-4 and Claude-3.5.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) on the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_W8A8_MOE_USE_W4A8=1
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_PP_LAYER_PARTITION="16,16,16,13"
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=4 --tensor-parallel-size=4 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS8):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 8 --random-input 128 --max-concurrency 8 --tokenize-prompt --random-range-ratio 1 --random-output 128
+```
+
+#### Accuracy Test
+
+5. Install evalscope:
+```bash
+pip3 install 'evalscope[app,perf]' -U
+```
+
+6. Set environment variables:
+```bash
+export VLLM_USE_MODELSCOPE=True
+```
+
+7. Start server:
+```bash
+vllm serve /path/to/model --max-num-seqs 4 --max-model-len 95600 --served-model-name DeepSeek-v3.1-int4-pack8 --trust-remote-code --disable-cascade-attn --tensor-parallel-size 8 --pipeline-parallel-size 2 --compilation-config '{"level":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' --port 9989
+```
+
+8. Run client (MATH-500 dataset):
+```bash
+evalscope eval --model DeepSeek-v3.1-W4A8 --dataset-args '{"math_500": {"few_shot_num": 0}}' --generation-config '{"do_sample": true, "temperature": 0.6, "max_tokens": 32768, "n": 1, "top_p": 0.95}' --datasets math_500 --eval-type openai_api --eval-batch-size 4 --api-url http://127.0.0.1:9989/v1 --timeout 12000000 --api-key EMPTY
+```
+
+## References
+
+- [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/llm-benchmark/README.md b/models/nlp/llm/llm-benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..33aa05bf03d0c21fc64da72b190b64c586ac98a8
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/README.md
@@ -0,0 +1,308 @@
+# Installation
+
+```bash
+# install dependencies with pip
+pip3 install -r requirements.txt
+```
+
+# Accuracy Evaluation
+
+## Quick Evaluation
+
+Suppose the model is served with SGLang at IP 127.0.0.1 and port 30000. To evaluate a DeepSeek model on a chosen set of datasets with the default configuration, run the `eval` command from any directory:
+```bash
+./iluvatar_bench eval \
+ --model /data/DeepSeek-R1-AWQ \
+ --datasets gsm8k \
+ --limit 4 \
+ --eval-batch-size 8
+```
+
+### Basic Parameters
+
+- `--model`: The model_id on ModelScope (downloaded automatically), or a local model path such as /path/to/model.
+- `--datasets`: Dataset names; multiple datasets can be given, separated by spaces, and are downloaded automatically from ModelScope. See the supported [dataset list](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/index.html).
+- `--limit`: Maximum number of samples to evaluate per dataset subset. If omitted, the full dataset is evaluated; useful for quick verification.
+- `--eval-batch-size`: Evaluation batch size, default 1, i.e. the number of concurrent requests.
+
+
+## Evaluating a Model API Service
+
+To evaluate a deployed model API service, specify the API service address (api_url) and API key (api_key) and set the eval-type parameter to server. The defaults are shown below, and an example invocation follows the block:
+```bash
+--api-key='EMPTY' \
+--api-url='http://127.0.0.1:30000/v1' \
+--eval-type='server'
+```
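+
+For example, the quick-evaluation command shown earlier can be pointed at a deployed API service by adding these parameters (a sketch; adjust the model path, datasets, and URL to match your deployment):
+
+```bash
+./iluvatar_bench eval \
+ --model /data/DeepSeek-R1-AWQ \
+ --datasets gsm8k \
+ --limit 4 \
+ --eval-batch-size 8 \
+ --eval-type server \
+ --api-url http://127.0.0.1:30000/v1 \
+ --api-key EMPTY
+```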
+
+# Performance Test 1
+
+## Basic Usage
+
+The following example benchmarks the DeepSeek-R1-AWQ model served with the SGLang framework on Bi150, with the input fixed at 1024 tokens and the output fixed at 1024 tokens. Adjust the parameters to your needs.
+
+```bash
+./iluvatar_bench perf \
+ --parallel 1 10 50 100 200 \
+ --number 10 20 100 200 400 \
+ --model /data/DeepSeek-R1-AWQ \
+ --url http://127.0.0.1:30000/v1/completions \
+ --api openai \
+ --dataset random \
+ --max-tokens 1024 \
+ --min-tokens 1024 \
+ --prefix-length 0 \
+ --min-prompt-length 1024 \
+ --max-prompt-length 1024 \
+ --tokenizer-path /data/DeepSeek-R1-AWQ \
+ --extra-args '{"ignore_eos": true}'
+```
+
+### Parameters
+
+- `parallel`: Number of concurrent requests; multiple values can be given, separated by spaces.
+- `number`: Total number of requests to send; multiple values can be given, separated by spaces (paired one-to-one with `parallel`).
+- `url`: URL to send requests to.
+- `model`: Name of the model to use.
+- `api`: API service to use, `openai` by default.
+- `dataset`: Dataset name; `random` here means a randomly generated dataset (see its usage notes); for more available (multimodal) datasets, refer to the dataset configuration.
+- `tokenizer-path`: Path to the model tokenizer, used to count tokens (required for the random dataset).
+- `extra-args`: Extra request parameters as a JSON string, e.g. `{"ignore_eos": true}` to ignore the end-of-sequence token.
+
+**Default Parameters**
+
+The following parameters are defaults:
+```bash
+--max-tokens=1024,
+--min-tokens=1024,
+--min-prompt-length=1024,
+--max-prompt-length=1024,
+--api='openai',
+--url='http://127.0.0.1:30000/v1/completions'
+```
+
+`max-tokens` and `min-tokens` are the maximum and minimum generation lengths.
+`max-prompt-length` and `min-prompt-length` are the maximum and minimum prompt lengths.
+
+# Performance Test 2
+
+This test uses bench_serving.py shipped with SGLang (originally from vLLM) to measure online serving throughput and latency.
+
+bench_serving.py file information:
+```bash
+git log --oneline -- ./python/sglang/bench_serving.py
+88a6f9dab bench_serving support PD Disaggregation (#11542)
+```
+
+## Basic Usage
+
+Note: the `./iluvatar_bench sgl-perf` command is equivalent to `python3 bench_serving.py`; either can be run.
+
+```bash
+./iluvatar_bench sgl-perf \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --num-prompts 1000
+```
+If the model name or path is not set, the default model configuration is requested from /v1/models.
+
+## Common Parameters
+
+* `--backend backend`: Backend such as sglang or vllm.
+* `--model`: Model name or path.
+* Connection parameters: `--host` and `--port`, or `--base-url`.
+* `--dataset-name`: sharegpt, random, random-ids, generated-shared-prefix, etc.; each dataset has its own configuration parameters.
+* `--request-rate`: Number of requests arriving per second (default inf, meaning all requests arrive at once). Arrival times are simulated with a **Poisson process**, so the intervals between requests are random while the average rate matches the configured value, which more realistically models user requests arriving at random. For example, to send 6 requests every 3.5 seconds, `Request rate = 6 requests / 3.5 seconds ≈ 1.71 requests/second`.
+* `--request-interval`: Fixed interval in seconds. If set, it overrides `--request-rate` and schedules requests at deterministic (fixed) intervals.
+* `--max-concurrency`: Maximum number of concurrent requests, i.e. the number of workers actually processing requests. While `--request-rate` controls how fast requests are issued, this parameter controls how many requests may execute at the same time.
+* `--warmup-requests`: Number of warmup requests before the benchmark.
+
+## sharegpt Dataset
+
+`sharegpt` is a real conversation dataset (the default). Related options:
+* `--num-prompts`: total number of requests.
+* `--sharegpt-output-len`: output length; if not given, it is taken from the sample lengths in the dataset (see the sketch after this list).
+* `--sharegpt-context-len`: overall context length; when set, a request is skipped if `input + output > maximum context length`.
+
+In short, the input length cannot be specified; the output length and the maximum context length can.
+
+Note that a request is also skipped when:
+* `prompt_len < 2` or `output_len < 2`
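+
+For example, a run that keeps the dataset's real prompts but forces every response to 256 output tokens could look like the following sketch (flags as documented above):
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --dataset-name sharegpt \
+    --num-prompts 500 \
+    --sharegpt-output-len 256
+```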
+
+## random/random-ids Datasets
+
+* `random`: **real text** taken from the ShareGPT dataset. A random target input length is chosen (say 500 tokens), then a real prompt is picked at random from ShareGPT. If the prompt is too long (say 1000 tokens) it is truncated to 500 tokens; if it is too short (say 100 tokens) its tokens are repeated until the 500 tokens are filled. This simulates a random-length workload whose prompt content is **real natural language**.
+* `random-ids`: **completely random token IDs**. A random target input length is chosen (say 500 tokens); no external dataset is loaded, and 500 token IDs are sampled directly from the tokenizer's vocabulary. The resulting text has **no linguistic meaning** (i.e. gibberish). This simulates a random-length workload of random, meaningless data, useful for stress-testing the tokenizer and the model's handling of unusual input.
+
+Related options:
+* `--num-prompts`: total number of requests to process.
+* `--random-input-len` (default 1024): maximum input token length per request. A length is sampled from `[random-input-len * random-range-ratio, random-input-len + 1)`.
+* `--random-output-len` (default 1024): maximum output token length per request. A length is sampled from `[random-output-len * random-range-ratio, random-output-len + 1)`.
+* `--random-range-ratio` (default 0.0): a float between 0.0 and 1.0 that sets the lower bound of the random lengths. Set it to 1.0 to fix the input/output length at exactly the configured value.
+* `--tokenize-prompt`: mainly for the `random` and `random-ids` datasets with the `sglang` and `vllm` backends, so that the benchmark sends a token-ID list of the exact length. For example, the client builds a list of 1024 IDs, skips decoding, and sends the integer ID list directly to the server; the server then skips tokenization and uses the list as-is. This guarantees that the input length processed by the server is **exactly** the 1024 tokens we asked for.
+
+The following explains how the minimum/maximum input and output lengths are computed; skip the formulas and the example below if you do not need them.
+
+Formulas:
+* `actual input length = [random-input-len * random-range-ratio, random-input-len + 1)`
+* `actual output length = [random-output-len * random-range-ratio, random-output-len + 1)`
+
+Example:
+```bash
+--dataset-name random \
+--random-input-len 1024 \
+--random-output-len 1024 \
+--random-range-ratio 0.8
+```
+The input/output lengths are then drawn from the interval `[819, 1025) = [1024 * 0.8, 1024 + 1)`, so a request might end up with input/output lengths of `833/955`.
+To fix the input/output length at exactly 1024, set `--random-range-ratio` to 1.0.
+
+## generated-shared-prefix Dataset
+
+`generated-shared-prefix` is not a static dataset loaded from an external file like `sharegpt`; it is generated dynamically.
+
+Its purpose is to simulate a very common and important LLM serving scenario: many requests sharing a long prefix. This typically happens in multi-tenant setups, RAG (retrieval-augmented generation), or applications with complex system prompts.
+
+Each generated request consists of two parts:
+
+1. **Shared system prompt**: a long text block shared within a group.
+2. **Unique question**: a shorter text block unique to each request.
+
+The full prompt is constructed roughly as `"{system_prompt}\n\n{question}"`.
+
+Related options:
+* `--gsp-num-groups` (default: 64): the number of **unique system prompts** to generate, i.e. how many "shared prefix" groups the benchmark contains.
+* `--gsp-prompts-per-group` (default: 16): how many unique requests (questions) each group (i.e. each system prompt) contains; this determines how many times each shared prefix is reused. The total number of requests is `gsp-num-groups * gsp-prompts-per-group`.
+* `--gsp-system-prompt-len` (default: 2048): target token length of each generated system prompt, used to simulate a long prefix (e.g. a complex instruction set or a large context document).
+* `--gsp-question-len` (default: 128): target token length of each generated unique question, simulating the non-shared, user-supplied part of the prompt.
+* `--gsp-output-len` (default: 256): target number of output tokens per request, i.e. how much the model should generate after receiving `system_prompt + question`.
+
+## Examples
+
+1. `sharegpt` (simulating real conversations)
+
+```bash
+./iluvatar_bench sgl-perf \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model /home/data/qwen3/Qwen3-32B \
+ --dataset-name sharegpt \
+ --num-prompts 1000
+```
+
+2. `random` (simulating a synthetic workload of specific lengths)
+
+```bash
+./iluvatar_bench sgl-perf \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model /home/data/qwen3/Qwen3-32B \
+ --dataset-name random \
+ --num-prompts 1000 \
+    --random-input-len 2048 \
+    --random-output-len 128 \
+ --random-range-ratio 0.5
+```
+This command tests 1000 requests. Each request's input length is sampled between `(2048 * 0.5)` and `2048` (i.e. 1024 to 2048 tokens), and its output length between `(128 * 0.5)` and `128` (i.e. 64 to 128 tokens). The prompt content is padded from ShareGPT text.
+
+3. `random-ids` (pure stress test)
+
+This is the most extreme stress test: it ignores the linguistic meaning of the prompt and simply generates completely random token IDs to fill the requested input length.
+It can be combined with `--tokenize-prompt` to send ID lists such as `[1024, 512, 300, ...]` instead of decoded gibberish strings, which gives **100% precise control over the input length** and is the best way to measure raw hardware and system throughput.
+
+```bash
+# Stress test: 1000 requests, each with *exactly* 1024 input IDs
+# and 1024 requested output IDs
+./iluvatar_bench sgl-perf \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model /home/data/qwen3/Qwen3-32B \
+ --dataset-name random-ids \
+ --num-prompts 1000 \
+ --random-input-len 1024 \
+ --random-output-len 1024 \
+ --random-range-ratio 1.0 \
+ --tokenize-prompt
+```
+`--random-range-ratio 1.0` ensures the input/output lengths are not randomized but are exactly 1024. `--tokenize-prompt` ensures the client sends an `input_ids` list instead of `text`. This command measures the server's raw performance on "1024-in, 1024-out" requests.
+
+4. Rate control + output file
+
+```bash
+./iluvatar_bench sgl-perf \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model /home/data/qwen3/Qwen3-32B \
+ --dataset-name random \
+ --random-input-len 1024 --random-output-len 1024 --random-range-ratio 1.0 \
+ --num-prompts 2000 \
+ --request-rate 100 \
+ --max-concurrency 512 \
+ --output-file sglang_random.jsonl --output-details
+```
+
+5. `generated-shared-prefix` (testing KV cache performance)
+
+```bash
+./iluvatar_bench sgl-perf \
+ --backend sglang \
+ --host 127.0.0.1 --port 30000 \
+ --model /home/data/qwen3/Qwen3-32B \
+ --dataset-name generated-shared-prefix \
+ --gsp-num-groups 64 --gsp-prompts-per-group 16 \
+ --gsp-system-prompt-len 4096 --gsp-question-len 128 --gsp-output-len 256 \
+ --num-prompts 1024
+```
+This command generates `64 * 16 = 1024` requests in total. It creates 64 distinct "system prompts" (shared prefixes), each 4096 tokens long.
+
+For each system prompt it then generates 16 distinct 128-token "questions". When the server processes these 1024 (shuffled) requests, an efficient KV cache should pay the cost of the 4096-token prefix only 64 times instead of 1024 times.
+
+## PD Disaggregation Mode Profiling
+
+**Key parameters**
+
+* `--pd-separated`: enable PD mode.
+* `--profile-prefill-url`: URL(s) of the prefill workers to profile.
+* `--profile-decode-url`: URL(s) of the decode workers to profile.
+
+`--profile-prefill-url` and `--profile-decode-url` are mutually exclusive - use only one of them.
+
+Start server
+```bash
+# set trace path
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# start prefill and decode servers (see PD disaggregation docs for setup)
+python3 -m sglang.launch_server --model-path /home/data/qwen3/Qwen3-32B --disaggregation-mode prefill
+python3 -m sglang.launch_server --model-path /home/data/qwen3/Qwen3-32B --disaggregation-mode decode --port 30001 --base-gpu-id 1
+
+# start router
+python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
+```
+
+Profile Prefill Workers
+```bash
+# send profiling request targeting prefill workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000
+```
+
+Profile Decode Workers
+```bash
+# send profiling request targeting decode workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001
+```
+
+Notes:
+* Both options accept multiple worker URLs for multi-instance setups:
+```bash
+# Profile multiple prefill workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000 http://127.0.0.1:30002
+
+# Profile multiple decode workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001 http://127.0.0.1:30003
+```
+
+# References
+
+- [evalscope](https://github.com/modelscope/evalscope)
\ No newline at end of file
diff --git a/models/nlp/llm/llm-benchmark/iluvatar_bench b/models/nlp/llm/llm-benchmark/iluvatar_bench
new file mode 100644
index 0000000000000000000000000000000000000000..fa6ab398bea7a1006c26b2acb6e0e5560b55fe05
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/iluvatar_bench
@@ -0,0 +1,181 @@
+#!/usr/local/bin/python3
+
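+"""Thin CLI wrapper that exposes evalscope's eval/perf/app subcommands and adds
+a `sgl-perf` subcommand backed by SGLang's bench_serving.py for online serving
+benchmarks."""
+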
+import argparse
+from argparse import ArgumentParser
+
+from evalscope import __version__
+from evalscope.cli.base import CLICommand
+
+# bench_serving.py (SGLang's serving benchmark script) is expected to live next to
+# this file; guard the import so the other subcommands still work without it.
+try:
+    from bench_serving import define_sgl_bench_args, run_benchmark
+except ImportError:
+    define_sgl_bench_args = None
+    run_benchmark = None
+
+class PerfBenchCMD(CLICommand):
+ name = 'perf'
+
+ def __init__(self, args):
+ self.args = args
+
+ @classmethod
+ def subparser_func(cls, args):
+ """
+ Function which will be called for a specific sub parser.
+ This method creates an instance of PerfBenchCMD from parsed arguments.
+ """
+ return cls(args)
+
+ @staticmethod
+ def define_args(parsers: ArgumentParser):
+        """Define args for the perf benchmark command."""
+ from evalscope.perf.arguments import add_argument
+
+ parser = parsers.add_parser(PerfBenchCMD.name)
+ add_argument(parser)
+ parser.set_defaults(
+ max_tokens=1024,
+ min_tokens=1024,
+ min_prompt_length=1024,
+ max_prompt_length=1024,
+ api='openai',
+ url='http://127.0.0.1:30000/v1/completions'
+ )
+ parser.set_defaults(func=PerfBenchCMD.subparser_func)
+
+ def execute(self):
+ try:
+ from evalscope.perf.main import run_perf_benchmark
+ except ImportError as e:
+ raise ImportError(
+ f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+ "Please run `pip install 'evalscope[perf]'`."
+ )
+
+ run_perf_benchmark(self.args)
+
+
+class EvalCMD(CLICommand):
+ name = 'eval'
+
+ def __init__(self, args):
+ self.args = args
+
+ @classmethod
+ def subparser_func(cls, args):
+ """
+ Function which will be called for a specific sub parser.
+ This method creates an instance of EvalCMD from parsed arguments.
+ """
+ return cls(args)
+
+ @staticmethod
+ def define_args(parsers: ArgumentParser):
+        """Define args for the eval command."""
+ from evalscope.arguments import add_argument
+
+ parser = parsers.add_parser(EvalCMD.name)
+ add_argument(parser)
+ parser.set_defaults(
+ api_key='EMPTY',
+ api_url='http://127.0.0.1:30000/v1',
+ eval_type='server'
+ )
+ parser.set_defaults(func=EvalCMD.subparser_func)
+
+ def execute(self):
+ from evalscope.run import run_task
+
+ run_task(self.args)
+
+
+class StartAppCMD(CLICommand):
+ name = 'app'
+
+ def __init__(self, args):
+ self.args = args
+
+ @classmethod
+ def subparser_func(cls, args):
+ """
+ Function which will be called for a specific sub parser.
+ This method creates an instance of StartAppCMD from parsed arguments.
+ """
+ return cls(args)
+
+ @staticmethod
+ def define_args(parsers: ArgumentParser):
+        """Define args for the app command."""
+ from evalscope.app import add_argument
+
+ parser = parsers.add_parser(StartAppCMD.name)
+ add_argument(parser)
+ parser.set_defaults(func=StartAppCMD.subparser_func)
+
+ def execute(self):
+ try:
+ from evalscope.app import create_app
+ except ImportError as e:
+ raise ImportError(
+ f'Failed to import create_app from evalscope.app, due to {e}. '
+ "Please run `pip install 'evalscope[app]'`."
+ )
+
+ create_app(self.args)
+
+class SGLPerfCMD(CLICommand):
+ name = 'sgl-perf'
+
+ def __init__(self, args):
+ self.args = args
+
+ @classmethod
+ def subparser_func(cls, args):
+ """
+ Function which will be called for a specific sub parser.
+ """
+ return cls(args)
+
+ @staticmethod
+ def define_args(parsers: ArgumentParser):
+ """ define args for sgl-perf command. """
+
+ parser = parsers.add_parser(SGLPerfCMD.name,
+ help='Run SGLang performance benchmark (bench_serving.py)')
+
+        if define_sgl_bench_args is not None:
+            define_sgl_bench_args(parser)
+
+ parser.set_defaults(func=SGLPerfCMD.subparser_func)
+
+ def execute(self):
+ if run_benchmark is None:
+ raise ImportError(
+ "Failed to import 'run_benchmark' from 'bench_serving'. "
+ "Command 'sgl-perf' cannot execute."
+ )
+
+ run_benchmark(self.args)
+
+def run_cmd():
+    parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='iluvatar_bench <command> [<args>]')
+ parser.add_argument('-v', '--version', action='version', version=f'evalscope {__version__}')
+ subparsers = parser.add_subparsers(help='EvalScope command line helper.')
+
+ PerfBenchCMD.define_args(subparsers)
+ EvalCMD.define_args(subparsers)
+ StartAppCMD.define_args(subparsers)
+
+ # sgl-perf
+ SGLPerfCMD.define_args(subparsers)
+
+ args = parser.parse_args()
+
+ if not hasattr(args, 'func'):
+ parser.print_help()
+ exit(1)
+
+ cmd = args.func(args)
+ cmd.execute()
+
+
+if __name__ == '__main__':
+ run_cmd()
diff --git a/models/nlp/llm/llm-benchmark/requirements.txt b/models/nlp/llm/llm-benchmark/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d25d9047746399f23453ec9a29e29b92aa9db271
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/requirements.txt
@@ -0,0 +1,3 @@
+evalscope==1.0.2
+evalscope[perf]
+evalscope[app]
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md b/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d790fe79907a37f5ef61f0c8ab81f3e91c77ed82
--- /dev/null
+++ b/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md
@@ -0,0 +1,56 @@
+# Qwen3-235B-A22B-Thinking-2507 (vLLM)
+
+## Model Description
+
+Qwen3-235B-A22B is a large Mixture-of-Experts (MoE) language model with 235B total parameters and 22B activated parameters. The "Thinking" version is optimized for complex logical reasoning, math, and coding tasks with enhanced reasoning capabilities.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the ``llm-benchmark`` tool bundled in this repository:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_W8A8_MOE_USE_W4A8=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=4 --tensor-parallel-size=4 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input 128 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output 128
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md b/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d4b23188d6132f6fb1d89e3cbf245802fc98be1
--- /dev/null
+++ b/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md
@@ -0,0 +1,56 @@
+# Qwen3-30B-A3B-Thinking-2507 (vLLM)
+
+## Model Description
+
+Qwen3-30B-A3B is a Mixture-of-Experts (MoE) large language model with 30B total parameters and 3B activated parameters. The "Thinking" version is optimized for complex logical reasoning, math, and coding tasks.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the ``llm-benchmark`` tool bundled in this repository:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_W8A8_MOE_USE_W4A8=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=2 --max-model-len 4096 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input 128 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output 128
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-32b/vllm/README.md b/models/nlp/llm/qwen3-32b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8494587317da66ad862415d8ffd10a1c16142277
--- /dev/null
+++ b/models/nlp/llm/qwen3-32b/vllm/README.md
@@ -0,0 +1,55 @@
+# Qwen3-32B (vLLM)
+
+## Model Description
+
+Qwen3-32B is a dense large language model with 32B parameters, offering excellent performance on reasoning, instruction-following, and multilingual tasks. It supports seamless switching between thinking mode (for complex logical reasoning, math, and coding) and non-thinking mode (for efficient, general-purpose dialogue).
+
+This version supports W8A8 (Weight-8bit, Activation-8bit) and W4A16 (Weight-4bit, Activation-16bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W8A8/W4A16
+
+#### Performance Test
+
+1. Use the ``llm-benchmark`` tool bundled in this repository:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=2 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input2048, Output1024, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input 2048 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output 1024
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md b/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f462989e4b6f662e1a4f02f374157fb3e9c033eb
--- /dev/null
+++ b/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md
@@ -0,0 +1,55 @@
+# Qwen3-Next-80B-A3B-Instruct (vLLM)
+
+## Model Description
+
+Qwen3-Next-80B-A3B-Instruct is a Mixture-of-Experts (MoE) large language model with 80B total parameters and 3B activated parameters. It is the next-generation Qwen model, with enhanced reasoning and instruction-following capabilities.
+
+This version runs in BF16 precision for maximum accuracy.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with BF16
+
+#### Accuracy Test
+
+1. Install evalscope:
+```bash
+pip3 install 'evalscope[app,perf]' -U
+```
+
+2. Set environment variables:
+```bash
+export VLLM_USE_MODELSCOPE=True
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --served-model-name Qwen3-Next-80B-A3B-Instruct --trust_remote_code --pipeline-parallel-size 1 --tensor-parallel-size 8 --max-num-seqs 64 --max-model-len 40960 --disable-cascade-attn --gpu-memory-utilization 0.90 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' --port 9989
+```
+
+4. Run client (MMLU-Pro dataset):
+```bash
+evalscope eval --model Qwen3-Next-80B-A3B-Instruct --dataset-args '{"mmlu_pro": {"few_shot_num": 0}}' --generation-config '{"do_sample": true, "temperature": 0.7, "max_tokens": 32768, "n": 1, "top_p": 0.8, "top_k": 20}' --datasets mmlu_pro --eval-type openai_api --eval-batch-size 64 --api-url http://127.0.0.1:9989/v1 --timeout 12000000 --api-key EMPTY
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/tests/model_info.json b/tests/model_info.json
index 5fa94159b24ceded3fe8dcc876e4313390de496b..1eb8591fa3a56ccd51211c20095b16493a1344d7 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -10060,6 +10060,370 @@
"type": "inference",
"hasDemo": false,
"demoType": ""
+ },
+ {
+ "display_name": "DeepSeek V3.1",
+ "model_name": "deepseek-v3.1",
+ "framework": "vllm",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "nlp/llm",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/nlp/llm/deepseek-v3.1/vllm",
+ "readme_file": "models/nlp/llm/deepseek-v3.1/vllm/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3",
+ "need_third_part": false,
+ "precisions": [
+ "w4a8"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "Qwen3 32B",
+ "model_name": "qwen3-32b",
+ "framework": "vllm",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "nlp/llm",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/nlp/llm/qwen3-32b/vllm",
+ "readme_file": "models/nlp/llm/qwen3-32b/vllm/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-32B",
+ "need_third_part": false,
+ "precisions": [
+ "w8a8",
+ "w4a16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "Qwen3 30B A3B Thinking",
+ "model_name": "qwen3-30b-a3b-thinking",
+ "framework": "vllm",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "nlp/llm",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/nlp/llm/qwen3-30b-a3b-thinking/vllm",
+ "readme_file": "models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-30B-A3B",
+ "need_third_part": false,
+ "precisions": [
+ "w4a8"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "Qwen3 235B A22B Thinking",
+ "model_name": "qwen3-235b-a22b-thinking",
+ "framework": "vllm",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "nlp/llm",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/nlp/llm/qwen3-235b-a22b-thinking/vllm",
+ "readme_file": "models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-235B-A22B",
+ "need_third_part": false,
+ "precisions": [
+ "w4a8"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "Qwen3 Next 80B A3B",
+ "model_name": "qwen3-next-80b-a3b",
+ "framework": "vllm",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "nlp/llm",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/nlp/llm/qwen3-next-80b-a3b/vllm",
+ "readme_file": "models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Instruct",
+ "need_third_part": false,
+ "precisions": [
+ "bf16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "FLUX.1 Dev",
+ "model_name": "flux.1-dev",
+ "framework": "xdit",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "multimodal/diffusion_model",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/multimodal/diffusion_model/flux.1-dev/xdit",
+ "readme_file": "models/multimodal/diffusion_model/flux.1-dev/xdit/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://modelscope.cn/models/black-forest-labs/FLUX.1-dev",
+ "need_third_part": false,
+ "precisions": [
+ "fp16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "HunyuanVideo",
+ "model_name": "hunyuan_video",
+ "framework": "xdit",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "multimodal/diffusion_model",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/multimodal/diffusion_model/hunyuan_video/xdit",
+ "readme_file": "models/multimodal/diffusion_model/hunyuan_video/xdit/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://modelscope.cn/models/Tencent-Hunyuan/HunyuanVideo",
+ "need_third_part": false,
+ "precisions": [
+ "fp16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "Wan2.1 T2V 14B",
+ "model_name": "wan2.1-t2v-14b",
+ "framework": "xdit",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "multimodal/diffusion_model",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit",
+ "readme_file": "models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B-Diffusers",
+ "need_third_part": false,
+ "precisions": [
+ "fp16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "Wan2.2 TI2V 5B",
+ "model_name": "wan2.2-ti2v-5b",
+ "framework": "xdit",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "multimodal/diffusion_model",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit",
+ "readme_file": "models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://www.modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B-Diffusers",
+ "need_third_part": false,
+ "precisions": [
+ "fp16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "HunyuanDiT v1.2",
+ "model_name": "hunyuandit-v1.2",
+ "framework": "xdit",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "multimodal/diffusion_model",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit",
+ "readme_file": "models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://modelscope.cn/models/dengcao/HunyuanDiT-v1.2-Diffusers",
+ "need_third_part": false,
+ "precisions": [
+ "fp16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
+ },
+ {
+ "display_name": "SD3 Medium",
+ "model_name": "stable-diffusion-3-medium",
+ "framework": "xdit",
+ "release_version": "26.03",
+ "release_sdk": "4.4.0",
+ "release_gpgpu": "MR-V100",
+ "latest_sdk": "4.4.0",
+ "latest_gpgpu": "",
+ "category": "multimodal/diffusion_model",
+ "toolbox": "",
+ "mdims": "",
+ "dataset": "",
+ "license": "",
+ "model_path": "models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit",
+ "readme_file": "models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md",
+ "bitbucket_repo": "",
+ "bitbucket_branch": "",
+ "bitbucket_path": "",
+ "develop_owner": "",
+ "github_repo": "",
+ "github_branch": "",
+ "github_path": "",
+ "datasets": "",
+ "download_url": "https://modelscope.cn/models/stabilityai/stable-diffusion-3-medium-diffusers",
+ "need_third_part": false,
+ "precisions": [
+ "fp16"
+ ],
+ "type": "inference",
+ "hasDemo": false,
+ "demoType": ""
}
]
}
\ No newline at end of file