diff --git a/README.md b/README.md index 0af03f6635e001599e5e3944d23ac88d3e49a2ce..3f571e07c09f495845f52a4a0827a5b23ec12238 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ | DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 | -| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.3.0 | +| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.4.0 | | DeepSeek-R1-Distill-Qwen-14B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-14b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-32B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-32b/vllm) | 4.3.0 | | DeepSeek-OCR | `Transformers` | [✅](models/multimodal/vision_language_model/deepseek-ocr/transformers) | 4.3.0 | @@ -61,7 +61,7 @@ | Qwen-7B | `vLLM` | [✅](models/nlp/llm/qwen-7b/vllm) | 4.3.0 | | Qwen-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen_vl/vllm) | 4.3.0 | | Qwen2-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_vl/vllm) | 4.3.0 | -| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.3.0 | +| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.4.0 | | Qwen1.5-7B | `vLLM` | [✅](models/nlp/llm/qwen1.5-7b/vllm) | 4.3.0 | | Qwen1.5-7B | `TGI` | [✅](models/nlp/llm/qwen1.5-7b/tgi) | 4.3.0 | | Qwen1.5-14B | `vLLM` | [✅](models/nlp/llm/qwen1.5-14b/vllm) | 4.3.0 | @@ -70,9 +70,14 @@ | Qwen2-7B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-7b/vllm) | 4.3.0 | | Qwen2-72B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-72b/vllm) | 4.3.0 | | Qwen3_Moe | `vLLM` | [✅](models/nlp/llm/qwen3-235b/vllm) | dev-only | -| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3/vllm) | 4.4.0 | +| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3-8b/vllm) | 4.4.0 | +| Qwen3-32B | `vLLM` | [✅](models/nlp/llm/qwen3-32b/vllm) | 4.4.0 | +| Qwen3-30B-A3B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-30b-a3b-thinking/vllm) | 4.4.0 | +| Qwen3-235B-A22B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-235b-a22b-thinking/vllm) | 4.4.0 | +| Qwen3-Next-80B-A3B | `vLLM` | [✅](models/nlp/llm/qwen3-next-80b-a3b/vllm) | 4.4.0 | +| DeepSeek-V3.1 | `vLLM` | [✅](models/nlp/llm/deepseek-v3.1/vllm) | 4.4.0 | | StableLM2-1.6B | `vLLM` | [✅](models/nlp/llm/stablelm/vllm) | 4.3.0 | -| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | dev-only | +| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | 4.4.0 | | Ultravox | `vLLM` | [✅](models/speech/asr/ultravox/vllm) | 4.3.0 | | Whisper | `vLLM` | [✅](models/speech/asr/whisper/vllm/) | 4.3.0 | | XLMRoberta | `vLLM` | [✅](models/multimodal/vision_language_model/xlmroberta/vllm) | 4.3.0 | @@ -323,6 +328,12 @@ | Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 | | Stable Diffusion 2.1 | ixRT | [✅](models/multimodal/diffusion_model/stable-diffusion-2.1/diffusers) | 4.4.0 | | Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | dev-only | +| FLUX.1-Dev | xDiT | [✅](models/multimodal/diffusion_model/flux.1-dev/xdit) | 4.4.0 | +| HunyuanVideo | xDiT | [✅](models/multimodal/diffusion_model/hunyuan_video/xdit) | 4.4.0 | +| Wan2.1-T2V-14B | 
xDiT | [✅](models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit) | 4.4.0 | +| Wan2.2-TI2V-5B | xDiT | [✅](models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit) | 4.4.0 | +| HunyuanDiT-v1.2 | xDiT | [✅](models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit) | 4.4.0 | +| SD3-Medium | xDiT | [✅](models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit) | 4.4.0 | ### 自然语言处理(NLP) diff --git a/README_en.md b/README_en.md index 631736f9d5a0c0605ce14cca93b394a21bafc037..08f98040ca51db95758a53295ae68950232a3970 100644 --- a/README_en.md +++ b/README_en.md @@ -46,7 +46,7 @@ inference to be expanded in the future. | DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 | -| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.3.0 | +| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.4.0 | | DeepSeek-R1-Distill-Qwen-14B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-14b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-32B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-32b/vllm) | 4.3.0 | | DeepSeek-OCR | `Transformers` | [✅](models/multimodal/vision_language_model/deepseek-ocr/transformers) | 4.3.0 | @@ -71,7 +71,7 @@ inference to be expanded in the future. | Qwen-7B | `vLLM` | [✅](models/nlp/llm/qwen-7b/vllm) | 4.3.0 | | Qwen-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen_vl/vllm) | 4.3.0 | | Qwen2-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_vl/vllm) | 4.3.0 | -| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.3.0 | +| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.4.0 | | Qwen1.5-7B | `vLLM` | [✅](models/nlp/llm/qwen1.5-7b/vllm) | 4.3.0 | | Qwen1.5-7B | `TGI` | [✅](models/nlp/llm/qwen1.5-7b/tgi) | 4.3.0 | | Qwen1.5-14B | `vLLM` | [✅](models/nlp/llm/qwen1.5-14b/vllm) | 4.3.0 | @@ -80,9 +80,14 @@ inference to be expanded in the future. 
| Qwen2-7B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-7b/vllm) | 4.3.0 |
| Qwen2-72B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-72b/vllm) | 4.3.0 |
| Qwen3_Moe | `vLLM` | [✅](models/nlp/llm/qwen3-235b/vllm) | dev-only |
-| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3/vllm) | 4.4.0 |
+| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3-8b/vllm) | 4.4.0 |
+| Qwen3-32B | `vLLM` | [✅](models/nlp/llm/qwen3-32b/vllm) | 4.4.0 |
+| Qwen3-30B-A3B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-30b-a3b-thinking/vllm) | 4.4.0 |
+| Qwen3-235B-A22B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-235b-a22b-thinking/vllm) | 4.4.0 |
+| Qwen3-Next-80B-A3B | `vLLM` | [✅](models/nlp/llm/qwen3-next-80b-a3b/vllm) | 4.4.0 |
+| DeepSeek-V3.1 | `vLLM` | [✅](models/nlp/llm/deepseek-v3.1/vllm) | 4.4.0 |
| StableLM2-1.6B | `vLLM` | [✅](models/nlp/llm/stablelm/vllm) | 4.3.0 |
-| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | dev-only |
+| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | 4.4.0 |
| Ultravox | `vLLM` | [✅](models/speech/asr/ultravox/vllm) | 4.3.0 |
| Whisper | `vLLM` | [✅](models/speech/asr/whisper/vllm/) | 4.3.0 |
| XLMRoberta | `vLLM` | [✅](models/multimodal/vision_language_model/xlmroberta/vllm) | 4.3.0 |
@@ -332,6 +337,12 @@ inference to be expanded in the future.
| Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 |
| Stable Diffusion 2.1 | ixRT | [✅](models/multimodal/diffusion_model/stable-diffusion-2.1/diffusers) | 4.4.0 |
| Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | dev-only |
+| FLUX.1-Dev | xDiT | [✅](models/multimodal/diffusion_model/flux.1-dev/xdit) | 4.4.0 |
+| HunyuanVideo | xDiT | [✅](models/multimodal/diffusion_model/hunyuan_video/xdit) | 4.4.0 |
+| Wan2.1-T2V-14B | xDiT | [✅](models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit) | 4.4.0 |
+| Wan2.2-TI2V-5B | xDiT | [✅](models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit) | 4.4.0 |
+| HunyuanDiT-v1.2 | xDiT | [✅](models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit) | 4.4.0 |
+| SD3-Medium | xDiT | [✅](models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit) | 4.4.0 |

### NLP

#### PLM (Pre-trained Language Model)

diff --git a/RELEASE.md b/RELEASE.md
index 866a42207844a63ba4c95562f2fcfa0d5017717b..b0cf8a3be90c196bdb1f94c60d4823b4d081218b 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,7 @@

| Release Date | Release Version | IXUCA SDK |
|--------------|-----------------|-----------|
+| Mar 2026 | 26.03 | v4.4.0 |
| Dec 2025 | 25.12 | v4.3.0 |
| Sep 2025 | 25.09 | v4.3.0 |
| Jun 2025 | 25.06 | v4.2.0 |
@@ -20,6 +21,104 @@

## Release Notes

+### DeepSparkInference 26.03
+
+#### 模型与算法
+
+* 新增了 16 个推理小模型示例,其中支持 IGIE 推理引擎的 10 个,支持 ixRT 推理引擎的 6 个。
+* 新增了 18 个大模型推理示例,其中 10 个使用 [vLLM](https://github.com/vllm-project/vllm),6 个使用 [xDiT](https://github.com/xdit-team/xDiT),2 个使用 ixRT。
+
+<table>
+  <tr><th colspan="3">IGIE</th></tr>
+  <tr><td>MobileViT-S</td><td>ViT-B-32</td><td>ViT-L-14</td></tr>
+  <tr><td>DETR</td><td>RT-DETR</td><td>YOLOv11m</td></tr>
+  <tr><td>YOLOv11s</td><td>YOLOv26n</td><td>YOLOv5s</td></tr>
+  <tr><td>DenseNet121 (int8)</td><td></td><td></td></tr>
+  <tr><th colspan="3">ixRT</th></tr>
+  <tr><td>Swin Transformer</td><td>RepNet</td><td>Grounding DINO</td></tr>
+  <tr><td>RT-DETR</td><td>CRNN</td><td>UNet</td></tr>
+  <tr><th colspan="3">LLM</th></tr>
+  <tr><td>DeepSeek-V3.1 (vLLM)</td><td>DeepSeek-VL2 (vLLM)</td><td>DeepSeek-OCR (vLLM)</td></tr>
+  <tr><td>InternLM3 (vLLM)</td><td>MiniCPM-V-4 (vLLM)</td><td>Qwen3-8B (vLLM)</td></tr>
+  <tr><td>Qwen3-32B (vLLM)</td><td>Qwen3-30B-A3B (vLLM)</td><td>Qwen3-235B-A22B (vLLM)</td></tr>
+  <tr><td>Qwen3-Next-80B (vLLM)</td><td>FLUX.1-Dev (xDiT)</td><td>HunyuanVideo (xDiT)</td></tr>
+  <tr><td>Wan2.1-T2V-14B (xDiT)</td><td>Wan2.2-TI2V-5B (xDiT)</td><td>HunyuanDiT-v1.2 (xDiT)</td></tr>
+  <tr><td>SD3-Medium (xDiT)</td><td>CosyVoice (ixRT)</td><td>Stable Diffusion 2.1 (ixRT)</td></tr>
+</table>
+
+#### 修复更新
+
+* 适配了 IXUCA SDK 4.4.0 版本的 CI 测试流程
+* 修复了 IGIE MViTv2-base 模型运行时缺少 pkg_resources 模块的问题
+* 修复了 vLLM 推理模型的 deprecated 参数错误并升级为离线推理模式
+* 修复了 DeepSeek-R1-Distill-Llama-8B 模型在 vLLM 0.11.2 版本上的兼容性问题
+* 修复了 Qwen-VL、Qwen2-VL、Qwen2.5-VL、Whisper 等模型的参数错误问题
+* 修复了 Pixtral 模型在 vLLM 0.11.2 版本上的兼容性问题
+* 修复了 ixRT RT-DETR 模型在 batchsize 为 64 时运行报错的问题
+* 修复了多个模型的 trust_remote_code 参数配置问题
+* 修复了 IGIE YOLOv8n 模型与 ultralytics 版本的兼容性问题
+* 修复了 YOLOx 模型的数据集路径问题
+* 修复了 IGIE ResNet 和 VGG16 模型在安装 TensorFlow 时的报错问题
+* 修复了 protobuf 版本导致的兼容性问题
+* 新增了 ARM 架构的核心绑定命令支持
+* 新增了 YOLOv8n ixRT 模型的 batchsize 参数支持
+
+#### 版本关联
+
+DeepSparkInference 26.03 对应天数软件栈 4.4.0 版本。
+
+#### 感谢以下社区贡献者
+
+YoungPeng,honglyua,majorli6,shengyan.zhao,yougouda,jinrui.zhang,tianyu,anders。
+
### DeepSparkInference 25.12

#### 模型与算法
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md b/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..95bc45a8f5472ee4b248fd2d2ff41b15f6ce92b7
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md
@@ -0,0 +1,61 @@
+# FLUX.1-Dev (xDiT)
+
+## Model Description
+
+FLUX.1-Dev is a state-of-the-art text-to-image diffusion model developed by Black Forest Labs. It excels at generating high-quality, detailed images from text prompts with exceptional prompt adherence.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+- Model: 
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. Modify the model path in ``run.sh``:
+```bash
+# The run.sh script is provided in this directory
+# Point MODEL_CONFIGS at your local FLUX.1-Dev weights (the shipped default references a FLUX.1-schnell copy)
+vim run.sh
+# Update: MODEL_CONFIGS=(["Flux"]="flux_example.py /home/data/flux___1-schnell/ 28")
+```
+
+2. Run script:
+```bash
+bash run.sh
+```
+
+3. The model supports 512×512 and 1024×1024 image sizes.
To modify:
+```bash
+vim run.sh
+# Modify TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+# to TASK_ARGS="--height 512 --width 512 --no_use_resolution_binning --guidance_scale 3.5"
+```
+
+## References
+
+- [FLUX.1](https://github.com/black-forest-labs/flux)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py b/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a6a91e9a35cc2f4433c9303a9fbaffb0321f408
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py
@@ -0,0 +1,124 @@
+import logging
+import os
+import time
+import torch
+import torch.distributed
+from transformers import T5EncoderModel
+from xfuser import xFuserFluxPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_data_parallel_rank,
+    get_data_parallel_world_size,
+    get_runtime_state,
+    is_dp_last_group,
+    get_pipeline_parallel_world_size,
+    get_classifier_free_guidance_world_size,
+    get_tensor_model_parallel_world_size,
+)
+from xfuser.model_executor.cache.diffusers_adapters import apply_cache_on_transformer
+# if os.environ.get("ENABLE_IXFORMER_CONV2D", "0") == "1":
+#     import ixformer as ixff
+#     torch.nn.functional.conv2d = ixff.conv2d
+
+
+def main():
+    torch.backends.cudnn.benchmark = False
+    parser = FlexibleArgumentParser(description="xFuser Arguments")
+    args = xFuserArgs.add_cli_args(parser).parse_args()
+    engine_args = xFuserArgs.from_cli_args(args)
+    engine_config, input_config = engine_args.create_config()
+    runtime_dtype = torch.bfloat16
+    engine_config.runtime_config.dtype = runtime_dtype
+    local_rank = get_world_group().local_rank
+    torch.cuda.set_device(local_rank)
+    text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16)
+
+    if args.use_fp8_t5_encoder:
+        from optimum.quanto import freeze, qfloat8, quantize
+        logging.info(f"rank {local_rank} quantizing text encoder 2")
+        quantize(text_encoder_2, weights=qfloat8)
+        freeze(text_encoder_2)
+
+    cache_args = {
+        "use_teacache": engine_args.use_teacache,
+        "use_fbcache": engine_args.use_fbcache,
+        "rel_l1_thresh": 0.12,
+        "return_hidden_states_first": False,
+        "num_steps": input_config.num_inference_steps,
+    }
+    pipe = xFuserFluxPipeline.from_pretrained(
+        pretrained_model_name_or_path=engine_config.model_config.model,
+        engine_config=engine_config,
+        cache_args=cache_args,
+        torch_dtype=runtime_dtype,
+        text_encoder_2=text_encoder_2,
+    )
+    # keep the VAE in fp32 for numerical stability
+    pipe.vae = pipe.vae.to(dtype=torch.float32)
+
+    if args.enable_sequential_cpu_offload:
+        pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} sequential CPU offload enabled")
+    elif args.enable_model_cpu_offload:
+        pipe.enable_model_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} model CPU offload enabled")
+    else:
+        pipe = pipe.to(local_rank)
+
+    parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+    if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1":
+        from w8a8_linear import apply_quant_linear_i8w8o16
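+        # Note: w8a8_linear.py (the dynamic int8 W8A8 linear swap) ships with the
+        # HunyuanVideo xDiT example in this repo; copy it next to this script or add it
+        # to PYTHONPATH before running with ENABLE_IXFORMER_W8A8LINEAR=1 (the default in run.sh).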
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+    # pipe.transformer.fuse_qkv_projections()
+    pipe.prepare_run(input_config, steps=input_config.num_inference_steps)
+
+    torch.cuda.reset_peak_memory_stats()
+    start_time = time.time()
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        prompt=input_config.prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        output_type=input_config.output_type,
+        max_sequence_length=256,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+    )
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"tp{engine_args.tensor_parallel_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    if input_config.output_type == "pil":
+        dp_group_index = get_data_parallel_rank()
+        num_dp_groups = get_data_parallel_world_size()
+        dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
+        if pipe.is_dp_last_group():
+            for i, image in enumerate(output.images):
+                image_rank = dp_group_index * dp_batch_size + i
+                image_name = f"flux_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
+                image.save(f"./{image_name}")
+                print(f"image {i} saved to ./{image_name}")
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(
+            f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
+        )
+    get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt b/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..92e80ab221aa39de5448471b3bc63121b1dbbd37
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt
@@ -0,0 +1,8 @@
+#diffusers
+yunchang
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh b/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..163e088888c3ec0cd88b0d5e3d50bdd8a1a00990
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh
@@ -0,0 +1,72 @@
+# set -x
+export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2 # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use SageAttention (xdit == 0.4.4)
+export ENABLE_IXFORMER_W8A8LINEAR=1
+
+# Select the model type
+export MODEL_TYPE="Flux"
+# Configuration for different model types
+# script, model_id, inference_step
+declare -A MODEL_CONFIGS=(
+    ["Flux"]="flux_example.py /home/data/flux___1-schnell/ 28"
+)
+
+echo ${MODEL_CONFIGS[$MODEL_TYPE]}
+
+# if [ -v MODEL_CONFIGS[$MODEL_TYPE] ] ; then
+if [ -n "${MODEL_CONFIGS[$MODEL_TYPE]+_}" ]; then
+    IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
+    export SCRIPT MODEL_ID INFERENCE_STEP
+else
+    echo "Invalid MODEL_TYPE: $MODEL_TYPE"
+    exit 1
+fi
+
+# task args
+TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+
+# cache args
+# CACHE_ARGS="--use_teacache"
+# CACHE_ARGS="--use_fbcache"
+
+# 2 GPUs: pipefusion_parallel_degree=2, ulysses=1, ring=1 (CFG parallel disabled)
+N_GPUS=2
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1"
+
+# CFG_ARGS="--use_cfg_parallel"
+
+# By default, num_pipeline_patch = pipefusion_degree; tune this parameter for optimal performance.
+# PIPEFUSION_ARGS="--num_pipeline_patch 8 "
+
+# For high-resolution images, use the latent output type to avoid running the VAE module (for measuring speed only).
+# OUTPUT_ARGS="--output_type latent"
+
+# PARALLEL_VAE="--use_parallel_vae"
+
+# Another compile option is `--use_onediff`, which uses onediff's compiler.
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 1 \
+--prompt "brown dog laying on the ground with a metal bowl in front of him." \
+$CFG_ARGS \
+$PARALLEL_VAE \
+$COMPILE_FLAG \
+$QUANTIZE_FLAG \
+$CACHE_ARGS \
+
diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..25717b05019a5509b3a2a1e032adac21f3713df0
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md
@@ -0,0 +1,58 @@
+# HunyuanDiT-v1.2-Diffusers (xDiT)
+
+## Model Description
+
+HunyuanDiT-v1.2 is Tencent's advanced text-to-image diffusion model, featuring improved architecture and training for high-quality image generation. It excels at generating detailed, photorealistic images from text descriptions.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are provided in this directory. Modify the model path:
+```bash
+vim run_hunyuandit.sh
+# Update MODEL_ID to your actual model path
+```
+
+2. Run script:
+```bash
+bash run_hunyuandit.sh
+```
+
+3. The model supports batch size 1 and 2; the prompt format differs by batch size:
+```bash
+# BS=1 (default) prompt format
+#--prompt "brown dog laying on the ground with a metal bowl in front of him."
+# BS=2 prompt format
+--prompt "brown dog laying on the ground with a metal bowl in front of him." "brown dog laying on the ground with a metal bowl in front of him."
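+# Each quoted string is one prompt; the number of prompts passed sets the batch size.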
+``` + +## References + +- [HunyuanDiT](https://github.com/Tencent/HunyuanDiT) +- [xDiT](https://github.com/xdit-team/xDiT) \ No newline at end of file diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py new file mode 100644 index 0000000000000000000000000000000000000000..b26a4503848bf371699693406f04ebf9e052a746 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py @@ -0,0 +1,92 @@ +import time +import os +import torch +import torch.distributed +from transformers import T5EncoderModel +from xfuser import xFuserHunyuanDiTPipeline, xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + is_dp_last_group, + get_data_parallel_world_size, + get_runtime_state, + get_data_parallel_rank, +) + +def main(): + + # torch.backends.cudnn.benchmark=False + + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + torch.cuda.set_device(local_rank) + text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16) + if args.use_fp8_t5_encoder: + from optimum.quanto import freeze, qfloat8, quantize + print(f"rank {local_rank} quantizing text encoder 2") + quantize(text_encoder_2, weights=qfloat8) + freeze(text_encoder_2) + + pipe = xFuserHunyuanDiTPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + engine_config=engine_config, + torch_dtype=torch.float16, + text_encoder_2=text_encoder_2, + ).to(f"cuda:{local_rank}") + pipe.vae.to(memory_format=torch.channels_last) + parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + import os + if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1": + from w8a8_linear import apply_quant_linear_i8w8o16 + pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer) + pipe.prepare_run(input_config) + + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + output = pipe( + height=input_config.height, + width=input_config.width, + prompt=input_config.prompt, + num_inference_steps=input_config.num_inference_steps, + output_type=input_config.output_type, + use_resolution_binning=input_config.use_resolution_binning, + guidance_scale=input_config.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed(input_config.seed), + ) + end_time = time.time() + elapsed_time = end_time - start_time + peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + + parallel_info = ( + f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" + f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" + f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" + ) + if input_config.output_type == "pil": + dp_group_index = get_data_parallel_rank() + num_dp_groups = get_data_parallel_world_size() + dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups + if pipe.is_dp_last_group(): + if not os.path.exists("results"): + os.mkdir("results") + for i, image in enumerate(output.images): + image_rank = dp_group_index * dp_batch_size + i + image.save( + 
f"./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png" + ) + print( + f"image {i} saved to ./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png" + ) + + if get_world_group().rank == get_world_group().world_size - 1: + print( + f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB" + ) + get_runtime_state().destroy_distributed_env() + + +if __name__ == "__main__": + main() diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2969a4385d913c98a2cb13adfa2bb29f3d3f0938 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt @@ -0,0 +1,9 @@ +#diffusers +yunchang +ftfy +transformers>=4.55 +numpy==1.26.4 +imageio +imageio-ffmpeg +distvae + diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh new file mode 100644 index 0000000000000000000000000000000000000000..c136150e280f7b1f260977707f062515974491fc --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh @@ -0,0 +1,45 @@ +# set -x +export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 +export PYTHONPATH=$PWD:$PYTHONPATH + +#多ring 没提升 +# export NCCL_USE_HIGHPRIORITYWARP=1 + +export ENABLE_IXFORMER_INFERENCE=1 +# export ATTN_OPT_LEVEL=2 +export ENABLE_IXFORMER_W8A8LINEAR=0 + +# Select the model type +SCRIPT=hunyuandit_example.py +MODEL_ID=/data/nlp/HunyuanDiT-v1.2-Diffusers/ +INFERENCE_STEP=20 + +mkdir -p ./results + +# task args +TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5" + +# cache args +# CACHE_ARGS="--use_teacache" +# CACHE_ARGS="--use_fbcache" + +N_GPUS=2 +PARALLEL_ARGS="--pipefusion_parallel_degree 1 --ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 1 --data_parallel_degree 1" +CFG_ARGS="--use_cfg_parallel" + +torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \ +--model $MODEL_ID \ +$PARALLEL_ARGS \ +$TASK_ARGS \ +$PIPEFUSION_ARGS \ +$OUTPUT_ARGS \ +--num_inference_steps $INFERENCE_STEP \ +--warmup_steps 1 \ +--prompt "brown dog laying on the ground with a metal bowl in front of him." \ +$CFG_ARGS \ +$PARALLLEL_VAE \ +$COMPILE_FLAG \ +$QUANTIZE_FLAG \ +$CACHE_ARGS \ + diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md b/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1a9dd1c9e23f18807bc564b6e2efd811f10d0937 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md @@ -0,0 +1,50 @@ +# HunyuanVideo (xDiT) + +## Model Description + +HunyuanVideo is Tencent's advanced text-to-video diffusion model capable of generating high-quality videos from text descriptions. It features excellent motion coherence, visual quality, and text understanding capabilities. + +This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs. 
+ +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.03 | + +## Model Preparation + +### Prepare Resources + +- Model: + +### Install Dependencies + +1. Install Iluvatar CoreX adapted framework: +```bash +pip install diffusers-{version}-py3-none-any.whl +pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl +``` + +2. Install dependencies: +```bash +pip install -r requirements.txt +``` + +## Model Inference + +1. The scripts are pre-copied in this directory. Modify model path in ``run_hunyuan_video_usp_teacache.sh``: +```bash +vim run_hunyuan_video_usp_teacache.sh +# Update: MODEL_ID="/data/nlp/HunyuanVideo/" to your actual path +``` + +2. Run script: +```bash +bash run_hunyuan_video_usp_teacache.sh +``` + +## References + +- [HunyuanVideo](https://github.com/Tencent/HunyuanVideo) +- [xDiT](https://github.com/xdit-team/xDiT) \ No newline at end of file diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1c549f223d0d9906ae55129ec3f4413fbd322a --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py @@ -0,0 +1,333 @@ +# from https://github.com/chengzeyi/ParaAttention/blob/main/examples/run_hunyuan_video.py +import functools +from typing import Any, Dict, Union, Optional +import logging +import time + +import torch + +from diffusers import DiffusionPipeline, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.utils import scale_lora_layers, unscale_lora_layers, USE_PEFT_BACKEND +from diffusers.utils import export_to_video +from xfuser.model_executor.models.customized.hunyuan_video.tp_applicator import TensorParallelApplicator +from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from xfuser import xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + get_data_parallel_world_size, + get_data_parallel_rank, + get_runtime_state, + get_classifier_free_guidance_world_size, + get_classifier_free_guidance_rank, + get_cfg_group, + get_sequence_parallel_world_size, + get_sequence_parallel_rank, + get_sp_group, + is_dp_last_group, + initialize_runtime_state, + get_pipeline_parallel_world_size, +) + +from xfuser.model_executor.layers.attention_processor import xFuserHunyuanVideoAttnProcessor2_0 + +assert xFuserHunyuanVideoAttnProcessor2_0 is not None + + +def parallelize_transformer(pipe: DiffusionPipeline): + transformer = pipe.transformer + + @functools.wraps(transformer.__class__.forward) + def new_forward( + self, + hidden_states: torch.Tensor, + timestep: torch.LongTensor, + encoder_hidden_states: torch.Tensor, + encoder_attention_mask: torch.Tensor, + pooled_projections: torch.Tensor, + guidance: torch.Tensor = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + if attention_kwargs is not None: + attention_kwargs = attention_kwargs.copy() + lora_scale = attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + + if USE_PEFT_BACKEND: + # weight the lora layers by 
setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None: + logging.warning("Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective.") + + batch_size, num_channels, num_frames, height, width = hidden_states.shape + assert batch_size % get_classifier_free_guidance_world_size( + ) == 0, f"Cannot split dim 0 of hidden_states ({batch_size}) into {get_classifier_free_guidance_world_size()} parts." + + p, p_t = self.config.patch_size, self.config.patch_size_t + post_patch_num_frames = num_frames // p_t + post_patch_height = height // p + post_patch_width = width // p + + # 1. RoPE + image_rotary_emb = self.rope(hidden_states) + + # 2. Conditional embeddings + # temb = self.time_text_embed(timestep, guidance, pooled_projections) + temb, token_replace_emb = self.time_text_embed(timestep,pooled_projections, guidance) + hidden_states = self.x_embedder(hidden_states) + encoder_hidden_states = self.context_embedder(encoder_hidden_states, + timestep, + encoder_attention_mask) + + hidden_states = hidden_states.reshape(batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1) + hidden_states = hidden_states.flatten(1, 3) + + hidden_states = torch.chunk(hidden_states, + get_classifier_free_guidance_world_size(), + dim=0)[get_classifier_free_guidance_rank()] + hidden_states = torch.chunk(hidden_states, + get_sequence_parallel_world_size(), + dim=-2)[get_sequence_parallel_rank()] + + encoder_attention_mask = encoder_attention_mask[0].to(torch.bool) + encoder_hidden_states_indices = torch.arange( + encoder_hidden_states.shape[1], + device=encoder_hidden_states.device) + encoder_hidden_states_indices = encoder_hidden_states_indices[ + encoder_attention_mask] + encoder_hidden_states = encoder_hidden_states[ + ..., encoder_hidden_states_indices, :] + if encoder_hidden_states.shape[-2] % get_sequence_parallel_world_size( + ) != 0: + get_runtime_state().split_text_embed_in_sp = False + else: + get_runtime_state().split_text_embed_in_sp = True + + encoder_hidden_states = torch.chunk( + encoder_hidden_states, + get_classifier_free_guidance_world_size(), + dim=0)[get_classifier_free_guidance_rank()] + if get_runtime_state().split_text_embed_in_sp: + encoder_hidden_states = torch.chunk( + encoder_hidden_states, + get_sequence_parallel_world_size(), + dim=-2)[get_sequence_parallel_rank()] + + freqs_cos, freqs_sin = image_rotary_emb + + def get_rotary_emb_chunk(freqs): + freqs = torch.chunk(freqs, get_sequence_parallel_world_size(), dim=0)[get_sequence_parallel_rank()] + return freqs + + freqs_cos = get_rotary_emb_chunk(freqs_cos) + freqs_sin = get_rotary_emb_chunk(freqs_sin) + image_rotary_emb = (freqs_cos, freqs_sin) + + # 4. 
Transformer blocks + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} + + for block in self.transformer_blocks: + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + temb, + None, + image_rotary_emb, + **ckpt_kwargs, + ) + + for block in self.single_transformer_blocks: + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + temb, + None, + image_rotary_emb, + **ckpt_kwargs, + ) + + else: + for block in self.transformer_blocks: + hidden_states, encoder_hidden_states = block( + hidden_states, encoder_hidden_states, temb, None, + image_rotary_emb) + + for block in self.single_transformer_blocks: + hidden_states, encoder_hidden_states = block( + hidden_states, encoder_hidden_states, temb, None, + image_rotary_emb) + + # 5. Output projection + hidden_states = self.norm_out(hidden_states, temb) + hidden_states = self.proj_out(hidden_states) + + hidden_states = get_sp_group().all_gather(hidden_states, dim=-2) + hidden_states = get_cfg_group().all_gather(hidden_states, dim=0) + + hidden_states = hidden_states.reshape(batch_size, + post_patch_num_frames, + post_patch_height, + post_patch_width, -1, p_t, p, p) + + hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7) + hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + if not return_dict: + return (hidden_states, ) + + return Transformer2DModelOutput(sample=hidden_states) + + new_forward = new_forward.__get__(transformer) + transformer.forward = new_forward + + for block in transformer.transformer_blocks + transformer.single_transformer_blocks: + block.attn.processor = xFuserHunyuanVideoAttnProcessor2_0() + + +def main(): + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + + assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." 
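+    # This example shards the video token sequence across ranks with unified sequence
+    # parallelism (Ulysses attention + ring attention), optionally splitting CFG across
+    # ranks as well, which is why PipeFusion is rejected above.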
+ assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for HunyuanVideo" + + transformer = HunyuanVideoTransformer3DModel.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + subfolder="transformer", + torch_dtype=torch.bfloat16, + revision="refs/pr/18", + ) + pipe = HunyuanVideoPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + transformer=transformer, + torch_dtype=torch.float16, + revision="refs/pr/18", + ) + + initialize_runtime_state(pipe, engine_config) + get_runtime_state().set_video_input_parameters( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + batch_size=1, + num_inference_steps=input_config.num_inference_steps, + split_text_embed_in_sp=get_pipeline_parallel_world_size() == 1, + ) + + + if args.tensor_parallel_degree > 1: + tp_applicator = TensorParallelApplicator(get_tensor_model_parallel_world_size(), get_tensor_model_parallel_rank()) + tp_applicator.apply_to_model(pipe.transformer) + tp_applicator.apply_to_llamamodel(pipe.text_encoder) + + parallelize_transformer(pipe) + if args.enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} sequential CPU offload enabled") + elif args.enable_model_cpu_offload: + pipe.enable_model_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} model CPU offload enabled") + else: + device = torch.device(f"cuda:{local_rank}") + pipe = pipe.to(device) + + if args.enable_tiling: + pipe.vae.enable_tiling( + # Make it runnable on GPUs with 48GB memory + # tile_sample_min_height=128, + # tile_sample_stride_height=96, + # tile_sample_min_width=128, + # tile_sample_stride_width=96, + # tile_sample_min_num_frames=32, + # tile_sample_stride_num_frames=24, + ) + + if args.enable_slicing: + pipe.vae.enable_slicing() + + parameter_peak_memory = torch.cuda.max_memory_allocated( + device=f"cuda:{local_rank}") + + if engine_config.runtime_config.use_torch_compile: + torch._inductor.config.reorder_for_compute_comm_overlap = True + pipe.transformer.compile() + + # one step to warmup the torch compiler + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=1, + guidance_scale=input_config.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=input_config.num_inference_steps, + guidance_scale=input_config.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + + end_time = time.time() + elapsed_time = end_time - start_time + peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + + parallel_info = ( + f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" + f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" + f"tp{engine_args.tensor_parallel_degree}_" + f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" + ) + if is_dp_last_group(): + resolution = f"{input_config.width}x{input_config.height}" + output_filename = f"results/hunyuan_video_{parallel_info}_{resolution}.mp4" + export_to_video(output, 
output_filename, fps=15) + print(f"output saved to {output_filename}") + + if get_world_group().rank == get_world_group().world_size - 1: + print( + f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9} GB" + ) + get_runtime_state().destroy_distributed_env() + + +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 320 --width 512 --num_frames 61 --enable_tiling --enable_model_cpu_offload +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 544 --width 960 --num_frames 129 --enable_tiling --enable_model_cpu_offload +if __name__ == "__main__": + main() diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py new file mode 100644 index 0000000000000000000000000000000000000000..ffca2c6eb0de7a1cfc9610b1aa3b32b406b6e44b --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py @@ -0,0 +1,180 @@ +# from https://github.com/chengzeyi/ParaAttention/blob/main/examples/run_hunyuan_video.py +import functools +from typing import Any, Dict, Union, Optional +import logging +import time + +import torch + +from diffusers import DiffusionPipeline, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel +from xfuser import xFuserHunyuanVideoPipeline +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.utils import scale_lora_layers, unscale_lora_layers, USE_PEFT_BACKEND +from diffusers.utils import export_to_video +from xfuser.model_executor.models.customized.hunyuan_video.tp_applicator import TensorParallelApplicator +from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from xfuser import xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + get_data_parallel_world_size, + get_data_parallel_rank, + get_runtime_state, + get_classifier_free_guidance_world_size, + get_classifier_free_guidance_rank, + get_cfg_group, + get_sequence_parallel_world_size, + get_sequence_parallel_rank, + get_sp_group, + is_dp_last_group, + initialize_runtime_state, + get_pipeline_parallel_world_size, +) + +from xfuser.model_executor.layers.attention_processor import xFuserHunyuanVideoAttnProcessor2_0 + +assert xFuserHunyuanVideoAttnProcessor2_0 is not None +from w8a8_linear import apply_quant_linear_i8w8o16 + + +def main(): + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + + assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." 
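+    # TeaCache / first-block cache reuse transformer outputs on denoising steps whose
+    # timestep embeddings change little; rel_l1_thresh (set below) controls how much
+    # accumulated change forces a full recompute (higher = faster but lossier).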
+ # assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for HunyuanVideo" + + transformer = HunyuanVideoTransformer3DModel.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + subfolder="transformer", + torch_dtype=torch.bfloat16, + revision="refs/pr/18", + ) + rel_l1_thresh =0.12 + if engine_args.use_fbcache: + rel_l1_thresh = 0.06 + cache_args = { + "use_teacache": engine_args.use_teacache, + "use_fbcache": engine_args.use_fbcache, + "rel_l1_thresh": rel_l1_thresh, + "return_hidden_states_first": True, + "num_steps": input_config.num_inference_steps, + } + # pipe = HunyuanVideoPipeline.from_pretrained( + pipe = xFuserHunyuanVideoPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + transformer=transformer, + torch_dtype=torch.float16, + revision="refs/pr/18", + engine_config=engine_config, + cache_args=cache_args, + ) + + # initialize_runtime_state(pipe, engine_config) + get_runtime_state().set_video_input_parameters( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + batch_size=1, + num_inference_steps=input_config.num_inference_steps, + split_text_embed_in_sp=get_pipeline_parallel_world_size() == 1, + ) + + + if args.tensor_parallel_degree > 1: + tp_applicator = TensorParallelApplicator(get_tensor_model_parallel_world_size(), get_tensor_model_parallel_rank()) + tp_applicator.apply_to_model(pipe.transformer) + tp_applicator.apply_to_llamamodel(pipe.text_encoder) + + pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer) + pipe.text_encoder=apply_quant_linear_i8w8o16(pipe.text_encoder) + + if args.enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} sequential CPU offload enabled") + elif args.enable_model_cpu_offload: + pipe.enable_model_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} model CPU offload enabled") + else: + device = torch.device(f"cuda:{local_rank}") + pipe = pipe.to(device) + + if args.enable_tiling: + pipe.vae.enable_tiling() + + if args.enable_slicing: + pipe.vae.enable_slicing() + + parameter_peak_memory = torch.cuda.max_memory_allocated( + device=f"cuda:{local_rank}") + + if engine_config.runtime_config.use_torch_compile: + torch._inductor.config.reorder_for_compute_comm_overlap = True + pipe.transformer = torch.compile(pipe.transformer, + mode="max-autotune-no-cudagraphs") + + # one step to warmup the torch compiler + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=1, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + warmup =False + if warmup: + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=1, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=input_config.num_inference_steps, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + + end_time = time.time() + elapsed_time = end_time - start_time + peak_memory = 
torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + + parallel_info = ( + f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" + f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" + f"tp{engine_args.tensor_parallel_degree}_" + f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" + ) + if is_dp_last_group(): + resolution = f"{input_config.width}x{input_config.height}" + output_filename = f"results/hunyuan_video_{parallel_info}_{resolution}.mp4" + export_to_video(output, output_filename, fps=15) + print(f"output saved to {output_filename}") + + if get_world_group().rank == get_world_group().world_size - 1: + print( + f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9} GB" + ) + get_runtime_state().destroy_distributed_env() + + +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 320 --width 512 --num_frames 61 --enable_tiling --enable_model_cpu_offload +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 544 --width 960 --num_frames 129 --enable_tiling --enable_model_cpu_offload +if __name__ == "__main__": + main() diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt b/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a81dd7eef2aec1fab733810b1ed8531ac1515a4 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt @@ -0,0 +1,9 @@ +yunchang +diffusers +ftfy +transformers>=4.55 +numpy==1.26.4 +imageio +imageio-ffmpeg +distvae + diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh new file mode 100644 index 0000000000000000000000000000000000000000..ba4dd3ed3735cae1084a6130987c3e41d8bd93f7 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -x + +export PYTHONPATH=$PWD:$PYTHONPATH +export NCCL_USE_HIGHPRIORITYWARP=1 +export ENABLE_IXFORMER_INFERENCE=1 +export ATTN_OPT_LEVEL=2 #xdit >=0.4.5 +export ENABLE_IXFORMER_SAGEATTN=1 #使用 sageattention,#xdit ==0.4.4 + +SCRIPT="hunyuan_video_usp_example.py" +MODEL_ID="/data/nlp/HunyuanVideo/" +INFERENCE_STEP=50 +mkdir -p ./results + +TASK_ARGS="--height 720 --width 1280 --num_frames 133 --guidance_scale 5.0" + +N_GPUS=8 +PARALLEL_ARGS="--ulysses_degree 4 --ring_degree 2" +ENABLE_TILING="--enable_tiling" +ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload" +COMPILE_FLAG="--use_torch_compile" + +torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \ +--model $MODEL_ID \ +$PARALLEL_ARGS \ +$TASK_ARGS \ +$PIPEFUSION_ARGS \ +$OUTPUT_ARGS \ +--num_inference_steps $INFERENCE_STEP \ +--warmup_steps 0 \ +--prompt "A cat walks on the grass, realistic" \ +$CFG_ARGS \ +$PARALLLEL_VAE \ +$ENABLE_TILING \ +$ENABLE_MODEL_CPU_OFFLOAD \ +$COMPILE_FLAG diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh 
b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fab6e8fceeb13793e330cf90b4e9c21b1543df30
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+set -x
+
+export PYTHONPATH=$PWD:$PYTHONPATH
+export NCCL_USE_HIGHPRIORITYWARP=1
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2 # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use SageAttention (xdit == 0.4.4)
+
+SCRIPT="hunyuan_video_usp_example_teacache.py"
+MODEL_ID="/data/nlp/HunyuanVideo/"
+
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+TASK_ARGS="--height 720 --width 1280 --num_frames 129 --seed 24"
+
+# HunyuanVideo parallel configuration
+N_GPUS=8
+PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 8"
+ENABLE_TILING="--enable_tiling"
+ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload"
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A cat walks on the grass, realistic" \
+$CFG_ARGS \
+$PARALLEL_VAE \
+$ENABLE_TILING \
+$ENABLE_MODEL_CPU_OFFLOAD \
+$COMPILE_FLAG \
+--use_teacache
+
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..d320f03d0eae209c68db65e1f2ea438cb79a708c
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py
@@ -0,0 +1,106 @@
+import torch
+from typing import Optional
+from torch.nn.parameter import Parameter
+from ixformer.inference.functions.w8a8 import w8a8, dynamic_scaled_int8_quant
+
+
+def perchannel_quantize_weight_int8(weight: torch.Tensor):
+    # symmetric per-output-channel int8 quantization: scale = max|w| / 127
+    weight = weight.cpu().to(torch.float32)
+    n_bit = 8
+    eps = 1e-5
+    max_int = 2**(n_bit - 1) - 1
+    min_int = -(2**(n_bit - 1) - 1)
+    max_val = weight.abs().amax(dim=-1, keepdim=True)
+    # max_val = max_val.clamp(min=eps)
+    qscale = max_val / max_int
+    qweight = torch.clamp(torch.round(weight * (1.0 / qscale)), min_int,
+                          max_int).to(torch.int8)
+    qscale = qscale.squeeze().to(torch.float32)
+    return qweight, qscale
+
+
+class DynamicQuantizeLinear(torch.nn.Module):
+    """Drop-in replacement for nn.Linear: int8 weights, dynamically quantized int8 activations."""
+
+    def __init__(self,
+                 unquantized: torch.nn.Module,
+                 output_dtype: Optional[torch.dtype] = None,
+                 ):
+        super().__init__()
+        assert isinstance(unquantized, torch.nn.Linear)
+        self.in_features = unquantized.in_features
+        self.out_features = unquantized.out_features
+
+        self.device = unquantized.weight.device
+        self.output_dtype = output_dtype
+
+        qweight, qscale = perchannel_quantize_weight_int8(unquantized.weight)
+        self.weight = Parameter(qweight.to(self.device), requires_grad=False)
+        self.scale = Parameter(qscale.to(self.device), requires_grad=False)
+
+        if unquantized.bias is not None:
+            self.bias = unquantized.bias.to(self.device)
+        else:
+            self.register_parameter("bias", None)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        device = self.weight.device
+        assert x.device == device
+        output_dtype = x.dtype if self.output_dtype is None else self.output_dtype
+        # quantize activations on the fly (per-token scales), then run the int8 GEMM
+        inputs = torch.empty(x.shape, dtype=torch.int8, device=device)
+        i_scales = torch.empty(x.shape[:-1], dtype=torch.float32, device=device)
+        dynamic_scaled_int8_quant(inputs, x.contiguous(), i_scales)
+
+        output = torch.empty(
+            (inputs.shape[:-1] +
(self.weight.shape[0],)), + dtype=output_dtype, + device=device, + ) + + out = w8a8(inputs, self.weight, i_scales, self.scale,self.bias, output) + # if self.bias is not None: + # out =out +self.bias + return out + +def _is_linear(mod, *args): + # return isinstance(mod, torch.nn.Linear) and args[0] in ["to_qkv", "to_added_qkv", "proj"] + # if isinstance(mod, torch.nn.Linear): + # print(args[0]) + return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ("attn1" in args[0] or "attn" in args[0] or "ff" in args[0] or "proj_mlp" in args[0] or "proj_out" in args[0]) + +def _is_linear_flux(mod, *args): + # return isinstance(mod, torch.nn.Linear) and args[0] in ["to_qkv", "to_added_qkv", "proj"] + # if isinstance(mod, torch.nn.Linear): + # print(args[0]) + return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ( "attn" in args[0] or "ff" in args[0] or "proj_out" in args[0] ) + +def _is_linear_sd3(mod, *args): + return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ("attn" in args[0] or "ff" in args[0] or "proj_out" in args[0]) + +def _is_linear_hunyuandit(mod, *args): + return isinstance(mod, torch.nn.Linear) and "blocks" in args[0] + +def _is_wan_linear(mod, *args): + return isinstance(mod, torch.nn.Linear) and ("attn1" in args[0] or "attn" in args[0] or "attn2" in args[0] or "ffn" in args[0] or "proj_out" in args[0]) + + +def apply_quant_linear_i8w8o16(model, cls=DynamicQuantizeLinear, filter_fn = None): + if filter_fn is None: + filter_fn = _is_linear + if type(model).__name__ == "FluxTransformer2DModel" or type(model).__name__ == "xFuserFluxTransformer2DWrapper": + filter_fn = _is_linear_flux + elif type(model).__name__ == "HunyuanDiT2D" or type(model).__name__ == "xFuserHunyuanDiT2DWrapper": + filter_fn = _is_linear_hunyuandit + # elif type(model).__name__ == "SD3Transformer2DModel" or type(model).__name__ == "xFuserSD3Transformer2DWrapper": + # filter_fn = _is_linear_sd3 + elif type(model).__name__ == "WanTransformer3DModel" or type(model).__name__ == "xFuserWanTransformer3DModelWrapper": + filter_fn = _is_wan_linear + # for name, child in model.named_children(): + # if filter_fn(child, name): + # setattr(model, name, cls(child)) + # else: + # apply_quant_linear_i8w8o16(child, cls, filter_fn) + for name, m in model.named_modules(): + if filter_fn(m,name): + parent_module_name, child_name = name.rsplit('.', 1) if '.' in name else ('', name) + parent_module = model.get_submodule(parent_module_name) + # print(parent_module_name,name) + setattr(parent_module, child_name, cls(m)) + return model \ No newline at end of file diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4dc2cd9c081086e96d65253f691053f126db1a5b --- /dev/null +++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md @@ -0,0 +1,50 @@ +# Stable Diffusion 3 Medium (xDiT) + +## Model Description + +Stable Diffusion 3 Medium is Stability AI's latest text-to-image diffusion model, featuring significant improvements in image quality, prompt adherence, and typography rendering. It uses a new Multimodal Diffusion Transformer (MMDiT) architecture with separate sets of weights for text and image encoders. + +This version runs on the xDiT framework, optimized for Iluvatar CoreX GPUs. 
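+
+As a rule of thumb in xDiT, the product of the configured parallel degrees (times 2 when --use_cfg_parallel is on) should equal the number of ranks launched by torchrun; the provided run_sd3.sh pairs pipefusion_parallel_degree=2 with ring_degree=2 on N_GPUS=4. A minimal 2-GPU variant (an illustrative sketch, not a shipped configuration):
+
+```bash
+N_GPUS=2
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 1 --data_parallel_degree 1"
+```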
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install the Iluvatar CoreX adapted frameworks:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install the remaining dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify the model path:
+```bash
+vim run_sd3.sh
+# Update MODEL_ID to your actual model path
+```
+
+2. Run the script:
+```bash
+bash run_sd3.sh
+```
+
+## References
+
+- [Stable Diffusion 3](https://github.com/Stability-AI/stable-diffusion)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2969a4385d913c98a2cb13adfa2bb29f3d3f0938
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..98f21c9fd429500abc55db01ac637626bbe7f546
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh
@@ -0,0 +1,45 @@
+# set -x
+export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# multiple rings showed no improvement here
+# export NCCL_USE_HIGHPRIORITYWARP=1
+
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2           # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use sageattention; xdit == 0.4.4
+export ENABLE_IXFORMER_W8A8LINEAR=1
+
+SCRIPT=sd3_example.py
+MODEL_ID=/data/nlp/stable-diffusion-3-medium-diffusers
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+# task args
+TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+
+N_GPUS=4
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 2 --tensor_parallel_degree 1 --data_parallel_degree 1"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 1 \
+--prompt "brown dog laying on the ground with a metal bowl in front of him." \
+$CFG_ARGS \
+$PARALLEL_VAE \
+$COMPILE_FLAG \
+$QUANTIZE_FLAG \
+$CACHE_ARGS
+
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc86bd64ff213a6884a1750f22e0aac67bef071
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py
@@ -0,0 +1,132 @@
+import time
+import os
+import torch
+import torch.distributed
+import torch.nn as nn
+from transformers import T5EncoderModel
+from xfuser import xFuserStableDiffusion3Pipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_data_parallel_rank,
+    get_runtime_state,
+)
+from xfuser.core.distributed.parallel_state import get_data_parallel_world_size
+
+from apex.normalization.fused_layer_norm import FusedRMSNorm
+
+
+class T5LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # Compute variance without subtracting the mean (RMSNorm).
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        # Cast back to half precision if needed.
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+
+        return self.weight * hidden_states
+
+
+# Work around an apex FusedRMSNorm issue
+# (https://github.com/huggingface/transformers/issues/20287) by swapping in a
+# plain T5-style RMSNorm.
+def replace_fused_rmsnorm_with_t5(module):
+    for name, child in module.named_children():
+        if isinstance(child, FusedRMSNorm):
+            hidden_size = child.weight.shape[0]
+            eps = getattr(child, "eps", 1e-6)
+            new_ln = T5LayerNorm(hidden_size, eps=eps)
+            new_ln.weight.data = child.weight.data.clone()
+            setattr(module, name, new_ln)
+        else:
+            replace_fused_rmsnorm_with_t5(child)
+
+
+def main():
+    parser = FlexibleArgumentParser(description="xFuser Arguments")
+    args = xFuserArgs.add_cli_args(parser).parse_args()
+    engine_args = xFuserArgs.from_cli_args(args)
+    engine_config, input_config = engine_args.create_config()
+    local_rank = get_world_group().local_rank
+    torch.cuda.set_device(local_rank)
+    text_encoder_3 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_3", torch_dtype=torch.float16)
+    if args.use_fp8_t5_encoder:
+        from optimum.quanto import freeze, qfloat8, quantize
+        print(f"rank {local_rank} quantizing text encoder 3")
+        quantize(text_encoder_3, weights=qfloat8)
+        freeze(text_encoder_3)
+
+    pipe = xFuserStableDiffusion3Pipeline.from_pretrained(
+        pretrained_model_name_or_path=engine_config.model_config.model,
+        engine_config=engine_config,
+        torch_dtype=torch.float16,
+        text_encoder_3=text_encoder_3,
+    ).to(f"cuda:{local_rank}")
+
+    replace_fused_rmsnorm_with_t5(text_encoder_3)
+    parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+    if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1":
+        from w8a8_linear import apply_quant_linear_i8w8o16
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+
+    pipe.prepare_run(input_config)
+
+    torch.cuda.reset_peak_memory_stats()
+    start_time = time.time()
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        prompt=input_config.prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        output_type=input_config.output_type,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+    )
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    if input_config.output_type == "pil":
+        dp_group_index = get_data_parallel_rank()
+        num_dp_groups = get_data_parallel_world_size()
+        dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
+        if pipe.is_dp_last_group():
+            if not os.path.exists("results"):
+                os.mkdir("results")
+            for i, image in enumerate(output.images):
+                image_rank = dp_group_index * dp_batch_size + i
+                image.save(
+                    f"./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
+                )
+                print(
+                    f"image {i} saved to ./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
+                )
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(
+            f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
+        )
+
+    get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..01a4357fba4260239c00a819d7408237777602d2
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md
@@ -0,0 +1,61 @@
+# Wan2.1-T2V-14B-Diffusers (xDiT)
+
+## Model Description
+
+Wan2.1-T2V-14B is Wan AI's large-scale text-to-video diffusion model with 14B parameters. It generates high-quality, cinematic videos from text prompts with excellent motion dynamics and visual fidelity.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install the Iluvatar CoreX adapted frameworks:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install the remaining dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify the model path:
+```bash
+vim run_wan_2.1_t2v_14b.sh
+# Update MODEL_ID to your actual model path
+# Modify TASK_ARGS if needed (see the example after step 3)
+```
+
+2. Run the script:
+```bash
+bash run_wan_2.1_t2v_14b.sh
+```
+
+3. The script supports batch size 1 or 2 (BS=1/BS=2). The prompt format for each:
+```bash
+# BS=1 (default) prompt format
+--prompt "一个虎虎生威的老虎" \
+--negative_prompt "畸形,光照不好" \
+# BS=2 prompt format
+--prompt "一个虎虎生威的老虎" "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "畸形,光照不好" "畸形,光照不好" \
+```
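+
+4. (Optional) Adjust the resolution and clip length through `TASK_ARGS`. The flags below are the ones the script already uses; the larger values are illustrative only, not a validated configuration, and must fit your GPU memory:
+```bash
+# Hypothetical override inside run_wan_2.1_t2v_14b.sh
+TASK_ARGS="--height 720 --width 1280 --num_frames 81 --seed 33"
+```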
+
+## References
+
+- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fa5b1c5f7e309593a6e11bde46979d4c4255b4b
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers==4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..133c5fd10a7c2520a5bf27710d3b5e5850616537
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -x
+export WORD_RANK_SUPPORT_TP=1
+export ATTN_OPT_LEVEL=2           # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # xdit == 0.4.4
+export TOKENIZERS_PARALLELISM=true
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Wan2.1 configuration
+SCRIPT="wan2.1_t2v_example.py"
+MODEL_ID="/data/nlp/Wan2.1-T2V-14B-Diffusers/"
+INFERENCE_STEP=20
+
+mkdir -p ./results
+
+# Wan2.1 task args
+TASK_ARGS="--height 480 --width 832 --num_frames 33 --seed 33 "
+
+# Wan2.1 parallel configuration
+N_GPUS=4
+PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 2"
+CFG_ARGS="--use_cfg_parallel"
+
+# Uncomment and modify these as needed
+# PIPEFUSION_ARGS="--num_pipeline_patch 8"
+# OUTPUT_ARGS="--output_type latent"
+# PARALLEL_VAE="--use_parallel_vae"
+# ENABLE_TILING="--enable_tiling"
+# MODEL_OFFLOAD="--enable_model_cpu_offload"
+ENABLE_CACHE="--use_teacache"
+COMPILE_FLAG="--use_torch_compile"
+# ENABLE_W8A8="--use_w8a8_linear"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$ENABLE_W8A8 \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A rainy night in a dense cyberpunk market, neon kanji signs flicker overhead. The camera starts shoulder-height behind a hooded courier, steadily tracking forward as he weaves through crowds of holographic umbrellas. Volumetric pink-blue backlight cuts through steam vents, puddles mirror the glow. Lens flare, shallow depth of field. Moody, Blade-Runner vibe." \
+--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
+$ENABLE_TILING \
+$ENABLE_CACHE \
+$COMPILE_FLAG \
+$CFG_ARGS
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..73da9b094935591b99a81d05dca9d17809f0ec97
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py
@@ -0,0 +1,131 @@
+import logging
+import time
+import torch
+import torch.distributed
+from xfuser import xFuserWanPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_runtime_state,
+)
+
+from xfuser.model_executor.cache.teacache.backend import TeaCacheBackend
+from xfuser.model_executor.cache.data import DiffusionCacheConfig
+from diffusers.utils import export_to_video
+
+
+def main():
+    parser = FlexibleArgumentParser(description="xFuser Arguments")
+    args = xFuserArgs.add_cli_args(parser).parse_args()
+    engine_args = xFuserArgs.from_cli_args(args)
+
+    engine_config, input_config = engine_args.create_config()
+    local_rank = get_world_group().local_rank
+
+    assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
+
+    pipe = xFuserWanPipeline.from_pretrained(
+        pretrained_model_name_or_path=engine_config.model_config.model,
+        engine_config=engine_config,
+        torch_dtype=torch.bfloat16,
+    )
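+    # TeaCache note: rel_l1_thresh trades quality for speed -- the larger the
+    # threshold, the more diffusion steps reuse cached transformer outputs.
+    # The 0.2 value and the polynomial coefficients below are taken from the
+    # upstream TeaCache4Wan2.1 reference linked underneath.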
+    # https://github.com/ali-vilab/TeaCache/blob/main/TeaCache4Wan2.1/teacache_generate.py#L892
+    if engine_args.use_teacache:
+        config = DiffusionCacheConfig(
+            rel_l1_thresh=0.2,
+            coefficients=[-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404],
+        )
+        backend = TeaCacheBackend(config)
+        backend.enable(pipe, transformer_key="transformer")
+        backend.refresh(pipe, input_config.num_inference_steps, transformer_key="transformer")
+
+    if args.enable_sequential_cpu_offload:
+        pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} sequential CPU offload enabled")
+    elif args.enable_model_cpu_offload:
+        pipe.enable_model_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} model CPU offload enabled")
+    else:
+        device = torch.device(f"cuda:{local_rank}")
+        pipe = pipe.to(device)
+
+    if args.enable_tiling:
+        pipe.vae.enable_tiling()
+
+    if args.enable_slicing:
+        pipe.vae.enable_slicing()
+
+    if args.use_easycache:
+        cache_kwargs = {
+            "use_easycache": True,
+            "cache_thresh": 0.02,  # EasyCache threshold
+        }
+    else:
+        cache_kwargs = None
+
+    if engine_args.use_w8a8_linear:
+        from w8a8_linear import apply_quant_linear_i8w8o16
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+
+    # warmup (optional)
+    # output = pipe(
+    #     height=input_config.height,
+    #     width=input_config.width,
+    #     num_frames=input_config.num_frames,
+    #     prompt=input_config.prompt,
+    #     num_inference_steps=1,
+    #     generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+    # ).frames
+
+    torch.cuda.reset_peak_memory_stats()
+    start_time = time.time()
+
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        num_frames=input_config.num_frames,
+        prompt=input_config.prompt,
+        negative_prompt=input_config.negative_prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+        cache_kwargs=cache_kwargs,
+    )
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_reserved(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"tp{engine_args.tensor_parallel_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    resolution = f"{input_config.width}x{input_config.height}"
+    for i, frames in enumerate(output.frames):
+        output_filename = f"results/wan2.1_t2v_14b_{i}_{parallel_info}_{resolution}.mp4"
+        export_to_video(frames, output_filename, fps=16)
+        print(f"output saved to {output_filename}")
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9:.2f} GB")
+    # get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad0ac17341e34ab57603a858950e5a6f3547b2dd
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md
@@ -0,0 +1,51 @@
+# Wan2.2-TI2V-5B-Diffusers (xDiT)
+
+## Model Description
+
+Wan2.2-TI2V-5B is Wan AI's text- and image-to-video (TI2V) diffusion model with 5B parameters. It generates smooth, high-quality videos from text prompts or input images, maintaining visual consistency and adding natural motion.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install the Iluvatar CoreX adapted frameworks:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install the remaining dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify the model path:
+```bash
+vim run_wan_2.2_t2v_5b.sh
+# Update MODEL_ID to your actual model path
+# Modify TASK_ARGS if needed
+```
+
+2. Run the script:
+```bash
+bash run_wan_2.2_t2v_5b.sh
+```
+
+## References
+
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fa5b1c5f7e309593a6e11bde46979d4c4255b4b
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers==4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9bea267c7861522207d604e3c115da59b6da9ff4
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -x
+# export ATTN_OPT_LEVEL=2 # xdit >= 0.4.5
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Wan2.2 configuration
+SCRIPT="wan2.2_t2v_example.py"
+MODEL_ID="/data/nlp/Wan2.2-TI2V-5B-Diffusers/"
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+# Wan2.2 task args
+TASK_ARGS="--height 704 --width 1280 --num_frames 33 --seed 32 "
+
+# Wan2.2 parallel configuration
+N_GPUS=4
+PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1 --tensor_parallel_degree 1"
+CFG_ARGS="--use_cfg_parallel"
+
+# Uncomment and modify these as needed
+# PIPEFUSION_ARGS="--num_pipeline_patch 8"
+# OUTPUT_ARGS="--output_type latent"
+# PARALLEL_VAE="--use_parallel_vae"
+# ENABLE_TILING="--enable_tiling"
+# MODEL_OFFLOAD="--enable_model_cpu_offload"
+# ENABLE_CACHE="--use_teacache"
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \ +$ENABLE_TILING \ +$ENABLE_CACHE \ +$COMPILE_FLAG \ +$CFG_ARGS diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py new file mode 100644 index 0000000000000000000000000000000000000000..346b9e937cba7a0c4add5b860732fab7225e2818 --- /dev/null +++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py @@ -0,0 +1,131 @@ +import logging +import time +import torch +import torch.distributed +from diffusers import AutoencoderKLTemporalDecoder +from xfuser import xFuserWanPipeline, xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + get_data_parallel_rank, + get_data_parallel_world_size, + get_runtime_state, + is_dp_last_group, + get_world_group +) +from diffusers import WanPipeline + +from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from diffusers.utils import export_to_video +from xfuser.model_executor.cache.teacache.backend import TeaCacheBackend +from xfuser.model_executor.cache.data import DiffusionCacheConfig + + +def main(): + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + + assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." + + pipe = xFuserWanPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + engine_config=engine_config, + torch_dtype=torch.bfloat16, + ) + + if engine_args.use_teacache: + config = DiffusionCacheConfig(rel_l1_thresh = 0.2, + coefficients = [ + 6.85271205e+04, + -9.88214072e+03, + 5.08858742e+02, + -7.39731467e+00, + 1.22746295e-01,]) + backend = TeaCacheBackend(config) + backend.enable(pipe,transformer_key = "transformer_2") + backend.refresh(pipe, input_config.num_inference_steps, transformer_key = "transformer_2") + + + if args.enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} sequential CPU offload enabled") + elif args.enable_model_cpu_offload: + pipe.enable_model_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} model CPU offload enabled") + else: + device = torch.device(f"cuda:{local_rank}") + pipe = pipe.to(device) + + if args.enable_tiling: + pipe.vae.enable_tiling() + + if args.enable_slicing: + pipe.vae.enable_slicing() + + if engine_args.use_w8a8_linear: + from w8a8_linear import apply_quant_linear_i8w8o16 + pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer) + + # warmup + # output = pipe( + # height=input_config.height, + # width=input_config.width, + # num_frames=input_config.num_frames, + # prompt=input_config.prompt, + # num_inference_steps=1, + # generator=torch.Generator(device="cuda").manual_seed(input_config.seed), + # ).frames + + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + + if args.use_easycache: + cache_kwargs = { + "use_easycache":True, + "cache_thresh":0.02, #easy eacch thresh + #"ret_steps":10 + } + else: + cache_kwargs = None + + output = pipe( + height=input_config.height, + width=input_config.width, + 
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        num_frames=input_config.num_frames,
+        prompt=input_config.prompt,
+        negative_prompt=input_config.negative_prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+        cache_kwargs=cache_kwargs,
+    )
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_reserved(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"tp{engine_args.tensor_parallel_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    resolution = f"{input_config.width}x{input_config.height}"
+    for i, frames in enumerate(output.frames):
+        output_filename = f"results/wan2.2_t2v_{i}_{parallel_info}_{resolution}.mp4"
+        export_to_video(frames, output_filename, fps=16)
+        print(f"output saved to {output_filename}")
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9:.2f} GB")
+    get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
index bf11ca80b8e0699868b7a002d488cfb1b50f5938..be0e86e64ef652cd7c502375791ce5181fe4c286 100644
--- a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
@@ -9,6 +9,7 @@ Qwen2.5-VL is not only proficient in recognizing common objects such as flowers,
 | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
 | :----: | :----: | :----: |
 | MR-V100 | 4.3.0 | 25.09 |
+| MR-V100 | 4.4.0 | 26.03 |
 
 ## Model Preparation
 
@@ -32,6 +33,49 @@ export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
 python3 offline_inference_vision_language.py --model /path/to/Qwen2.5-VL-3B-Instruct/ -tp 4 --trust-remote-code --temperature 0.0 --max-token 256
 ```
+
+### Qwen2.5-VL-32B-Instruct (W8A8/W4A16)
+
+#### Performance Test
+
+1. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+2. Start server:
+```bash
+vllm serve /path/to/model --max-num-seqs 1 --max-model-len 98304 --limit_mm_per_prompt '{"image": 5}' --disable-cascade-attn --tensor-parallel-size 4 --gpu_memory_utilization 0.9 --pipeline-parallel-size 1 --host 0.0.0.0 --port 8000 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+3. Run client:
+```bash
+# Use the pre-copied guidellm
+cd guidellm && pip install .
+pip install beautifulsoup4
+cd ..
+guidellm --data "prompt_tokens=512,generated_tokens=512,images=1,width=1770,height=1180" --data-type emulated --model /path/to/model --target "http://localhost:8000/v1" --max-requests 1
+```
+
+### Qwen2.5-VL-72B-Instruct (W4A16)
+
+#### Performance Test
+
+1. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+2.
Start server: +```bash +vllm serve /path/to/model --max-num-seqs 1 --max-model-len 98304 --limit_mm_per_prompt '{"image": 5}' --disable-cascade-attn --tensor-parallel-size 8 --gpu_memory_utilization 0.9 --pipeline-parallel-size 1 --host 0.0.0.0 --port 8000 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' +``` + +3. Run client: +```bash +# Same as 32B version +guidellm --data "prompt_tokens=512,generated_tokens=512,images=1,width=1770,height=1180" --data-type emulated --model /path/to/model --target "http://localhost:8000/v1" --max-requests 1 +``` + ## Model Results ### Benchmarking vLLM diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..6ab2c6e9940c580355ebf34c530ffa4fb6b5ce83 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml @@ -0,0 +1,212 @@ +[build-system] +requires = ["setuptools >= 61.0", "wheel", "build"] +build-backend = "setuptools.build_meta" + + +[tool.setuptools.packages.find] +where = ["src"] +include = ["*"] + +[tool.setuptools.package-data] +guidellm = ["*"] + + +# ************************************************ +# ********** Project Metadata ********** +# ************************************************ + +[project] +name = "guidellm" +version = "0.1.0" +description = "Guidance platform for deploying and managing large language models." +readme = { file = "README.md", content-type = "text/markdown" } +requires-python = ">=3.8.0,<4.0" +license = { file = "LICENSE" } +authors = [ { name = "Neuralmagic, Inc." } ] +urls = { homepage = "https://github.com/neuralmagic/guidellm" } +dependencies = [ + "click", + "datasets", + "ftfy>=6.0.0", + "loguru", + "numpy", + "openai", + "pydantic>=2.0.0", + "pydantic-settings>=2.0.0", + "pyyaml>=6.0.0", + "requests", + "rich", + "transformers", +] + +[project.optional-dependencies] +dev = [ + # general and configurations + "pre-commit~=3.5.0", + "scipy~=1.10", + "sphinx~=7.1.2", + "tox~=4.16.0", + + # testing + "pytest~=8.2.2", + "pytest-asyncio~=0.23.8", + "pytest-cov~=5.0.0", + "pytest-mock~=3.14.0", + "pytest-rerunfailures~=14.0", + "requests-mock~=1.12.1", + + # code quality + "mypy~=1.10.1", + "ruff~=0.5.2", + + # docs quality + "mdformat~=0.7.17", + "mdformat-footnote~=0.1.1", + "mdformat-frontmatter~=2.0.8", + "mdformat-gfm~=0.3.6", + + # type-checking + "types-click~=7.1.8", + "types-PyYAML~=6.0.1", + "types-requests~=2.32.0", + "types-toml", +] + + +[project.entry-points.console_scripts] +guidellm = "guidellm.main:generate_benchmark_report_cli" +guidellm-config = "guidellm.config:print_config" + + +# ************************************************ +# ********** Code Quality Tools ********** +# ************************************************ + +[tool.black] +line-length = 88 +target-version = ['py38'] + + +[tool.isort] +profile = "black" + + +[tool.mypy] +files = ["src/guidellm", "tests"] +python_version = '3.8' +warn_redundant_casts = true +warn_unused_ignores = false +show_error_codes = true +namespace_packages = true +exclude = ["venv", ".tox"] + +# Silence "type import errors" as our 3rd-party libs does not have types +# Check: https://mypy.readthedocs.io/en/latest/config_file.html#import-discovery +follow_imports = 'silent' + +[[tool.mypy.overrides]] +module = ["datasets.*"] +ignore_missing_imports=true + + +[tool.ruff] +line-length = 88 +indent-width = 4 +exclude 
= ["build", "dist", "env", ".venv"] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" + +[tool.ruff.lint] +ignore = [ + "PLR0913", + "TCH001", + "COM812", + "ISC001", + "TCH002", + "PLW1514", # allow Path.open without encoding + "RET505", # allow `else` blocks + "RET506" # allow `else` blocks + +] +select = [ + # Rules reference: https://docs.astral.sh/ruff/rules/ + + # Code Style / Formatting + "E", # pycodestyle: checks adherence to PEP 8 conventions including spacing, indentation, and line length + "W", # pycodestyle: checks adherence to PEP 8 conventions including spacing, indentation, and line length + "A", # flake8-builtins: prevents shadowing of Python built-in names + "C", # Convention: ensures code adheres to specific style and formatting conventions + "COM", # flake8-commas: enforces the correct use of trailing commas + "ERA", # eradicate: detects commented-out code that should be removed + "I", # isort: ensures imports are sorted in a consistent manner + "ICN", # flake8-import-conventions: enforces import conventions for better readability + "N", # pep8-naming: enforces PEP 8 naming conventions for classes, functions, and variables + "NPY", # NumPy: enforces best practices for using the NumPy library + "PD", # pandas-vet: enforces best practices for using the pandas library + "PT", # flake8-pytest-style: enforces best practices and style conventions for pytest tests + "PTH", # flake8-use-pathlib: encourages the use of pathlib over os.path for file system operations + "Q", # flake8-quotes: enforces consistent use of single or double quotes + "TCH", # flake8-type-checking: enforces type checking practices and standards + "TID", # flake8-tidy-imports: enforces tidy and well-organized imports + "RUF022", # flake8-ruff: enforce sorting of __all__ in modules + + # Code Structure / Complexity + "C4", # flake8-comprehensions: improves readability and performance of list, set, and dict comprehensions + "C90", # mccabe: checks for overly complex code using cyclomatic complexity + "ISC", # flake8-implicit-str-concat: prevents implicit string concatenation + "PIE", # flake8-pie: identifies and corrects common code inefficiencies and mistakes + "R", # Refactor: suggests improvements to code structure and readability + "SIM", # flake8-simplify: simplifies complex expressions and improves code readability + + # Code Security / Bug Prevention + "ARG", # flake8-unused-arguments: detects unused function and method arguments + "ASYNC", # flake8-async: identifies incorrect or inefficient usage patterns in asynchronous code + "B", # flake8-bugbear: detects common programming mistakes and potential bugs + "BLE", # flake8-blind-except: prevents blind exceptions that catch all exceptions without handling + "E", # Error: detects and reports errors in the code + "F", # Pyflakes: detects unused imports, shadowed imports, undefined variables, and various formatting errors in string operations + "INP", # flake8-no-pep420: prevents implicit namespace packages by requiring __init__.py + "PGH", # pygrep-hooks: detects deprecated and dangerous code patterns + "PL", # Pylint: comprehensive source code analyzer for enforcing coding standards and detecting errors + "RSE", # flake8-raise: ensures exceptions are raised correctly + "S", # flake8-bandit: detects security issues and vulnerabilities in the code + "SLF", # flake8-self: prevents incorrect usage of the self argument in class methods + "T10", # flake8-debugger: detects the presence of debugging tools such as pdb + "T20", # flake8-print: 
detects print statements left in the code + "UP", # pyupgrade: automatically upgrades syntax for newer versions of Python + "W", # Warning: provides warnings about potential issues in the code + "YTT", # flake8-2020: identifies code that will break with future Python releases + + # Code Documentation + "FIX", # flake8-fixme: detects FIXMEs and other temporary comments that should be resolved +] + +[tool.ruff.lint.extend-per-file-ignores] +"tests/**/*.py" = [ + "S101", # asserts allowed in tests + "ARG", # Unused function args allowed in tests + "PLR2004", # Magic value used in comparison + "TCH002", # No import only type checking in tests + "SLF001", # enable private member access in tests + "S105", # allow hardcoded passwords in tests + "S311", # allow standard pseudo-random generators in tests + "PT011", # allow generic exceptions in tests + "N806", # allow uppercase variable names in tests + "PGH003", # allow general ignores in tests + "S106", # allow hardcoded passwords in tests + "PLR0915", # allow complext statements in tests +] + +[tool.ruff.lint.isort] +known-first-party = ["guidellm", "tests"] + + +[tool.pytest.ini_options] +addopts = '-s -vvv --cache-clear' +markers = [ + "smoke: quick tests to check basic functionality", + "sanity: detailed tests to ensure major functions work correctly", + "regression: tests to ensure that new changes do not break existing functionality" +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b10b4455ae29b9476829955b81bdfef07f515b25 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py @@ -0,0 +1,20 @@ +""" +Guidellm is a package that provides an easy and intuitive interface for +evaluating and benchmarking large language models (LLMs). 
+""" + +# flake8: noqa + +import os + +import transformers # type: ignore + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # Silence warnings for tokenizers +transformers.logging.set_verbosity_error() # Silence warnings for transformers + + +from .config import settings +from .logger import configure_logger, logger +from .main import generate_benchmark_report + +__all__ = ["configure_logger", "logger", "settings", "generate_benchmark_report"] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..13910180a77e3958a18da428932bce45aeff538a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py @@ -0,0 +1,12 @@ +from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse +from .openai import OpenAIBackend +from .aiohttp import AiohttpBackend + +__all__ = [ + "Backend", + "BackendEngine", + "BackendEnginePublic", + "GenerativeResponse", + "OpenAIBackend", + "AiohttpBackend" +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py new file mode 100644 index 0000000000000000000000000000000000000000..fbbd97158fab0f547a812534eb06152e83366328 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py @@ -0,0 +1,180 @@ +import base64 +import io +from typing import AsyncGenerator, Dict, List, Optional +from loguru import logger + +import aiohttp +import json + +from guidellm.backend.base import Backend, GenerativeResponse +from guidellm.config import settings +from guidellm.core import TextGenerationRequest + +__all__ = ["AiohttpBackend"] + +@Backend.register("aiohttp_server") +class AiohttpBackend(Backend): + """ + An aiohttp-based backend implementation for LLM requests. + + This class provides an interface to communicate with a server hosting + an LLM API using aiohttp for asynchronous requests. + """ + + def __init__( + self, + openai_api_key: Optional[str] = None, + target: Optional[str] = None, + model: Optional[str] = None, + timeout: Optional[float] = None, + **request_args, + ): + self._request_args: Dict = request_args + self._api_key: str = openai_api_key or settings.aiohttp.api_key + + if not self._api_key: + err = ValueError( + "`GUIDELLM__AIOHTTP__API_KEY` environment variable or " + "--openai-api-key CLI parameter must be specified for the " + "aiohttp backend." + ) + logger.error("{}", err) + raise err + + base_url = target or settings.aiohttp.base_url + self._api_url = f"{base_url}/chat/completions" + + if not base_url: + err = ValueError( + "`GUIDELLM__AIOHTTP__BASE_URL` environment variable or " + "target parameter must be specified for the OpenAI backend." + ) + logger.error("{}", err) + raise err + + self._timeout = aiohttp.ClientTimeout(total=timeout or settings.request_timeout) + self._model = model + + super().__init__(type_="aiohttp_backend", target=base_url, model=self._model) + logger.info("aiohttp {} Backend listening on {}", self._model, base_url) + + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Make a request to the aiohttp backend. 
+ + Sends a prompt to the LLM server and streams the response tokens. + + :param request: The text generation request to submit. + :type request: TextGenerationRequest + :yield: A stream of GenerativeResponse objects. + :rtype: AsyncGenerator[GenerativeResponse, None] + """ + + async with aiohttp.ClientSession(timeout=self._timeout) as session: + logger.debug("Making request to aiohttp backend with prompt: {}", request.prompt) + + request_args = {} + if request.output_token_count is not None: + request_args.update( + { + "max_completion_tokens": request.output_token_count, + "stop": None, + "ignore_eos": True, + } + ) + elif settings.aiohttp.max_gen_tokens and settings.aiohttp.max_gen_tokens > 0: + request_args.update( + { + "max_tokens": settings.aiohttp.max_gen_tokens, + } + ) + + request_args.update(self._request_args) + + messages = self._build_messages(request) + + payload = { + "model": self._model, + "messages": messages, + "stream": True, + **request_args, + } + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self._api_key}", + } + + try: + async with session.post(url=self._api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_message = await response.text() + logger.error("Request failed: {} - {}", response.status, error_message) + raise Exception(f"Failed to generate response: {error_message}") + + token_count = 0 + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk == "[DONE]": + # Final response + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + else: + # Intermediate token response + token_count += 1 + data = json.loads(chunk) + delta = data["choices"][0]["delta"] + token = delta["content"] + yield GenerativeResponse( + type_="token_iter", + add_token=token, + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + except Exception as e: + logger.error("Error while making request: {}", e) + raise + + def available_models(self) -> List[str]: + """ + Retrieve a list of available models from the server. + """ + # This could include an API call to `self._api_url/models` if the server supports it. + logger.warning("Fetching available models is not implemented for aiohttp backend.") + return [] + + def validate_connection(self): + """ + Validate the connection to the backend server. 
+ """ + logger.info("Connection validation is not explicitly implemented for aiohttp backend.") + + def _build_messages(self, request: TextGenerationRequest) -> Dict: + if request.number_images == 0: + messages = [{"role": "user", "content": request.prompt}] + else: + content = [] + for image in request.images: + stream = io.BytesIO() + im_format = image.image.format or "PNG" + image.image.save(stream, format=im_format) + im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8") + image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} + content.append({"type": "image_url", "image_url": image_url}) + + content.append({"type": "text", "text": request.prompt}) + messages = [{"role": "user", "content": content}] + + return messages diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py new file mode 100644 index 0000000000000000000000000000000000000000..a165859454ac462a8dfedade0240a7e118acf50a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py @@ -0,0 +1,320 @@ +import asyncio +import functools +from abc import ABC, abstractmethod +from typing import AsyncGenerator, Dict, List, Literal, Optional, Type, Union + +from loguru import logger +from pydantic import BaseModel +from transformers import ( # type: ignore # noqa: PGH003 + AutoTokenizer, + PreTrainedTokenizer, +) + +from guidellm.core import TextGenerationRequest, TextGenerationResult + +__all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"] + + +BackendEnginePublic = Literal["openai_server", "aiohttp_server"] +BackendEngine = Union[BackendEnginePublic, Literal["test"]] + + +class GenerativeResponse(BaseModel): + """ + A model representing a response from a generative AI backend. + + :param type_: The type of response, either 'token_iter' for intermediate + token output or 'final' for the final result. + :type type_: Literal["token_iter", "final"] + :param add_token: The token to add to the output + (only applicable if type_ is 'token_iter'). + :type add_token: Optional[str] + :param prompt: The original prompt sent to the backend. + :type prompt: Optional[str] + :param output: The final generated output (only applicable if type_ is 'final'). + :type output: Optional[str] + :param prompt_token_count: The number of tokens in the prompt. + :type prompt_token_count: Optional[int] + :param output_token_count: The number of tokens in the output. + :type output_token_count: Optional[int] + """ + + type_: Literal["token_iter", "final"] + add_token: Optional[str] = None + prompt: Optional[str] = None + output: Optional[str] = None + prompt_token_count: Optional[int] = None + output_token_count: Optional[int] = None + + +class Backend(ABC): + """ + Abstract base class for generative AI backends. + + This class provides a common interface for creating and interacting with different + generative AI backends. Subclasses should implement the abstract methods to + define specific backend behavior. + + :cvar _registry: A dictionary that maps BackendEngine types to backend classes. + :type _registry: Dict[BackendEngine, Type[Backend]] + :param type_: The type of the backend. + :type type_: BackendEngine + :param target: The target URL for the backend. + :type target: str + :param model: The model used by the backend. 
+ :type model: str + """ + + _registry: Dict[BackendEngine, "Type[Backend]"] = {} + + @classmethod + def register(cls, backend_type: BackendEngine): + """ + A decorator to register a backend class in the backend registry. + + :param backend_type: The type of backend to register. + :type backend_type: BackendEngine + :return: The decorated backend class. + :rtype: Type[Backend] + """ + + def inner_wrapper(wrapped_class: Type["Backend"]): + cls._registry[backend_type] = wrapped_class + logger.info("Registered backend type: {}", backend_type) + return wrapped_class + + return inner_wrapper + + @classmethod + def create(cls, backend_type: BackendEngine, **kwargs) -> "Backend": + """ + Factory method to create a backend instance based on the backend type. + + :param backend_type: The type of backend to create. + :type backend_type: BackendEngine + :param kwargs: Additional arguments for backend initialization. + :return: An instance of a subclass of Backend. + :rtype: Backend + :raises ValueError: If the backend type is not registered. + """ + + logger.info("Creating backend of type {}", backend_type) + + if backend_type not in cls._registry: + err = ValueError(f"Unsupported backend type: {backend_type}") + logger.error("{}", err) + raise err + + return Backend._registry[backend_type](**kwargs) + + def __init__(self, type_: BackendEngine, target: str, model: str): + """ + Base constructor for the Backend class. + Calls into test_connection to ensure the backend is reachable. + Ensure all setup is done in the subclass constructor before calling super. + + :param type_: The type of the backend. + :param target: The target URL for the backend. + :param model: The model used by the backend. + """ + self._type = type_ + self._target = target + self._model = model + + self.test_connection() + + @property + def default_model(self) -> str: + """ + Get the default model for the backend. + + :return: The default model. + :rtype: str + :raises ValueError: If no models are available. + """ + return _cachable_default_model(self) + + @property + def type_(self) -> BackendEngine: + """ + Get the type of the backend. + + :return: The type of the backend. + :rtype: BackendEngine + """ + return self._type + + @property + def target(self) -> str: + """ + Get the target URL for the backend. + + :return: The target URL. + :rtype: str + """ + return self._target + + @property + def model(self) -> str: + """ + Get the model used by the backend. + + :return: The model name. + :rtype: str + """ + return self._model + + def model_tokenizer(self) -> PreTrainedTokenizer: + """ + Get the tokenizer for the backend model. + + :return: The tokenizer instance. + """ + return AutoTokenizer.from_pretrained(self.model) + + def test_connection(self) -> bool: + """ + Test the connection to the backend by running a short text generation request. + If successful, returns True, otherwise raises an exception. + + :return: True if the connection is successful. + :rtype: bool + :raises ValueError: If the connection test fails. 
+ """ + try: + asyncio.get_running_loop() + is_async = True + except RuntimeError: + is_async = False + + if is_async: + logger.warning("Running in async mode, cannot test connection") + return True + + try: + request = TextGenerationRequest( + prompt="Test connection", output_token_count=5 + ) + + asyncio.run(self.submit(request)) + return True + except Exception as err: + raise_err = RuntimeError( + f"Backend connection test failed for backend type={self.type_} " + f"with target={self.target} and model={self.model} with error: {err}" + ) + logger.error(raise_err) + raise raise_err from err + + async def submit(self, request: TextGenerationRequest) -> TextGenerationResult: + """ + Submit a text generation request and return the result. + + This method handles the request submission to the backend and processes + the response in a streaming fashion if applicable. + + :param request: The request object containing the prompt + and other configurations. + :type request: TextGenerationRequest + :return: The result of the text generation request. + :rtype: TextGenerationResult + :raises ValueError: If no response is received from the backend. + """ + + logger.debug("Submitting request with prompt: {}", request.prompt) + + result = TextGenerationResult(request=request) + result.start(request.prompt) + received_final = False + + async for response in self.make_request(request): + logger.debug("Received response: {}", response) + if response.type_ == "token_iter": + result.output_token(response.add_token if response.add_token else "") + elif response.type_ == "final": + if received_final: + err = ValueError( + "Received multiple final responses from the backend." + ) + logger.error(err) + raise err + + result.end( + output=response.output, + prompt_token_count=response.prompt_token_count, + output_token_count=response.output_token_count, + ) + received_final = True + else: + err = ValueError( + f"Invalid response received from the backend of type: " + f"{response.type_} for {response}" + ) + logger.error(err) + raise err + + if not received_final: + err = ValueError("No final response received from the backend.") + logger.error(err) + raise err + + logger.info("Request completed with output: {}", result.output) + + return result + + @abstractmethod + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Abstract method to make a request to the backend. + + Subclasses must implement this method to define how requests are handled + by the backend. + + :param request: The request object containing the prompt and + other configurations. + :type request: TextGenerationRequest + :yield: A generator yielding responses from the backend. + :rtype: AsyncGenerator[GenerativeResponse, None] + """ + yield None # type: ignore # noqa: PGH003 + + @abstractmethod + def available_models(self) -> List[str]: + """ + Abstract method to get the available models for the backend. + + Subclasses must implement this method to provide the list of models + supported by the backend. + + :return: A list of available models. + :rtype: List[str] + :raises NotImplementedError: If the method is not implemented by a subclass. + """ + raise NotImplementedError + + +@functools.lru_cache(maxsize=1) +def _cachable_default_model(backend: Backend) -> str: + """ + Get the default model for a backend using LRU caching. + + This function caches the default model to optimize repeated lookups. + + :param backend: The backend instance for which to get the default model. 
+ :type backend: Backend + :return: The default model. + :rtype: str + :raises ValueError: If no models are available. + """ + logger.debug("Getting default model for backend: {}", backend) + models = backend.available_models() + if models: + logger.debug("Default model: {}", models[0]) + return models[0] + + err = ValueError("No models available.") + logger.error(err) + raise err diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py new file mode 100644 index 0000000000000000000000000000000000000000..9843fc1a06ac7fbecd2198b7317af8545a07c81a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py @@ -0,0 +1,192 @@ +import base64 +import io +from typing import AsyncGenerator, Dict, List, Optional + +from loguru import logger +from openai import AsyncOpenAI, OpenAI + +from guidellm.backend.base import Backend, GenerativeResponse +from guidellm.config import settings +from guidellm.core import TextGenerationRequest + +__all__ = ["OpenAIBackend"] + + +@Backend.register("openai_server") +class OpenAIBackend(Backend): + """ + An OpenAI backend implementation for generative AI results. + + This class provides an interface to communicate with the + OpenAI server for generating responses based on given prompts. + + :param openai_api_key: The API key for OpenAI. + If not provided, it will default to the key from settings. + :type openai_api_key: Optional[str] + :param target: The target URL string for the OpenAI server. + :type target: Optional[str] + :param model: The OpenAI model to use, defaults to the first available model. + :type model: Optional[str] + :param request_args: Additional arguments for the OpenAI request. + :type request_args: Dict[str, Any] + """ + + def __init__( + self, + openai_api_key: Optional[str] = None, + target: Optional[str] = None, + model: Optional[str] = None, + **request_args, + ): + self._request_args: Dict = request_args + api_key: str = openai_api_key or settings.openai.api_key + + if not api_key: + err = ValueError( + "`GUIDELLM__OPENAI__API_KEY` environment variable or " + "--openai-api-key CLI parameter must be specified for the " + "OpenAI backend." + ) + logger.error("{}", err) + raise err + + base_url = target or settings.openai.base_url + + if not base_url: + err = ValueError( + "`GUIDELLM__OPENAI__BASE_URL` environment variable or " + "target parameter must be specified for the OpenAI backend." + ) + logger.error("{}", err) + raise err + + self._async_client = AsyncOpenAI(api_key=api_key, base_url=base_url) + self._client = OpenAI(api_key=api_key, base_url=base_url) + self._model = model or self.default_model + + super().__init__(type_="openai_server", target=base_url, model=self._model) + logger.info("OpenAI {} Backend listening on {}", self._model, base_url) + + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Make a request to the OpenAI backend. + + This method sends a prompt to the OpenAI backend and streams + the response tokens back. + + :param request: The text generation request to submit. + :type request: TextGenerationRequest + :yield: A stream of GenerativeResponse objects. 
+ :rtype: AsyncGenerator[GenerativeResponse, None] + """ + + logger.debug("Making request to OpenAI backend with prompt: {}", request.prompt) + + request_args: Dict = { + "n": 1, # Number of completions for each prompt + } + + if request.output_token_count is not None: + request_args.update( + { + "max_tokens": request.output_token_count, + "stop": None, + "extra_body": { + "ignore_eos": True, + } + } + ) + elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0: + request_args.update( + { + "max_tokens": settings.openai.max_gen_tokens, + } + ) + + request_args.update(self._request_args) + + messages = self._build_messages(request) + + stream = await self._async_client.chat.completions.create( + model=self.model, + messages=messages, + stream=True, + **request_args, + ) + + token_count = 0 + async for chunk in stream: + choice = chunk.choices[0] + token = choice.delta.content or "" + + if choice.finish_reason is not None: + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + break + + token_count += 1 + yield GenerativeResponse( + type_="token_iter", + add_token=token, + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + + def available_models(self) -> List[str]: + """ + Get the available models for the backend. + + This method queries the OpenAI API to retrieve a list of available models. + + :return: A list of available models. + :rtype: List[str] + :raises openai.OpenAIError: If an error occurs while retrieving models. + """ + + try: + return [model.id for model in self._client.models.list().data] + except Exception as error: + logger.error("Failed to retrieve available models: {}", error) + raise error + + def validate_connection(self): + """ + Validate the connection to the OpenAI backend. + + This method checks that the OpenAI backend is reachable and + the API key is valid. + + :raises openai.OpenAIError: If the connection is invalid. 
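+
+        A minimal usage sketch (illustrative; assumes an OpenAI-compatible
+        server is reachable at the configured base URL and that the key is
+        accepted by it)::
+
+            backend = OpenAIBackend(
+                openai_api_key="dummy-key",
+                target="http://localhost:8000/v1",
+            )
+            backend.validate_connection()  # raises if the server is unreachable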
+        """
+
+        try:
+            self._client.models.list()
+        except Exception as error:
+            logger.error("Failed to validate OpenAI connection: {}", error)
+            raise error
+
+    def _build_messages(self, request: TextGenerationRequest) -> List[Dict]:
+        if request.number_images == 0:
+            messages = [{"role": "user", "content": request.prompt}]
+        else:
+            content = []
+            for image in request.images:
+                stream = io.BytesIO()
+                im_format = image.image.format or "PNG"
+                image.image.save(stream, format=im_format)
+                im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8")
+                image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"}
+                content.append({"type": "image_url", "image_url": image_url})
+
+            content.append({"type": "text", "text": request.prompt})
+            messages = [{"role": "user", "content": content}]
+
+        return messages
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81f67a6666490dd48a1eddba4dad6c90e2bf08a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py
@@ -0,0 +1,239 @@
+import json
+from enum import Enum
+from typing import Dict, List, Optional, Sequence
+
+from pydantic import BaseModel, Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+__all__ = [
+    "DatasetSettings",
+    "EmulatedDataSettings",
+    "Environment",
+    "LoggingSettings",
+    "OpenAISettings",
+    "print_config",
+    "ReportGenerationSettings",
+    "Settings",
+    "reload_settings",
+    "settings",
+]
+
+
+class Environment(str, Enum):
+    """
+    Enum for the supported environments
+    """
+
+    LOCAL = "local"
+    DEV = "dev"
+    STAGING = "staging"
+    PROD = "prod"
+
+
+ENV_REPORT_MAPPING = {
+    Environment.PROD: "https://guidellm.neuralmagic.com/local-report/index.html",
+    Environment.STAGING: "https://staging.guidellm.neuralmagic.com/local-report/index.html",
+    Environment.DEV: "https://dev.guidellm.neuralmagic.com/local-report/index.html",
+    Environment.LOCAL: "tests/dummy/report.html",
+}
+
+
+class LoggingSettings(BaseModel):
+    """
+    Logging settings for the application
+    """
+
+    disabled: bool = False
+    clear_loggers: bool = True
+    console_log_level: str = "WARNING"
+    log_file: Optional[str] = None
+    log_file_level: Optional[str] = None
+
+
+class DatasetSettings(BaseModel):
+    """
+    Dataset settings for the application
+    """
+
+    preferred_data_columns: List[str] = Field(
+        default_factory=lambda: [
+            "prompt",
+            "instruction",
+            "input",
+            "inputs",
+            "question",
+            "context",
+            "text",
+            "content",
+            "body",
+            "data",
+        ]
+    )
+    preferred_data_splits: List[str] = Field(
+        default_factory=lambda: ["test", "tst", "validation", "val", "train"]
+    )
+
+
+class EmulatedDataSettings(BaseModel):
+    """
+    Emulated data settings for the application
+    """
+
+    source: str = "http://localhost:666/aimages/1342-0.txt"
+    filter_start: str = "It is a truth universally acknowledged, that a"
+    filter_end: str = "CHISWICK PRESS:--CHARLES WHITTINGHAM AND CO."
+    clean_text_args: Dict[str, bool] = Field(
+        default_factory=lambda: {
+            "fix_encoding": True,
+            "clean_whitespace": True,
+            "remove_empty_lines": True,
+            "force_new_line_punctuation": True,
+        }
+    )
+    image_source: List[str] = Field(
+        default_factory=lambda: ["http://localhost:666/aimages/pg1-images.html"]
+    )
+
+
+class OpenAISettings(BaseModel):
+    """
+    OpenAI settings for the application to connect to the API
+    for OpenAI server-based pathways
+    """
+
+    # OpenAI API key.
+    api_key: str = "invalid_token"
+
+    # OpenAI-compatible server URL
+    # NOTE: The default value is the default address of the llama.cpp web server
+    base_url: str = "http://localhost:8000/v1"
+
+    max_gen_tokens: int = 4096
+
+
+class AiohttpSettings(OpenAISettings):
+    pass
+
+
+class ReportGenerationSettings(BaseModel):
+    """
+    Report generation settings for the application
+    """
+
+    source: str = ""
+    report_html_match: str = "window.report_data = {};"
+    report_html_placeholder: str = "{}"
+
+
+class Settings(BaseSettings):
+    """
+    All the settings are powered by pydantic_settings and can be
+    populated from the .env file.
+
+    The format to populate the settings is as follows:
+
+    ```sh
+    export GUIDELLM__LOGGING__DISABLED=true
+    export GUIDELLM__OPENAI__API_KEY=******
+    ```
+    """
+
+    model_config = SettingsConfigDict(
+        env_prefix="GUIDELLM__",
+        env_nested_delimiter="__",
+        extra="ignore",
+        validate_default=True,
+        env_file=".env",
+    )
+
+    # general settings
+    env: Environment = Environment.PROD
+    request_timeout: int = 30
+    max_concurrency: int = 512
+    num_sweep_profiles: int = 9
+    logging: LoggingSettings = LoggingSettings()
+
+    # Data settings
+    dataset: DatasetSettings = DatasetSettings()
+    emulated_data: EmulatedDataSettings = EmulatedDataSettings()
+
+    # Request settings
+    openai: OpenAISettings = OpenAISettings()
+    aiohttp: AiohttpSettings = AiohttpSettings()
+
+    # Report settings
+    report_generation: ReportGenerationSettings = ReportGenerationSettings()
+
+    @model_validator(mode="after")
+    @classmethod
+    def set_default_source(cls, values):
+        if not values.report_generation.source:
+            values.report_generation.source = ENV_REPORT_MAPPING.get(values.env)
+
+        return values
+
+    def generate_env_file(self) -> str:
+        """
+        Generate the .env file from the current settings
+        """
+        return Settings._recursive_generate_env(
+            self,
+            self.model_config["env_prefix"],  # type: ignore  # noqa: PGH003
+            self.model_config["env_nested_delimiter"],  # type: ignore  # noqa: PGH003
+        )
+
+    @staticmethod
+    def _recursive_generate_env(model: BaseModel, prefix: str, delimiter: str) -> str:
+        env_file = ""
+        add_models = []
+        for key, value in model.model_dump().items():
+            if isinstance(value, BaseModel):
+                # add nested properties to be processed after the current level
+                add_models.append((key, value))
+                continue
+
+            dict_values = (
+                {
+                    f"{prefix}{key.upper()}{delimiter}{sub_key.upper()}": sub_value
+                    for sub_key, sub_value in value.items()
+                }
+                if isinstance(value, dict)
+                else {f"{prefix}{key.upper()}": value}
+            )
+
+            for tag, sub_value in dict_values.items():
+                if isinstance(sub_value, Sequence) and not isinstance(sub_value, str):
+                    value_str = ",".join(f'"{item}"' for item in sub_value)
+                    env_file += f"{tag}=[{value_str}]\n"
+                elif isinstance(sub_value, Dict):
+                    value_str = json.dumps(sub_value)
+                    env_file += f"{tag}={value_str}\n"
+                elif not sub_value:
+                    env_file += f"{tag}=\n"
+                else:
+                    env_file += f'{tag}="{sub_value}"\n'
+
+        for key, value in add_models:
+            env_file += Settings._recursive_generate_env(
+                value, f"{prefix}{key.upper()}{delimiter}", delimiter
+            )
+        return 
env_file + + +settings = Settings() + + +def reload_settings(): + """ + Reload the settings from the environment variables + """ + new_settings = Settings() + settings.__dict__.update(new_settings.__dict__) + + +def print_config(): + """ + Print the current configuration settings + """ + print(f"Settings: \n{settings.generate_env_file()}") # noqa: T201 + + +if __name__ == "__main__": + print_config() diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e738aa769737a158dccb697a36d61697488d6b55 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py @@ -0,0 +1,24 @@ +from .distribution import Distribution +from .report import GuidanceReport +from .request import TextGenerationRequest +from .result import ( + RequestConcurrencyMeasurement, + TextGenerationBenchmark, + TextGenerationBenchmarkReport, + TextGenerationError, + TextGenerationResult, +) +from .serializable import Serializable, SerializableFileType + +__all__ = [ + "Distribution", + "GuidanceReport", + "RequestConcurrencyMeasurement", + "Serializable", + "SerializableFileType", + "TextGenerationBenchmark", + "TextGenerationBenchmarkReport", + "TextGenerationError", + "TextGenerationRequest", + "TextGenerationResult", +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py new file mode 100644 index 0000000000000000000000000000000000000000..3f770528c3bcc1a1ba0049797ee59067f816d9a0 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py @@ -0,0 +1,190 @@ +from typing import List, Sequence + +import numpy as np +from loguru import logger +from pydantic import Field + +from guidellm.core.serializable import Serializable + +__all__ = ["Distribution"] + + +class Distribution(Serializable): + """ + A class to represent a statistical distribution and perform various + statistical analyses. + """ + + data: Sequence[float] = Field( + default_factory=list, + description="The data points of the distribution.", + ) + + def __str__(self): + return f"Distribution({self.describe()})" + + def __len__(self): + return len(self.data) + + @property + def mean(self) -> float: + """ + Calculate and return the mean of the distribution. + :return: The mean of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate mean.") + return 0.0 + + mean_value = np.mean(self.data).item() + logger.debug(f"Calculated mean: {mean_value}") + return mean_value + + @property + def median(self) -> float: + """ + Calculate and return the median of the distribution. + :return: The median of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate median.") + return 0.0 + + median_value = np.median(self.data).item() + logger.debug(f"Calculated median: {median_value}") + return median_value + + @property + def variance(self) -> float: + """ + Calculate and return the variance of the distribution. + :return: The variance of the distribution. 
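+
+        Example (illustrative; np.var computes the population variance)::
+
+            Distribution(data=[2.0, 4.0, 6.0]).variance  # -> 8/3 ~= 2.67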
+ """ + if not self.data: + logger.warning("No data points available to calculate variance.") + return 0.0 + + variance_value = np.var(self.data).item() + logger.debug(f"Calculated variance: {variance_value}") + return variance_value + + @property + def std_deviation(self) -> float: + """ + Calculate and return the standard deviation of the distribution. + :return: The standard deviation of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate standard deviation.") + return 0.0 + + std_deviation_value = np.std(self.data).item() + logger.debug(f"Calculated standard deviation: {std_deviation_value}") + return std_deviation_value + + def percentile(self, percentile: float) -> float: + """ + Calculate and return the specified percentile of the distribution. + :param percentile: The desired percentile to calculate (0-100). + :return: The specified percentile of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate percentile.") + return 0.0 + + percentile_value = np.percentile(self.data, percentile).item() + logger.debug(f"Calculated {percentile}th percentile: {percentile_value}") + return percentile_value + + def percentiles(self, percentiles: List[float]) -> List[float]: + """ + Calculate and return the specified percentiles of the distribution. + :param percentiles: A list of desired percentiles to calculate (0-100). + :return: A list of the specified percentiles of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate percentiles.") + return [0.0] * len(percentiles) + + percentiles_values: List[float] = np.percentile(self.data, percentiles).tolist() # type: ignore # noqa: PGH003 + logger.debug(f"Calculated percentiles {percentiles}: {percentiles_values}") + return percentiles_values + + @property + def min(self) -> float: + """ + Return the minimum value of the distribution. + :return: The minimum value of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate minimum.") + return 0.0 + + min_value: float = np.min(self.data).item() # type: ignore # noqa: PGH003 + logger.debug(f"Calculated min: {min_value}") + return min_value + + @property + def max(self) -> float: + """ + Return the maximum value of the distribution. + :return: The maximum value of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate maximum.") + return 0.0 + + max_value: float = np.max(self.data).item() # type: ignore # noqa: PGH003 + logger.debug(f"Calculated max: {max_value}") + return max_value + + @property + def range(self) -> float: + """ + Calculate and return the range of the distribution (max - min). + :return: The range of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate range.") + return 0.0 + + range_value = self.max - self.min + logger.debug(f"Calculated range: {range_value}") + return range_value + + def describe(self) -> dict: + """ + Return a dictionary describing various statistics of the distribution. + :return: A dictionary with statistical summaries of the distribution. 
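+
+        Example (illustrative)::
+
+            Distribution(data=[1.0, 2.0, 3.0]).describe()["mean"]  # -> 2.0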
+ """ + description = { + "mean": self.mean, + "median": self.median, + "variance": self.variance, + "std_deviation": self.std_deviation, + "percentile_indices": [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99], + "percentile_values": self.percentiles( + [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99], + ), + "min": self.min, + "max": self.max, + "range": self.range, + } + logger.debug(f"Generated description: {description}") + return description + + def add_data(self, new_data: Sequence[float]): + """ + Add new data points to the distribution. + :param new_data: A list of new numerical data points to add. + """ + self.data = list(self.data) + list(new_data) + logger.debug(f"Added new data: {new_data}") + + def remove_data(self, remove_data: Sequence[float]): + """ + Remove specified data points from the distribution. + :param remove_data: A list of numerical data points to remove. + """ + self.data = [item for item in self.data if item not in remove_data] + logger.debug(f"Removed data: {remove_data}") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py new file mode 100644 index 0000000000000000000000000000000000000000..c48eed561d4eaad4a84dc934264ed4b68d17830a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py @@ -0,0 +1,311 @@ +import time +from datetime import datetime +from typing import List, Optional + +from loguru import logger +from pydantic import Field +from rich.console import Console, Group +from rich.live import Live +from rich.panel import Panel +from rich.table import Table + +from guidellm.core.result import TextGenerationBenchmark, TextGenerationBenchmarkReport +from guidellm.core.serializable import Serializable + +__all__ = ["GuidanceReport"] + + +def _create_benchmark_report_details(report: TextGenerationBenchmarkReport) -> str: + """ + Create a detailed string representation of a benchmark report. + + :param report: The benchmark report to generate details for. + :type report: TextGenerationBenchmarkReport + :return: A string containing the backend, data, rate, and limits of + the benchmark report. + :rtype: str + """ + backend = ( + f"Backend(type={report.args.get('backend_type', 'N/A')}, " + f"target={report.args.get('target', 'N/A')}, " + f"model={report.args.get('model', 'N/A')})" + ) + data = ( + f"Data(type={report.args.get('data_type', 'N/A')}, " + f"source={report.args.get('data', 'N/A')}, " + f"tokenizer={report.args.get('tokenizer', 'N/A')})" + ) + rate = ( + f"Rate(type={report.args.get('mode', 'N/A')}, " + f"rate={report.args.get('rate', 'N/A')})" + ) + limits = ( + f"Limits(max_number={report.args.get('max_number', 'N/A')} requests, " + f"max_duration={report.args.get('max_duration', 'N/A')} sec)" + ) + + logger.debug( + "Created benchmark report details for backend={}, data={}, rate={}, limits={}", + backend, + data, + rate, + limits, + ) + + return backend + "\n" + data + "\n" + rate + "\n" + limits + "\n" + + +def _benchmark_rate_id(benchmark: TextGenerationBenchmark) -> str: + """ + Generate a string identifier for a benchmark rate. + + :param benchmark: The benchmark for which to generate the rate ID. + :type benchmark: TextGenerationBenchmark + :return: A string representing the benchmark rate ID. 
+ :rtype: str + """ + rate_id = ( + f"{benchmark.mode}@{benchmark.rate:.2f} req/sec" + if benchmark.rate + else f"{benchmark.mode}" + ) + logger.debug("Generated benchmark rate ID: {}", rate_id) + return rate_id + + +def _create_benchmark_report_requests_summary( + report: TextGenerationBenchmarkReport, +) -> Table: + """ + Create a table summarizing the requests of a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing the requests. + :rtype: Table + """ + table = Table( + "Benchmark", + "Requests Completed", + "Request Failed", + "Duration", + "Start Time", + "End Time", + title="[magenta]Requests Data by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + start_time_str = ( + datetime.fromtimestamp(benchmark.start_time).strftime("%H:%M:%S") + if benchmark.start_time + else "N/A" + ) + end_time_str = ( + datetime.fromtimestamp(benchmark.end_time).strftime("%H:%M:%S") + if benchmark.end_time + else "N/A" + ) + + table.add_row( + _benchmark_rate_id(benchmark), + f"{benchmark.request_count}/{benchmark.total_count}", + f"{benchmark.error_count}/{benchmark.total_count}", + f"{benchmark.duration:.2f} sec", + f"{start_time_str}", + f"{end_time_str}", + ) + logger.debug("Created requests summary table for the report.") + return table + + +def _create_benchmark_report_data_tokens_summary( + report: TextGenerationBenchmarkReport, +) -> Table: + """ + Create a table summarizing data tokens of a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing the data tokens. + :rtype: Table + """ + table = Table( + "Benchmark", + "Prompt", + "Prompt (1%, 5%, 50%, 95%, 99%)", + "Output", + "Output (1%, 5%, 50%, 95%, 99%)", + title="[magenta]Tokens Data by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + table.add_row( + _benchmark_rate_id(benchmark), + f"{benchmark.prompt_token:.2f}", + ", ".join( + f"{percentile:.1f}" + for percentile in benchmark.prompt_token_percentiles + ), + f"{benchmark.output_token:.2f}", + ", ".join( + f"{percentile:.1f}" + for percentile in benchmark.output_token_percentiles + ), + ) + logger.debug("Created data tokens summary table for the report.") + return table + + +def _create_benchmark_report_dist_perf_summary( + report: TextGenerationBenchmarkReport, +) -> Table: + """ + Create a table summarizing distribution performance of a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing the performance statistics. 
+ :rtype: Table + """ + table = Table( + "Benchmark", + "Request Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (sec)", + "Time to First Token [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", + "Inter Token Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", + title="[magenta]Performance Stats by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + table.add_row( + _benchmark_rate_id(benchmark), + ", ".join( + f"{percentile:.2f}" + for percentile in benchmark.request_latency_percentiles + ), + ", ".join( + f"{percentile * 1000:.1f}" + for percentile in benchmark.time_to_first_token_percentiles + ), + ", ".join( + f"{percentile * 1000:.1f}" + for percentile in benchmark.inter_token_latency_percentiles + ), + ) + logger.debug("Created distribution performance summary table for the report.") + return table + + +def _create_benchmark_report_summary(report: TextGenerationBenchmarkReport) -> Table: + """ + Create a summary table for a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing overall performance. + :rtype: Table + """ + table = Table( + "Benchmark", + "Requests per Second", + "Request Latency", + "Time to First Token", + "Inter Token Latency", + "Output Token Throughput", + title="[magenta]Performance Summary by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + table.add_row( + _benchmark_rate_id(benchmark), + f"{benchmark.completed_request_rate:.2f} req/sec", + f"{benchmark.request_latency:.2f} sec", + f"{benchmark.time_to_first_token:.2f} ms", + f"{benchmark.inter_token_latency:.2f} ms", + f"{benchmark.output_token_throughput:.2f} tokens/sec", + ) + logger.debug("Created overall performance summary table for the report.") + return table + + +class GuidanceReport(Serializable): + """ + A class to manage the guidance reports that include the benchmarking details, + potentially across multiple runs, for saving and loading from disk. + + :param benchmarks: The list of benchmarking reports. + :type benchmarks: List[TextGenerationBenchmarkReport] + """ + + benchmarks: List[TextGenerationBenchmarkReport] = Field( + default_factory=list, description="The list of benchmark reports." + ) + + def print( + self, save_path: Optional[str] = None, continual_refresh: bool = False + ) -> None: + """ + Print the guidance report to the console. + + :param save_path: Optional path to save the report to disk. + :type save_path: Optional[str] + :param continual_refresh: Whether to continually refresh the report. 
+ :type continual_refresh: bool + :return: None + """ + logger.info("Printing guidance report to console with save_path={}", save_path) + report_viz = Panel( + Group( + *[ + Panel( + Group( + _create_benchmark_report_details(benchmark), + "", + _create_benchmark_report_requests_summary(benchmark), + "", + _create_benchmark_report_data_tokens_summary(benchmark), + "", + _create_benchmark_report_dist_perf_summary(benchmark), + "", + _create_benchmark_report_summary(benchmark), + ), + title=( + f"[bold magenta]Benchmark Report " + f"{index + 1}[/bold magenta]" + ), + expand=True, + title_align="left", + ) + for index, benchmark in enumerate(self.benchmarks) + ], + ), + title=( + "[bold cyan]GuideLLM Benchmarks Report[/bold cyan] [italic]" + f"({save_path})[/italic]" + ), + expand=True, + title_align="left", + ) + console = Console() + + if continual_refresh: + logger.info("Starting live report with continual refresh.") + with Live(report_viz, refresh_per_second=1, console=console) as live: + while True: + live.update(report_viz) + time.sleep(1) + else: + console.print(report_viz) + + logger.info("Guidance report printing completed.") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py new file mode 100644 index 0000000000000000000000000000000000000000..06d0f37c8640e637591e199722567dacbf04102b --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py @@ -0,0 +1,65 @@ +import uuid +from typing import Any, Dict, List, Optional, Tuple + +from pydantic import Field + +from guidellm.core.serializable import Serializable +from guidellm.utils import ImageDescriptor + + +class TextGenerationRequest(Serializable): + """ + A class to represent a text generation request for generative AI workloads. + """ + + id: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="The unique identifier for the request.", + ) + prompt: str = Field(description="The input prompt for the text generation.") + images: Optional[List[ImageDescriptor]] = Field( + default=None, + description="Input images.", + ) + prompt_token_count: Optional[int] = Field( + default=None, + description="The number of tokens in the input prompt.", + ) + output_token_count: Optional[int] = Field( + default=None, + description="The number of tokens to generate.", + ) + params: Dict[str, Any] = Field( + default_factory=dict, + description="The parameters for the text generation request.", + ) + + @property + def number_images(self) -> int: + if self.images is None: + return 0 + else: + return len(self.images) + + @property + def image_resolution(self) -> List[Tuple[int, int]]: + if self.images is None: + return None + else: + return [im.size for im in self.images] + + + def __str__(self) -> str: + prompt_short = ( + self.prompt[:32] + "..." 
+            if self.prompt and len(self.prompt) > 32  # noqa: PLR2004
+            else self.prompt
+        )
+
+        return (
+            f"TextGenerationRequest(id={self.id}, "
+            f"prompt={prompt_short}, prompt_token_count={self.prompt_token_count}, "
+            f"output_token_count={self.output_token_count}, "
+            f"params={self.params}, "
+            f"image_resolution={self.image_resolution})"
+        )
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..aebd1763728192228e7115c5842c4a8cec7fc0fe
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py
@@ -0,0 +1,637 @@
+from time import time
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from loguru import logger
+from pydantic import Field, computed_field
+
+from guidellm.core.distribution import Distribution
+from guidellm.core.request import TextGenerationRequest
+from guidellm.core.serializable import Serializable
+
+__all__ = [
+    "RequestConcurrencyMeasurement",
+    "TextGenerationBenchmark",
+    "TextGenerationBenchmarkReport",
+    "TextGenerationError",
+    "TextGenerationResult",
+]
+
+
+class TextGenerationResult(Serializable):
+    """
+    A class to represent the result of a text generation request
+    for generative AI workloads.
+    """
+
+    request: TextGenerationRequest = Field(
+        description="The text generation request used to generate the result.",
+    )
+    prompt: str = Field(
+        default_factory=str,
+        description="The input prompt for the text generation.",
+    )
+    prompt_word_count: int = Field(
+        default=0,
+        description="The number of words in the input prompt.",
+    )
+    prompt_token_count: int = Field(
+        default=0,
+        description="The number of tokens in the input prompt.",
+    )
+    output: str = Field(
+        default_factory=str,
+        description="The generated output for the text generation.",
+    )
+    output_word_count: int = Field(
+        default=0,
+        description="The number of words in the output.",
+    )
+    output_token_count: int = Field(
+        default=0,
+        description="The number of tokens in the output.",
+    )
+    last_time: Optional[float] = Field(
+        default=None,
+        description="The last time recorded.",
+    )
+    first_token_set: bool = Field(
+        default=False,
+        description="Whether the first token time is set.",
+    )
+    start_time: Optional[float] = Field(
+        default=None,
+        description="The start time of the text generation.",
+    )
+    end_time: Optional[float] = Field(
+        default=None,
+        description="The end time of the text generation.",
+    )
+    first_token_time: Optional[float] = Field(
+        default=None,
+        description="The time taken to decode the first token.",
+    )
+    decode_times: Distribution = Field(
+        default_factory=Distribution,
+        description="The distribution of decode times.",
+    )
+
+    def start(self, prompt: str):
+        """
+        Start the text generation by recording the prompt and start time.
+
+        :param prompt: The input prompt for the text generation.
+        :type prompt: str
+        """
+        self.prompt = prompt
+        self.prompt_word_count = len(prompt.split())
+        self.prompt_token_count = len(prompt)  # character count as a token-count placeholder
+        self.start_time = time()
+        self.last_time = time()
+        self.first_token_set = False
+
+        logger.info("Text generation started with prompt: '{}'", prompt)
+
+    def output_token(self, token: str):
+        """
+        Add a token to the output and record the decode time.
+
+        :param token: The decoded token.
+        :type token: str
+        """
+        self._check_recording_started()
+
+        if self.last_time is None:
+            raise ValueError(
+                "last time is not specified. "
+                "Did you call `text_generation_benchmark.start()`?"
+            )
+
+        current_counter = time()
+
+        if not self.first_token_set:
+            self.first_token_time = current_counter - self.last_time
+            self.first_token_set = True
+            logger.debug(f"First token decode time: {self.first_token_time}")
+        else:
+            decode_time = current_counter - self.last_time
+            self.decode_times.add_data([decode_time])
+            logger.debug(f"Token '{token}' decoded in {decode_time} seconds")
+
+        self.last_time = current_counter
+        self.output += token
+        logger.debug("Added token {} to output", token)
+
+    def end(
+        self,
+        output: Optional[str] = None,
+        prompt_token_count: Optional[int] = None,
+        output_token_count: Optional[int] = None,
+    ):
+        """
+        End the text generation by recording the output and end time.
+
+        :param output: The generated output for the text generation.
+        :type output: Optional[str]
+        :param prompt_token_count: Optional token count for the prompt,
+            defaults to word count.
+        :type prompt_token_count: Optional[int]
+        :param output_token_count: Optional token count for the output,
+            defaults to word count.
+        :type output_token_count: Optional[int]
+        """
+        self._check_recording_started()
+        self.end_time = time()
+
+        if output:
+            self.output = output
+
+        self.output_word_count = len(self.output.split())
+        self.output_token_count = output_token_count or self.output_word_count
+        self.prompt_token_count = prompt_token_count or self.prompt_word_count
+
+        logger.info(f"Text generation ended with output: '{self.output}'")
+
+    def _check_recording_started(self):
+        if self.start_time is None:
+            raise ValueError(
+                "start time is not specified. "
+                "Did you call `text_generation_benchmark.start()`?"
+            )
+
+
+class TextGenerationError(Serializable):
+    """
+    A class to represent an error that occurred during a text generation request
+    for generative AI workloads.
+    """
+
+    request: TextGenerationRequest = Field(
+        description="The text generation request that resulted in an error.",
+    )
+    message: str = Field(
+        description="The error message that occurred during text generation.",
+    )
+
+
+class RequestConcurrencyMeasurement(Serializable):
+    """
+    A dataclass to represent the concurrency measurement of a request.
+    """
+
+    time: float = Field(description="The time of the measurement.")
+    completed: int = Field(description="The number of completed requests.")
+    errored: int = Field(description="The number of errored requests.")
+    processing: int = Field(description="The number of processing requests.")
+
+
+class TextGenerationBenchmark(Serializable):
+    """
+    A class to represent a report of text generation requests
+    (results and errors) for generative AI workloads.
+    This is a set of results and errors for a specific mode and rate.
+    """
+
+    mode: Literal["asynchronous", "synchronous", "throughput"] = Field(
+        description=(
+            "The generation mode, one of 'asynchronous', 'synchronous', "
+            "or 'throughput'."
+        )
+ ) + rate: Optional[float] = Field( + default=None, + description="The requested rate of requests per second.", + ) + results: List[TextGenerationResult] = Field( + default_factory=list, + description="The results of the text generation requests.", + ) + errors: List[TextGenerationError] = Field( + default_factory=list, + description="The errors of the text generation requests.", + ) + concurrencies: List[RequestConcurrencyMeasurement] = Field( + default_factory=list, + description="The concurrency measurements of the requests.", + ) + + def __iter__(self): + """ + Provide an iterator interface to iterate over the results. + + :return: An iterator over the results. + """ + return iter(self.results) + + @computed_field # type: ignore[misc] + @property + def request_count(self) -> int: + """ + Get the number of requests in the result. + + :return: The number of requests. + :rtype: int + """ + return len(self.results) + + @computed_field # type: ignore[misc] + @property + def error_count(self) -> int: + """ + Get the number of errors in the result. + + :return: The number of errors. + :rtype: int + """ + return len(self.errors) + + @computed_field # type: ignore[misc] + @property + def total_count(self) -> int: + """ + Get the total number of requests in the result. + + :return: The total number of requests. + :rtype: int + """ + return self.request_count + self.error_count + + @computed_field # type: ignore[misc] + @property + def start_time(self) -> Optional[float]: + """ + Get the start time of the first request in the result. + + :return: The start time of the first request. + :rtype: Optional[float] + """ + if not self.results: + return None + + return self.results[0].start_time + + @computed_field # type: ignore[misc] + @property + def end_time(self) -> Optional[float]: + """ + Get the end time of the last request in the result. + + :return: The end time of the last request. + :rtype: Optional[float] + """ + if not self.results: + return None + + return self.results[-1].end_time + + @computed_field # type: ignore[misc] + @property + def duration(self) -> float: + """ + Get the duration of the result in seconds. + + :return: The duration of the result. + :rtype: float + """ + if not self.results or not self.start_time or not self.end_time: + return 0.0 + + return self.end_time - self.start_time + + @computed_field # type: ignore[misc] + @property + def completed_request_rate(self) -> float: + """ + Get the rate of requests per second in the result. + + :return: The rate of requests per second. + :rtype: float + """ + if not self.results or not self.duration: + return 0.0 + + return len(self.results) / self.duration + + @computed_field # type: ignore[misc] + @property + def request_latency(self) -> float: + """ + Get the average request latency in seconds. + + :return: The average request latency in seconds. + :rtype: float + """ + if not self.results: + return 0.0 + + return self.request_latency_distribution.mean + + @property + def request_latency_distribution(self) -> Distribution: + """ + Get the distribution of request latencies. + + :return: The distribution of request latencies. + :rtype: Distribution + """ + return Distribution( + data=[ + result.end_time - result.start_time + for result in self.results + if result.end_time is not None and result.start_time is not None + ] + ) + + @computed_field # type: ignore[misc] + @property + def request_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles of request latency in seconds. 
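+        The percentiles computed are 1, 5, 10, 50, 90, 95, and 99.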
+ + :return: List of percentile request latency in seconds + :rtype: List[float] + """ + return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + + @computed_field # type: ignore[misc] + @property + def time_to_first_token(self) -> float: + """ + Get the time taken to decode the first token in milliseconds. + + :return: The time taken to decode the first token in milliseconds. + :rtype: float + """ + if not self.results: + return 0.0 + + return 1000 * self.ttft_distribution.mean + + @property + def ttft_distribution(self) -> Distribution: + """ + Get the distribution of time taken to decode the first token. + + :return: The distribution of time taken to decode the first token. + :rtype: Distribution + """ + return Distribution( + data=[ + result.first_token_time + for result in self.results + if result.first_token_time is not None + ] + ) + + @computed_field # type: ignore[misc] + @property + def time_to_first_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for time taken to decode the first token + in milliseconds. + + :return: List of percentile time taken to decode the first token + in milliseconds. + :rtype: List[float] + """ + return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def inter_token_latency(self) -> float: + """ + Get the average time between tokens in milliseconds. + + :return: The average time between tokens. + :rtype: float + """ + if not self.results: + return 0.0 + + return 1000 * self.itl_distribution.mean + + @property + def itl_distribution(self) -> Distribution: + """ + Get the distribution of time between tokens. + + :return: The distribution of time between tokens. + :rtype: Distribution + """ + return Distribution( + data=[ + decode for result in self.results for decode in result.decode_times.data + ] + ) + + @computed_field # type: ignore[misc] + @property + def inter_token_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles for the time between tokens in milliseconds. + + :return: List of percentiles for the average time between tokens. + :rtype: List[float] + """ + return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def output_token_throughput(self) -> float: + """ + Get the average token throughput in tokens per second. + + :return: The average token throughput. + :rtype: float + """ + if not self.results or not self.duration: + return 0.0 + + total_tokens = sum(result.output_token_count for result in self.results) + + return total_tokens / self.duration + + @computed_field # type: ignore[misc] + @property + def prompt_token(self) -> float: + """ + Get the average number of prompt tokens. + + :return: The average number of prompt tokens. + :rtype: float + """ + return self.prompt_token_distribution.mean + + @property + def prompt_token_distribution(self) -> Distribution: + """ + Get the distribution of prompt token counts. + + :return: The distribution of prompt token counts. + :rtype: Distribution + """ + return Distribution(data=[result.prompt_token_count for result in self.results]) + + @computed_field # type: ignore[misc] + @property + def prompt_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of prompt tokens. + + :return: List of percentiles of number of prompt tokens. 
+ :rtype: List[float] + """ + return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def output_token(self) -> float: + """ + Get the average number of output tokens. + + :return: The average number of output tokens. + :rtype: float + """ + return self.output_token_distribution.mean + + @property + def output_token_distribution(self) -> Distribution: + """ + Get the distribution of output token counts. + + :return: The distribution of output token counts. + :rtype: Distribution + """ + return Distribution(data=[result.output_token_count for result in self.results]) + + @computed_field # type: ignore[misc] + @property + def output_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of output tokens. + + :return: List of percentiles of number of output tokens. + :rtype: List[float] + """ + return self.output_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def overloaded(self) -> bool: + if ( + self.rate is None + or not self.results + or not self.concurrencies + or len(self.concurrencies) < 2 # noqa: PLR2004 + ): + # if rate was not set, sync mode is assumed, + # or we have less than 2 data points, + # then we cannot be overloaded by definition + return False + + # if the calculated rate is less than 75% of the requested rate, + # safe to assume the system is overloaded + return self.completed_request_rate < 0.75 * self.rate + + def request_started(self): + """ + Record the start of a generation request. + """ + if not self.concurrencies: + self.concurrencies = [ + RequestConcurrencyMeasurement( + time=time(), + completed=0, + errored=0, + processing=1, + ), + ] + else: + last = self.concurrencies[-1] + self.concurrencies.append( + RequestConcurrencyMeasurement( + time=time(), + completed=last.completed, + errored=last.errored, + processing=last.processing + 1, + ), + ) + + logger.info("Text generation request started") + + def request_completed( + self, + result: Union[TextGenerationResult, TextGenerationError], + ): + """ + Record the completion of a text generation request. + + :param result: The completed result or error. + :type result: Union[TextGenerationResult, TextGenerationError] + """ + if not self.concurrencies: + raise ValueError("Request completed without starting") + + if isinstance(result, TextGenerationError): + is_error = True + self.errors.append(result) + logger.info( + "Text generation request resulted in error: {}", + result.message, + ) + else: + if not result.start_time or not result.end_time: + raise ValueError("Start time and End time are not defined") + + is_error = False + self.results.append(result) + logger.info("Text generation request completed successfully: {}", result) + + last = self.concurrencies[-1] + self.concurrencies.append( + RequestConcurrencyMeasurement( + time=time(), + completed=last.completed + (not is_error), + errored=last.errored + is_error, + processing=last.processing - 1, + ) + ) + + +class TextGenerationBenchmarkReport(Serializable): + """ + A class to represent a report of text generation benchmarks + for generative AI workloads. + This is a collection of benchmarks for different modes and rates. 
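+
+    A minimal assembly sketch (illustrative)::
+
+        report = TextGenerationBenchmarkReport()
+        report.add_benchmark(TextGenerationBenchmark(mode="synchronous"))
+        for benchmark in report.benchmarks_sorted:
+            print(benchmark.request_count)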
+ """ + + benchmarks: List[TextGenerationBenchmark] = Field( + default_factory=list, + description="The benchmarks of text generation requests.", + ) + args: Dict[str, Any] = Field( + default_factory=dict, + description="The arguments used for the benchmarks.", + ) + + def __iter__(self): + return iter(self.benchmarks) + + @property + def benchmarks_sorted(self) -> List[TextGenerationBenchmark]: + """ + Get the list of benchmarks sorted by request rate. + + :return: The sorted list of benchmarks. + :rtype: List[TextGenerationBenchmark] + """ + return sorted(self.benchmarks, key=lambda x: x.completed_request_rate) + + def add_benchmark(self, benchmark: TextGenerationBenchmark): + """ + Add a result to the report. + + :param benchmark: The result to add. + :type benchmark: TextGenerationBenchmark + """ + self.benchmarks.append(benchmark) + logger.debug("Added result: {}", benchmark) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py new file mode 100644 index 0000000000000000000000000000000000000000..1e6b2944ebe0877ed813f3bc5a41147b91b60092 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py @@ -0,0 +1,169 @@ +from pathlib import Path +from typing import Any, Literal, Union, get_args + +import yaml +from loguru import logger +from pydantic import BaseModel, ConfigDict + +__all__ = ["Serializable", "SerializableFileType"] + + +SerializableFileType = Literal["yaml", "json"] + + +class Serializable(BaseModel): + """ + A base class for models that require YAML and JSON serialization and + deserialization. + """ + + model_config = ConfigDict( + extra="forbid", + use_enum_values=True, + validate_assignment=True, + from_attributes=True, + ) + + def __init__(self, /, **data: Any) -> None: + super().__init__(**data) + logger.debug( + "Initialized new instance of {} with data: {}", + self.__class__.__name__, + data, + ) + + def to_yaml(self) -> str: + """ + Serialize the model to a YAML string. + + :return: YAML string representation of the model. + """ + logger.debug("Serializing to YAML... {}", self) + + return yaml.dump(self.model_dump()) + + @classmethod + def from_yaml(cls, data: str): + """ + Deserialize a YAML string to a model instance. + + :param data: YAML string to deserialize. + :return: An instance of the model. + """ + logger.debug("Deserializing from YAML... {}", data) + + return cls.model_validate(yaml.safe_load(data)) + + def to_json(self) -> str: + """ + Serialize the model to a JSON string. + + :return: JSON string representation of the model. + """ + logger.debug("Serializing to JSON... {}", self) + + return self.model_dump_json() + + @classmethod + def from_json(cls, data: str): + """ + Deserialize a JSON string to a model instance. + + :param data: JSON string to deserialize. + :return: An instance of the model. + """ + logger.debug("Deserializing from JSON... {}", data) + + return cls.model_validate_json(data) + + def save_file( + self, + path: Union[str, Path], + type_: SerializableFileType = "yaml", + ) -> str: + """ + Save the model to a file in either YAML or JSON format. + + :param path: Path to the exact file or the containing directory. + If it is a directory, the file name will be inferred from the class name. + :param type_: Optional type to save ('yaml' or 'json'). 
+            If not provided and the path has an extension,
+            it will be inferred to save in that format.
+            If not provided and the path does not have an extension,
+            it will save in YAML format.
+        :return: The path to the saved file.
+        """
+        logger.debug("Saving to file... {} with format: {}", path, type_)
+
+        if isinstance(path, str):
+            path = Path(path)
+
+        if path.suffix:
+            # is a file
+            ext = path.suffix[1:].lower()
+            if ext not in get_args(SerializableFileType):
+                raise ValueError(
+                    f"Unsupported file extension: {ext}. "
+                    f"Expected one of {SerializableFileType} "
+                    f"for {path}"
+                )
+            type_ = ext  # type: ignore  # noqa: PGH003
+        else:
+            # is a directory
+            file_name = f"{self.__class__.__name__.lower()}.{type_}"
+            path = path / file_name
+
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        with path.open("w") as file:
+            if type_ == "yaml":
+                file.write(self.to_yaml())
+            elif type_ == "json":
+                file.write(self.to_json())
+            else:
+                raise ValueError(
+                    f"Unsupported file extension: {type_}. "
+                    f"Expected one of {SerializableFileType} "
+                    f"for {path}"
+                )
+
+        logger.info("Successfully saved {} to {}", self.__class__.__name__, path)
+
+        return str(path)
+
+    @classmethod
+    def load_file(cls, path: Union[str, Path]):
+        """
+        Load a model from a file in either YAML or JSON format.
+
+        :param path: Path to the file.
+        :return: An instance of the model.
+        """
+        logger.debug("Loading from file... {}", path)
+
+        if isinstance(path, str):
+            path = Path(path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {path}")
+
+        if not path.is_file():
+            raise ValueError(f"Path is not a file: {path}")
+
+        extension = path.suffix[1:].lower()
+
+        with path.open() as file:
+            data = file.read()
+
+        if extension == "yaml":
+            obj = cls.from_yaml(data)
+        elif extension == "json":
+            obj = cls.from_json(data)
+        else:
+            raise ValueError(
+                f"Unsupported file extension: {extension}. "
+                f"Expected one of {SerializableFileType} "
+                f"for {path}"
+            )
+
+        return obj
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5858d072bfd8ae7e1092259ccba537f69e65743
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py
@@ -0,0 +1,10 @@
+from .base import Executor, ExecutorResult
+from .profile_generator import Profile, ProfileGenerationMode, ProfileGenerator
+
+__all__ = [
+    "Executor",
+    "ExecutorResult",
+    "Profile",
+    "ProfileGenerationMode",
+    "ProfileGenerator",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..865ab30de412797485b49d2175ac7f94ac3900ba
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py
@@ -0,0 +1,213 @@
+from dataclasses import dataclass
+from typing import AsyncGenerator, Optional, Sequence, Union
+
+from loguru import logger
+
+from guidellm.backend import Backend
+from guidellm.core import TextGenerationBenchmarkReport
+from guidellm.executor.profile_generator import (
+    Profile,
+    ProfileGenerationMode,
+    ProfileGenerator,
+)
+from guidellm.request import RequestGenerator
+from guidellm.scheduler import Scheduler, 
SchedulerResult + +__all__ = ["Executor", "ExecutorResult"] + + +@dataclass +class ExecutorResult: + """ + Data class representing the result of executing tasks in the Executor. + + :param completed: Indicates whether all tasks have completed. + :type completed: bool + :param count_total: Total number of profiles. + :type count_total: int + :param count_completed: Number of completed profiles. + :type count_completed: int + :param report: A report report for text generation. + :type report: TextGenerationBenchmarkReport + :param scheduler_result: Optional scheduler result for the last task. + :type scheduler_result: Optional[SchedulerResult] + """ + + completed: bool + count_total: int + count_completed: int + generation_modes: Sequence[ProfileGenerationMode] + report: TextGenerationBenchmarkReport + scheduler_result: Optional[SchedulerResult] = None + current_index: Optional[int] = None + current_profile: Optional[Profile] = None + + +class Executor: + """ + The Executor class manages the execution of tasks based on a given profile + generation mode and rate. It orchestrates the interaction between the backend, + request generator, and profile generator, and runs benchmarks accordingly. + + :param backend: The backend to run tasks against. + :type backend: Backend + :param request_generator: The generator that creates requests for execution. + :type request_generator: RequestGenerator + :param mode: The mode for profile generation (e.g., sweep, synchronous). + :type mode: ProfileGenerationMode + :param rate: The list of rates for load generation, or None. + :type rate: Optional[List[float]] + :param max_number: Maximum number of requests to generate for the scheduler + (a single report run), or None. + :type max_number: Optional[int] + :param max_duration: Maximum duration for generating requests for the scheduler, + (a single report run), or None. + :type max_duration: Optional[float] + """ + + def __init__( + self, + backend: Backend, + request_generator: RequestGenerator, + mode: ProfileGenerationMode = "sweep", + rate: Optional[Union[float, Sequence[float]]] = None, + max_number: Optional[int] = None, + max_duration: Optional[float] = None, + ): + self._backend = backend + self._generator = request_generator + self._max_number = max_number + self._max_duration = max_duration + self._profile_generator = ProfileGenerator(mode=mode, rate=rate) + logger.info("Executor initialized with mode: {}, rate: {}", mode, rate) + + @property + def backend(self) -> Backend: + """ + Returns the backend being used by the Executor. + + :return: Backend + :rtype: Backend + """ + return self._backend + + @property + def request_generator(self) -> RequestGenerator: + """ + Returns the request generator used by the Executor. + + :return: RequestGenerator + :rtype: RequestGenerator + """ + return self._generator + + @property + def profile_generator(self) -> ProfileGenerator: + """ + Returns the profile generator for generating profiles during execution. + + :return: ProfileGenerator + :rtype: ProfileGenerator + """ + return self._profile_generator + + @property + def max_number(self) -> Optional[int]: + """ + Returns the maximum number of requests to generate. + + :return: Maximum number of requests or None. + :rtype: Optional[int] + """ + return self._max_number + + @property + def max_duration(self) -> Optional[float]: + """ + Returns the maximum duration for generating requests. + + :return: Maximum duration in seconds or None. 
+ :rtype: Optional[float] + """ + return self._max_duration + + async def run(self) -> AsyncGenerator[ExecutorResult, None]: + """ + Runs the Executor, generating and scheduling tasks based on the profile + generation mode. Yields results incrementally. + + :rtype: AsyncGenerator[ExecutorResult, None] + """ + report = TextGenerationBenchmarkReport() + report.args = { + # backend args + "backend_type": self.backend.type_, + "target": self.backend.target, + "model": self.backend.model, + # data args + "data_type": self.request_generator.type_, + "data": self.request_generator.source, + "tokenizer": self.request_generator.tokenizer.name_or_path, + # rate args + "mode": self.profile_generator.mode, + "rate": self.profile_generator.rates, + # limits args + "max_number": self.max_number, + "max_duration": self.max_duration, + } + profile_index = -1 + logger.info("Starting Executor run") + + yield ExecutorResult( + completed=False, + count_total=len(self.profile_generator), + count_completed=0, + generation_modes=self.profile_generator.profile_generation_modes, + report=report, + ) + + while profile := self.profile_generator.next(report): + logger.debug("Generated profile: {}", profile) + scheduler = Scheduler( + generator=self.request_generator, + worker=self.backend, + mode=profile.load_gen_mode, + rate=profile.load_gen_rate, + max_number=self.max_number or profile.args.get("max_number", None), + max_duration=self.max_duration, + ) + profile_index += 1 + + logger.info( + "Scheduling tasks with mode: {}, rate: {}", + profile.load_gen_mode, + profile.load_gen_rate, + ) + + async for scheduler_result in scheduler.run(): + if scheduler_result.completed: + report.add_benchmark(scheduler_result.benchmark) + logger.debug( + "Benchmark added for scheduler result: {}", + scheduler_result.benchmark, + ) + + yield ExecutorResult( + completed=False, + count_total=len(self.profile_generator), + count_completed=len(report.benchmarks), + generation_modes=self.profile_generator.profile_generation_modes, + report=report, + scheduler_result=scheduler_result, + current_index=profile_index, + current_profile=profile, + ) + + logger.info("Executor run completed") + yield ExecutorResult( + completed=True, + count_total=len(self.profile_generator), + count_completed=len(report.benchmarks), + generation_modes=self.profile_generator.profile_generation_modes, + report=report, + ) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..757646cf668ed1be6983a955ead5d81460c6d71e --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py @@ -0,0 +1,350 @@ +from typing import Any, Dict, List, Literal, Optional, Sequence, Union, get_args + +import numpy as np +from loguru import logger +from numpy._typing import NDArray +from pydantic import Field + +from guidellm.config import settings +from guidellm.core import TextGenerationBenchmark, TextGenerationBenchmarkReport +from guidellm.core.serializable import Serializable +from guidellm.scheduler import LoadGenerationMode + +__all__ = [ + "Profile", + "ProfileGenerationMode", + "ProfileGenerator", +] + +ProfileGenerationMode = Literal[ + "sweep", "synchronous", "throughput", "constant", "poisson" +] + + +class Profile(Serializable): + """ + A data class representing a 
profile for load generation. + + :param load_gen_mode: The mode of load generation (e.g., constant, poisson). + :type load_gen_mode: LoadGenerationMode + :param load_gen_rate: The rate of load generation, if applicable. + :type load_gen_rate: Optional[float] + :param args: Additional arguments for the profile. + :type args: Optional[Dict[str, Any]] + """ + + load_gen_mode: LoadGenerationMode + load_gen_rate: Optional[float] = None + args: Dict[str, Any] = Field(default_factory=dict) + + +class ProfileGenerator: + """ + Generates profiles based on different load generation modes. + + :param mode: The mode for profile generation (e.g., sweep, synchronous). + :type mode: ProfileGenerationMode + :param rate: The rate(s) for load generation; could be a float or list of floats. + :type rate: Optional[Union[float, Sequence[float]]] + """ + + def __init__( + self, + mode: ProfileGenerationMode, + rate: Optional[Union[float, Sequence[float]]] = None, + ): + if mode not in get_args(ProfileGenerationMode): + err = ValueError( + f"{mode} is not a valid Profile Generation Mode. " + f"Valid options are {get_args(ProfileGenerationMode)}" + ) + logger.error(err) + raise err + + self._mode = mode + + if self._mode in ("sweep", "throughput", "synchronous"): + if rate is not None: + err = ValueError(f"Rates are not applicable for {self._mode} mode") + logger.error(err) + raise err + self._rates = None + else: + if not rate: + err = ValueError(f"Rates are required for {self._mode} mode") + logger.error(err) + raise err + self._rates = rate if isinstance(rate, Sequence) else [rate] + + for rt in self._rates: + if rt <= 0: + err = ValueError( + f"Rate must be > 0 for mode: {self._mode}. Given: {rt}" + ) + logger.error(err) + raise err + + self._generated_count = 0 + + def __len__(self) -> int: + """ + Returns the number of profiles to generate based on the mode and rates. + + :return: The number of profiles. + :rtype: int + """ + if self._mode == "sweep": + return settings.num_sweep_profiles + 2 + + if self._mode in ("throughput", "synchronous"): + return 1 + + if not self._rates: + raise ValueError(f"Rates are required for {self._mode} mode") + + return len(self._rates) + + @property + def mode(self) -> ProfileGenerationMode: + """ + Returns the current mode of profile generation. + + :return: The profile generation mode. + :rtype: ProfileGenerationMode + """ + return self._mode + + @property + def rates(self) -> Optional[Sequence[float]]: + """ + Returns the list of rates for load generation, if any. + + :return: Sequence of rates or None if not applicable. + :rtype: Optional[Sequence[float]] + """ + return self._rates + + @property + def generated_count(self) -> int: + """ + Returns the current count of generated profiles. + + :return: The current count of generated profiles. + :rtype: int + """ + return self._generated_count + + @property + def profile_generation_modes(self) -> Sequence[ProfileGenerationMode]: + """ + Return the list of profile modes to be run in the report. + + :return: Sequence of profile modes to be run in the report. 
+        :rtype: Sequence[ProfileGenerationMode]
+        """
+        if self._mode == "sweep":
+            return ["synchronous", "throughput"] + ["constant"] * (  # type: ignore # noqa: PGH003
+                settings.num_sweep_profiles
+            )
+
+        if self._mode in ["throughput", "synchronous"]:
+            return [self._mode]
+
+        if self._rates is None:
+            raise ValueError(f"Rates are required for {self._mode} mode")
+
+        if self._mode in ["constant", "poisson"]:
+            return [self._mode] * len(self._rates)
+
+        raise ValueError(f"Invalid mode: {self._mode}")
+
+    def next(self, current_report: TextGenerationBenchmarkReport) -> Optional[Profile]:
+        """
+        Generates the next profile based on the current mode and report.
+
+        :param current_report: The current benchmark report.
+        :type current_report: TextGenerationBenchmarkReport
+        :return: The generated profile or None if no more profiles.
+        :rtype: Optional[Profile]
+        """
+        logger.debug(
+            "Generating the next profile with mode: {}, current report: {}",
+            self.mode,
+            current_report,
+        )
+
+        if self.mode in ["constant", "poisson"]:
+            if not self.rates:
+                err = ValueError(f"Rates are required for {self.mode} mode")
+                logger.error(err)
+                raise err
+
+            profile = self.create_fixed_rate_profile(
+                self.generated_count,
+                self.mode,
+                self.rates,
+            )
+        elif self.mode == "synchronous":
+            profile = self.create_synchronous_profile(self.generated_count)
+        elif self.mode == "throughput":
+            profile = self.create_throughput_profile(self.generated_count)
+        elif self.mode == "sweep":
+            profile = self.create_sweep_profile(
+                self.generated_count,
+                sync_benchmark=(
+                    current_report.benchmarks[0] if current_report.benchmarks else None
+                ),
+                throughput_benchmark=(
+                    current_report.benchmarks[1]
+                    if len(current_report.benchmarks) > 1
+                    else None
+                ),
+            )
+        else:
+            err = ValueError(f"Invalid mode: {self.mode}")
+            logger.error(err)
+            raise err
+
+        self._generated_count += 1
+        logger.info(
+            "Generated profile: {}, total generated count: {}",
+            profile,
+            self._generated_count,
+        )
+        return profile
+
+    @staticmethod
+    def create_fixed_rate_profile(
+        index: int, mode: ProfileGenerationMode, rates: Sequence[float]
+    ) -> Optional[Profile]:
+        """
+        Creates a profile with a fixed rate.
+
+        :param index: The index of the rate in the list.
+        :type index: int
+        :param mode: The mode for profile generation (e.g., constant, poisson).
+        :type mode: ProfileGenerationMode
+        :param rates: The list of rates for load generation.
+        :type rates: Sequence[float]
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        modes_map: Dict[str, LoadGenerationMode] = {
+            "constant": "constant",
+            "poisson": "poisson",
+        }
+
+        if mode not in modes_map:
+            err = ValueError(f"Invalid mode: {mode}")
+            logger.error(err)
+            raise err
+
+        profile = (
+            Profile(
+                load_gen_mode=modes_map[mode],
+                load_gen_rate=rates[index],
+            )
+            if index < len(rates)
+            else None
+        )
+        logger.debug("Created fixed rate profile: {}", profile)
+        return profile
+
+    @staticmethod
+    def create_synchronous_profile(index: int) -> Optional[Profile]:
+        """
+        Creates a profile with synchronous mode.
+
+        :param index: The index of the profile to create.
+        :type index: int
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        profile = (
+            Profile(
+                load_gen_mode="synchronous",
+                load_gen_rate=None,
+            )
+            if index < 1
+            else None
+        )
+        logger.debug("Created synchronous profile: {}", profile)
+        return profile
+
+    @staticmethod
+    def create_throughput_profile(index: int) -> Optional[Profile]:
+        """
+        Creates a profile with throughput mode.
+
+        :param index: The index of the profile to create.
+        :type index: int
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        profile = (
+            Profile(
+                load_gen_mode="throughput",
+                load_gen_rate=None,
+            )
+            if index < 1
+            else None
+        )
+        logger.debug("Created throughput profile: {}", profile)
+        return profile
+
+    @staticmethod
+    def create_sweep_profile(
+        index: int,
+        sync_benchmark: Optional[TextGenerationBenchmark],
+        throughput_benchmark: Optional[TextGenerationBenchmark],
+    ) -> Optional[Profile]:
+        """
+        Creates a profile with sweep mode, generating profiles between
+        synchronous and throughput benchmarks.
+
+        :param index: The index of the profile to create.
+        :type index: int
+        :param sync_benchmark: The synchronous benchmark data.
+        :type sync_benchmark: Optional[TextGenerationBenchmark]
+        :param throughput_benchmark: The throughput benchmark data.
+        :type throughput_benchmark: Optional[TextGenerationBenchmark]
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        if index < 0 or index >= settings.num_sweep_profiles + 2:
+            return None
+
+        if index == 0:
+            return ProfileGenerator.create_synchronous_profile(0)
+
+        if not sync_benchmark:
+            err = ValueError("Synchronous benchmark is required for sweep mode")
+            logger.error(err)
+            raise err
+
+        if index == 1:
+            throughput_profile: Profile = ProfileGenerator.create_throughput_profile(0)  # type: ignore # noqa: PGH003
+            # cap the throughput run at 5x the synchronous request count in case
+            # max_number is not set, so the sweep's request volume stays bounded
+            throughput_profile.args = {"max_number": sync_benchmark.request_count * 5}
+            return throughput_profile
+
+        if not throughput_benchmark:
+            err = ValueError("Throughput benchmark is required for sweep mode")
+            logger.error(err)
+            raise err
+
+        min_rate = sync_benchmark.completed_request_rate
+        max_rate = throughput_benchmark.completed_request_rate
+        intermediate_rates: List[NDArray] = list(
+            np.linspace(min_rate, max_rate, settings.num_sweep_profiles + 1)
+        )[1:]
+
+        return Profile(
+            load_gen_mode="constant",
+            load_gen_rate=(
+                float(load_gen_rate)
+                if (load_gen_rate := intermediate_rates[index - 2])
+                else 1.0  # fallback if the interpolated rate is zero
+            ),
+        )
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..f26966c029ac8e173031822233e971ec7512144b
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py
@@ -0,0 +1,83 @@
+"""
+Logger configuration for GuideLLM.
+
+This module provides a flexible logging configuration using the loguru library.
+It supports console and file logging with options to configure via environment
+variables or direct function calls.
+
+Environment Variables:
+    - GUIDELLM__LOGGING__DISABLED: Disable logging (default: false).
+    - GUIDELLM__LOGGING__CLEAR_LOGGERS: Clear existing loggers
+      from loguru (default: true).
+    - GUIDELLM__LOGGING__LOG_LEVEL: Log level for console logging
+      (default: none, options: DEBUG, INFO, WARNING, ERROR, CRITICAL).
+    - GUIDELLM__LOGGING__FILE: Path to the log file for file logging
+      (default: guidellm.log if log file level set else none)
+    - GUIDELLM__LOGGING__FILE_LEVEL: Log level for file logging
+      (default: INFO if log file set else none).
+
+Usage:
+    from guidellm import configure_logger, logger
+    from guidellm.config import LoggingSettings
+
+    # configure the logger explicitly
+    configure_logger(
+        config=LoggingSettings(
+            disabled=False,
+            clear_loggers=True,
+            console_log_level="DEBUG",
+            log_file=None,
+            log_file_level=None,
+        )
+    )
+
+    logger.debug("This is a debug message")
+    logger.info("This is an info message")
+"""
+
+import sys
+
+from loguru import logger
+
+from guidellm.config import LoggingSettings, settings
+
+__all__ = ["configure_logger", "logger"]
+
+
+def configure_logger(config: LoggingSettings = settings.logging):
+    """
+    Configure the logger for GuideLLM.
+    This function sets up the console and file logging
+    as per the specified or default parameters.
+
+    Note: Environment variables take precedence over the function parameters.
+
+    :param config: The configuration for the logger to use.
+    :type config: LoggingSettings
+    """
+
+    if config.disabled:
+        logger.disable("guidellm")
+        return
+
+    logger.enable("guidellm")
+
+    if config.clear_loggers:
+        logger.remove()
+
+    # log as a human readable string with the time, function, level, and message
+    logger.add(
+        sys.stdout,
+        level=config.console_log_level.upper(),
+        format="{time} | {function} | {level} - {message}",
+    )
+
+    if config.log_file or config.log_file_level:
+        log_file = config.log_file or "guidellm.log"
+        log_file_level = config.log_file_level or "INFO"
+        # log as json to the file for easier parsing
+        logger.add(log_file, level=log_file_level.upper(), serialize=True)
+
+
+# invoke logger setup on import with default values
+# enabling console logging with INFO and disabling file logging
+configure_logger()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..4748b12d92126698ad18e55388db5e6491293cb6
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py
@@ -0,0 +1,341 @@
+import asyncio
+from typing import Literal, Optional, Union, get_args
+
+import click
+from loguru import logger
+
+from guidellm.backend import Backend, BackendEnginePublic
+from guidellm.core import GuidanceReport, TextGenerationBenchmarkReport
+from guidellm.executor import Executor, ProfileGenerationMode
+from guidellm.request import (
+    EmulatedRequestGenerator,
+    FileRequestGenerator,
+    TransformersDatasetRequestGenerator,
+)
+from guidellm.request.base import RequestGenerator
+from guidellm.utils import BenchmarkReportProgress, cli_params
+
+__all__ = ["generate_benchmark_report"]
+
+
+@click.command()
+@click.option(
+    "--target",
+    type=str,
+    required=True,
+    help=(
+        "The target path or url for the backend to evaluate. "
+        "Ex: 'http://localhost:8000/v1'"
+    ),
+)
+@click.option(
+    "--backend",
+    type=click.Choice(get_args(BackendEnginePublic)),
+    default="openai_server",
+    help=(
+        "The backend to use for benchmarking. "
+        "The default is OpenAI Server, enabling compatibility with any server that "
+        "follows the OpenAI spec, including vLLM."
+    ),
+)
+@click.option(
+    "--model",
+    type=str,
+    default=None,
+    help=(
+        "The model to use for benchmarking. If not provided, it will use "
+        "the first available model, provided the backend supports listing models."
+    ),
+)
+@click.option(
+    "--data",
+    type=str,
+    required=True,
+    help=(
+        "The data source to use for benchmarking. "
+        "Depending on the data-type, it should be a "
+        "path to a data file containing prompts to run (ex: data.txt), "
+        "a HuggingFace dataset name (ex: 'neuralmagic/LLM_compression_calibration'), "
+        "or a configuration for emulated data "
+        "(ex: 'prompt_tokens=128,generated_tokens=128')."
+    ),
+)
+@click.option(
+    "--data-type",
+    type=click.Choice(["emulated", "file", "transformers"]),
+    required=True,
+    help=(
+        "The type of data to use for benchmarking. "
+        "Use 'emulated' for synthetic data, 'file' for a file, or 'transformers' "
+        "for a HuggingFace dataset. Specify the data source with the --data flag."
+    ),
+)
+@click.option(
+    "--tokenizer",
+    type=str,
+    default=None,
+    help=(
+        "The tokenizer to use for calculating the number of prompt tokens. "
+        "This should match the tokenizer used by the model. "
+        "By default, it will use the --model flag to determine the tokenizer. "
+        "If not provided and the model is not available, will raise an error. "
+        "Ex: 'neuralmagic/Meta-Llama-3.1-8B-quantized.w8a8'"
+    ),
+)
+@click.option(
+    "--rate-type",
+    type=click.Choice(get_args(ProfileGenerationMode)),
+    default="sweep",
+    help=(
+        "The type of request rate to use for benchmarking. "
+        "Use sweep to run a full range from synchronous to throughput (default), "
+        "synchronous for sending requests one after the other, "
+        "throughput to send requests as fast as possible, "
+        "constant for a fixed request rate, "
+        "or poisson for a real-world variable request rate."
+    ),
+)
+@click.option(
+    "--rate",
+    type=float,
+    default=None,
+    help=(
+        "The request rate to use for constant and poisson rate types. "
+        "To run multiple rates, provide the flag multiple times."
+    ),
+    multiple=True,
+)
+@click.option(
+    "--max-seconds",
+    type=int,
+    default=120,
+    help=(
+        "The maximum number of seconds for each benchmark run. "
+        "Either max-seconds, max-requests, or both must be set. "
+        "The default is 120 seconds. "
+        "Note, this is the maximum time for each rate supplied, not the total time. "
+        "This value should be large enough to allow for "
+        "the server's performance to stabilize."
+    ),
+)
+@click.option(
+    "--max-requests",
+    type=cli_params.MAX_REQUESTS,
+    default=None,
+    help=(
+        "The maximum number of requests for each benchmark run. "
+        "Either max-seconds, max-requests, or both must be set. "
+        "Note, this is the maximum number of requests for each rate supplied, "
+        "not the total number of requests. "
+        "This value should be large enough to allow for "
+        "the server's performance to stabilize."
+    ),
+)
+@click.option(
+    "--output-path",
+    type=str,
+    default=None,
+    help=(
+        "The output path to save the output report to for loading later. "
+        "Ex: guidance_report.json. "
+        "The default is None, meaning no output is saved and results are only "
+        "printed to the console."
+    ),
+)
+@click.option(
+    "--enable-continuous-refresh",
+    is_flag=True,
+    default=False,
+    help=(
+        "Enable continual refreshing of the output table in the CLI "
+        "until the user exits. "
" + ), +) +def generate_benchmark_report_cli( + target: str, + backend: BackendEnginePublic, + model: Optional[str], + data: Optional[str], + data_type: Literal["emulated", "file", "transformers"], + tokenizer: Optional[str], + rate_type: ProfileGenerationMode, + rate: Optional[float], + max_seconds: Optional[int], + max_requests: Union[Literal["dataset"], int, None], + output_path: str, + enable_continuous_refresh: bool, +): + """ + Generate a benchmark report for a specified backend and dataset. + """ + generate_benchmark_report( + target=target, + backend=backend, + model=model, + data=data, + data_type=data_type, + tokenizer=tokenizer, + rate_type=rate_type, + rate=rate, + max_seconds=max_seconds, + max_requests=max_requests, + output_path=output_path, + cont_refresh_table=enable_continuous_refresh, + ) + + +def generate_benchmark_report( + target: str, + data: Optional[str], + data_type: Literal["emulated", "file", "transformers"], + backend: BackendEnginePublic="openai_server", + model: Optional[str]=None, + tokenizer: Optional[str]=None, + rate_type: ProfileGenerationMode="sweep", + rate: Optional[float]=None, + max_seconds: Optional[int]=120, + max_requests: Union[Literal["dataset"], int, None]=None, + output_path: str=None, + cont_refresh_table: bool=False, +) -> GuidanceReport: + """ + Generate a benchmark report for a specified backend and dataset. + + :param target: The target URL or path for the backend to evaluate. + :param backend: The backend type to use for benchmarking. + :param model: The model to benchmark; + defaults to the first available if not specified. + :param data: The data source for benchmarking, + which may be a path, dataset name, or config. + :param data_type: The type of data to use, + such as 'emulated', 'file', or 'transformers'. + :param tokenizer: The tokenizer to use for token counting, + defaulting to Llama 3.1 if not provided. + :param rate_type: The rate type for requests during benchmarking. + :param rate: The specific request rate for constant and poisson rate types. + :param max_seconds: Maximum duration for each benchmark run in seconds. + :param max_requests: Maximum number of requests per benchmark run. + :param output_path: Path to save the output report file. + :param cont_refresh_table: Continually refresh the table in the CLI + until the user exits. 
+ """ + logger.info( + "Generating benchmark report with target: {}, backend: {}", target, backend + ) + + # Create backend + backend_inst = Backend.create( + backend_type=backend, + target=target, + model=model, + ) + + request_generator: RequestGenerator + + # Create tokenizer and request generator + tokenizer_inst = tokenizer + if not tokenizer_inst: + try: + tokenizer_inst = backend_inst.model_tokenizer() + except Exception as err: + raise ValueError( + "Could not load model's tokenizer, " + "--tokenizer must be provided for request generation" + ) from err + + if data_type == "emulated": + request_generator = EmulatedRequestGenerator( + config=data, tokenizer=tokenizer_inst + ) + elif data_type == "file": + request_generator = FileRequestGenerator(path=data, tokenizer=tokenizer_inst) + elif data_type == "transformers": + request_generator = TransformersDatasetRequestGenerator( + dataset=data, tokenizer=tokenizer_inst + ) + else: + raise ValueError(f"Unknown data type: {data_type}") + + if data_type == "emulated" and max_requests == "dataset": + raise ValueError("Cannot use 'dataset' for emulated data") + + # Create executor + executor = Executor( + backend=backend_inst, + request_generator=request_generator, + mode=rate_type, + rate=rate if rate_type in ("constant", "poisson") else None, + max_number=( + len(request_generator) if max_requests == "dataset" else max_requests + ), + max_duration=max_seconds, + ) + + # Run executor + logger.debug( + "Running executor with args: {}", + { + "backend": backend, + "request_generator": request_generator, + "mode": rate_type, + "rate": rate, + "max_number": max_requests, + "max_duration": max_seconds, + }, + ) + report = asyncio.run(_run_executor_for_result(executor)) + + # Save and print report + guidance_report = GuidanceReport() + guidance_report.benchmarks.append(report) + + if output_path: + guidance_report.save_file(output_path) + + guidance_report.print( + save_path=output_path if output_path is not None else "stdout", + continual_refresh=cont_refresh_table, + ) + + return guidance_report + + +async def _run_executor_for_result(executor: Executor) -> TextGenerationBenchmarkReport: + report = None + progress = BenchmarkReportProgress() + started = False + + async for result in executor.run(): + if not started: + progress.start(result.generation_modes) # type: ignore # noqa: PGH003 + started = True + + if result.current_index is not None: + description = f"{result.current_profile.load_gen_mode}" # type: ignore # noqa: PGH003 + if result.current_profile.load_gen_mode in ("constant", "poisson"): # type: ignore # noqa: PGH003 + description += f"@{result.current_profile.load_gen_rate:.2f} req/s" # type: ignore # noqa: PGH003 + + progress.update_benchmark( + index=result.current_index, + description=description, + completed=result.scheduler_result.completed, # type: ignore # noqa: PGH003 + completed_count=result.scheduler_result.count_completed, # type: ignore # noqa: PGH003 + completed_total=result.scheduler_result.count_total, # type: ignore # noqa: PGH003 + start_time=result.scheduler_result.benchmark.start_time, # type: ignore # noqa: PGH003 + req_per_sec=result.scheduler_result.benchmark.completed_request_rate, # type: ignore # noqa: PGH003 + ) + + if result.completed: + report = result.report + break + + progress.finish() + + if not report: + raise ValueError("No report generated by executor") + + return report + + +if __name__ == "__main__": + generate_benchmark_report_cli() diff --git 
a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4feca91cdbbe9a137bd8ad404394116a50868360 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py @@ -0,0 +1,13 @@ +from .base import GenerationMode, RequestGenerator +from .emulated import EmulatedConfig, EmulatedRequestGenerator +from .file import FileRequestGenerator +from .transformers import TransformersDatasetRequestGenerator + +__all__ = [ + "EmulatedConfig", + "EmulatedRequestGenerator", + "FileRequestGenerator", + "GenerationMode", + "RequestGenerator", + "TransformersDatasetRequestGenerator", +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd303e605f7043408c7751733c75e7429caa726 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py @@ -0,0 +1,200 @@ +import contextlib +import threading +import time +from abc import ABC, abstractmethod +from queue import Empty, Full, Queue +from typing import Iterator, Literal, Union + +from loguru import logger +from transformers import ( # type: ignore # noqa: PGH003 + AutoTokenizer, + PreTrainedTokenizer, +) + +from guidellm.core.request import TextGenerationRequest + +__all__ = ["GenerationMode", "RequestGenerator"] + + +GenerationMode = Literal["async", "sync"] + + +class RequestGenerator(ABC): + """ + A base class for request generators that generate result requests. + + :param type_: The type of the request generator. + :type type_: str + :param source: The data source for the request generator. + :type source: str + :param tokenizer: The tokenizer instance or the name/config to use + for tokenizing prompts. + :type tokenizer: Union[str, PreTrainedTokenizer] + :param mode: The generation mode, either 'async' or 'sync'. + :type mode: GenerationMode + :param async_queue_size: The size of the request queue. + :type async_queue_size: int + """ + + def __init__( + self, + type_: str, + source: str, + tokenizer: Union[str, PreTrainedTokenizer], + mode: GenerationMode = "async", + async_queue_size: int = 50, + ): + self._type = type_ + self._source = source + self._async_queue_size: int = async_queue_size + self._mode: str = mode + self._queue: Queue = Queue(maxsize=async_queue_size) + self._stop_event: threading.Event = threading.Event() + + if not tokenizer: + err = "Tokenizer must be provided for request generation" + logger.error(err) + raise ValueError(err) + + self._tokenizer = ( + AutoTokenizer.from_pretrained(tokenizer) + if isinstance(tokenizer, str) + else tokenizer + ) + logger.info("Tokenizer initialized for request generation: {}", self._tokenizer) + + if self._mode == "async": + self._thread = threading.Thread(target=self._populate_queue, daemon=True) + self._thread.start() + logger.info( + "RequestGenerator started in async mode with queue size: {}", + self._async_queue_size, + ) + + def __repr__(self) -> str: + """ + Return a string representation of the RequestGenerator. + + :return: String representation of the RequestGenerator. 
+ :rtype: str + """ + return ( + f"RequestGenerator(" + f"mode={self._mode}, " + f"async_queue_size={self._async_queue_size}, " + f"tokenizer={self._tokenizer})" + ) + + def __iter__(self) -> Iterator[TextGenerationRequest]: + """ + Provide an iterator interface to generate new requests. + + :return: An iterator over result requests. + :rtype: Iterator[TextGenerationRequest] + """ + if self.mode == "async": + while not self._stop_event.is_set(): + try: + item = self._queue.get_nowait() + self._queue.task_done() + yield item + except Empty: + time.sleep(0.01) + continue + else: + while not self._stop_event.is_set(): + yield self.create_item() + + @abstractmethod + def __len__(self) -> int: + """ + Abstract method to get the length of the collection to be generated. + """ + + @abstractmethod + def create_item(self) -> TextGenerationRequest: + """ + Abstract method to create a new result request item. + + :return: A new result request. + :rtype: TextGenerationRequest + """ + + @property + def type_(self) -> str: + """ + Get the type of the request generator. + + :return: The type of the request generator. + :rtype: str + """ + return self._type + + @property + def source(self) -> str: + """ + Get the data source for the request generator. + + :return: The data source. + :rtype: str + """ + return self._source + + @property + def tokenizer(self) -> PreTrainedTokenizer: + """ + Get the tokenizer instance. + + :return: The tokenizer instance. + :rtype: PreTrainedTokenizer + """ + return self._tokenizer + + @property + def mode(self) -> str: + """ + Get the generation mode. + + :return: The generation mode. + :rtype: str + """ + return self._mode + + @property + def async_queue_size(self) -> int: + """ + Get the size of the request queue. + + :return: The size of the request queue. + :rtype: int + """ + return self._async_queue_size + + def stop(self): + """ + Stop the background task that populates the queue. + """ + logger.info("Stopping RequestGenerator...") + self._stop_event.set() + if self._mode == "async": + self._thread.join() + logger.info("RequestGenerator stopped") + + def _populate_queue(self): + """ + Populate the request queue in the background. + """ + + while not self._stop_event.is_set(): + with contextlib.suppress(Full): + if self._queue.qsize() < self._async_queue_size: + item = self.create_item() + self._queue.put(item, timeout=0.1) + logger.debug( + "Item added to queue. 
Current queue size: {}", + self._queue.qsize(), + ) + else: + time.sleep(0.1) + + logger.info("RequestGenerator stopped populating queue") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py new file mode 100644 index 0000000000000000000000000000000000000000..02f564a1ceecd9e977ce0b8d5c37a0394adeb69a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py @@ -0,0 +1,416 @@ +import json +import math +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +from loguru import logger +from transformers import PreTrainedTokenizer # type: ignore # noqa: PGH003 + +from guidellm.config import settings +from guidellm.core.request import TextGenerationRequest +from guidellm.request.base import GenerationMode, RequestGenerator +from guidellm.utils import clean_text, filter_text, load_images, load_text, split_text + +__all__ = ["EmulatedConfig", "EmulatedRequestGenerator", "EndlessTokens"] + + +@dataclass +class EmulatedConfig: + """ + Configuration for emulated text generation requests. + + Args: + prompt_tokens (int): Number of prompt tokens. + prompt_tokens_variance (Optional[int]): Variance for prompt tokens. + prompt_tokens_min (Optional[int]): Minimum number of prompt tokens. + prompt_tokens_max (Optional[int]): Maximum number of prompt tokens. + generated_tokens (Optional[int]): Number of generated tokens. + generated_tokens_variance (Optional[int]): Variance for generated tokens. + generated_tokens_min (Optional[int]): Minimum number of generated tokens. + generated_tokens_max (Optional[int]): Maximum number of generated tokens. + images (Optional[int]): Number of images. + width (Optional[int]): Width of images. + height (Optional[int]): Height of images. + """ + + @staticmethod + def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": + """ + Create an EmulatedConfig instance from a configuration source. + + :param config: Configuration source, can be a dictionary, JSON string, + key=value string, or file path. + :type config: Union[str, Path, Dict] + :return: An instance of EmulatedConfig. + :rtype: EmulatedConfig + :raises FileNotFoundError: If the configuration file is not found. + :raises ValueError: If the configuration format is invalid. 
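+
+        Example inputs (illustrative of the accepted formats parsed below):
+
+        - dict: ``{"prompt_tokens": 128, "generated_tokens": 128}``
+        - JSON string, or path to a ``.json`` file, with the same keys
+        - key=value string: ``"prompt_tokens=128,generated_tokens=128"``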
+ """ + if not config: + logger.debug("Creating default configuration") + return EmulatedConfig(prompt_tokens=1024, generated_tokens=256, images=0) + + if isinstance(config, dict): + logger.debug("Loading configuration from dict: {}", config) + return EmulatedConfig(**config) + + if isinstance(config, Path) or ( + isinstance(config, str) and (config.endswith(".json") or "{" in config) + ): + logger.debug("Loading configuration from json: {}", config) + + if isinstance(config, str) and "{" in config: + json_text = config.strip() + else: + if isinstance(config, str): + config = Path(config) + + if not config.exists(): + raise FileNotFoundError(f"Configuration file not found: {config}") + + json_text = config.read_text(encoding="utf-8") + + json_dict = json.loads(json_text) + + return EmulatedConfig(**json_dict) + + if isinstance(config, str) and "=" in config: + logger.debug("Loading configuration from csv string: {}", config) + items = config.split(",") + config_dict = {} + for item in items: + key_value = item.strip().split("=") + if len(key_value) != 2: # noqa: PLR2004 + raise ValueError(f"Unexpected format for item: {item}") + key = key_value[0].strip() + value = ( + int(key_value[1].strip()) + if key_value[1].isnumeric() + else key_value[1] + ) + config_dict[key] = value + + return EmulatedConfig(**config_dict) # type: ignore # noqa: PGH003 + + raise ValueError( + f"Invalid configuration given for creation of EmulatedConfig: {config}" + ) + + prompt_tokens: int + prompt_tokens_variance: Optional[int] = None + prompt_tokens_min: Optional[int] = None + prompt_tokens_max: Optional[int] = None + + generated_tokens: Optional[int] = None + generated_tokens_variance: Optional[int] = None + generated_tokens_min: Optional[int] = None + generated_tokens_max: Optional[int] = None + + images: int = 0 + width: int = None + height: int = None + + @property + def prompt_tokens_range(self) -> Tuple[int, int]: + """ + Get the range (min, max) of prompt tokens to generate. + + :return: The range of prompt tokens. + :rtype: Tuple[int, int] + """ + return self._token_range( + self.prompt_tokens, + self.prompt_tokens_variance, + self.prompt_tokens_min, + self.prompt_tokens_max, + ) + + @property + def output_tokens_range(self) -> Tuple[int, int]: + """ + Get the range (min, max) of output tokens to generate. + + :return: The range of generated tokens. + :rtype: Tuple[int, int] + """ + if not self.generated_tokens: + return 0, 0 + + return self._token_range( + self.generated_tokens, + self.generated_tokens_variance, + self.generated_tokens_min, + self.generated_tokens_max, + ) + + def sample_prompt_tokens(self, rng: np.random.Generator) -> int: + """ + Sample the number of prompt tokens to generate. + + :param rng: The random number generator to use. + :type rng: np.random.Generator + :return: The number of prompt tokens to create. + :rtype: int + """ + return self._sample_tokens( + self.prompt_tokens, + self.prompt_tokens_variance, + self.prompt_tokens_min, + self.prompt_tokens_max, + rng, + ) + + def sample_output_tokens(self, rng: np.random.Generator) -> Optional[int]: + """ + Sample the number of output tokens to generate. + + :param rng: The random number generator to use. + :type rng: np.random.Generator + :return: The number of output tokens to generate. 
+ :rtype: Optional[int] + """ + if not self.generated_tokens: + return None + + return self._sample_tokens( + self.generated_tokens, + self.generated_tokens_variance, + self.generated_tokens_min, + self.generated_tokens_max, + rng, + ) + + @staticmethod + def _sample_tokens( + base: int, + variance: Optional[int], + min_tokens: Optional[int], + max_tokens: Optional[int], + rng: np.random.Generator, + ) -> int: + min_tokens, max_tokens = EmulatedConfig._token_range( + base, variance, min_tokens, max_tokens + ) + + if min_tokens == max_tokens: + return min_tokens + + if not variance: + return rng.integers(min_tokens, max_tokens + 1) + + rand = rng.normal(base, math.sqrt(variance)) + + return int(min(max(rand, min_tokens), max_tokens)) + + @staticmethod + def _token_range( + base: int, + variance: Optional[int], + min_tokens: Optional[int], + max_tokens: Optional[int], + ) -> Tuple[int, int]: + if not variance: + return ( + min_tokens or base, + max_tokens or base, + ) + + min_tokens = min_tokens if min_tokens and min_tokens > 0 else 1 + max_tokens = ( + max_tokens if max_tokens and max_tokens > base else base + 5 * variance + ) + + return min_tokens, max_tokens + + +class EndlessTokens(List[str]): + """ + A list subclass that allows for endless data generation. + """ + + def __init__( + self, + data: Union[str, Path], + filter_start: Optional[Union[str, int]] = None, + filter_end: Optional[Union[str, int]] = None, + clean_text_args: Optional[Dict[str, bool]] = None, + ): + """ + Initialize EndlessDataWords with data. + + :param data: Source text data. + :type data: str + """ + logger.debug("Loading data from: {}", data) + data = load_text(data) + data = filter_text(data, filter_start, filter_end) + data = ( + clean_text(data) + if not clean_text_args + else clean_text(data, **clean_text_args) + ) + self._tokens, self._token_separators, self._line_indices = split_text(data) + + super().__init__(self._tokens) + + @property + def line_indices(self) -> List[int]: + """ + Get the list of start indices for lines. + + :return: List of start indices. + :rtype: List[int] + """ + return self._line_indices + + def create_text(self, start: int, length: int) -> str: + """ + Create a text snippet from the specified range. + + :param start: Start index. + :type start: int + :param length: Length of the snippet. + :type length: int + :return: Text snippet. + :rtype: str + """ + start = start % len(self) + text = "" + buff_token_sep = "" + + for counter in range(length): + index = (start + counter) % len(self) + text += buff_token_sep + self[index] + buff_token_sep = self._token_separators[index] + + return text + + +class EmulatedRequestGenerator(RequestGenerator): + """ + A request generator that generates emulated requests based on a configuration. + + :param config: The configuration string, file path, or dictionary. + :type config: Union[str, Dict, Path] + :param random_seed: The random seed to use for generating requests. + :type random_seed: Optional[int] + :param tokenizer: The tokenizer instance or the name/config to use + for tokenizing prompts. + :type tokenizer: Optional[Union[str, PreTrainedTokenizer]] + :param mode: The generation mode, either 'async' or 'sync'. + :type mode: GenerationMode + :param async_queue_size: The size of the request queue. 
+ :type async_queue_size: int + """ + + def __init__( + self, + config: Optional[Union[str, Path, Dict]], + random_seed: Optional[int] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + mode: GenerationMode = "async", + async_queue_size: int = 50, + ): + """ + Initialize EmulatedRequestGenerator with configuration and tokenizer. + + :param config: Configuration source, can be a dictionary, + JSON string, or file path. + :type config: Optional[Union[str, Path, Dict]] + :param random_seed: Optional seed for random number generator. + :type random_seed: Optional[int] + :param tokenizer: Tokenizer instance or configuration for tokenizing prompts. + :type tokenizer: Optional[Union[str, PreTrainedTokenizer]] + :param mode: Mode of request generation, either 'async' or 'sync'. + :type mode: str + :param async_queue_size: Size of the asynchronous queue. + :type async_queue_size: int + """ + self._config = EmulatedConfig.create_config(config) + self._tokens = EndlessTokens( + settings.emulated_data.source, + settings.emulated_data.filter_start, + settings.emulated_data.filter_end, + ) + if self._config.images > 0: + self._images = load_images(settings.emulated_data.image_source, [self._config.width, self._config.height]) + self._rng = np.random.default_rng(random_seed) + + # NOTE: Must be after all the parameters since the queue population + # function requires attributes above + super().__init__( + type_="emulated", + source=str(config), + tokenizer=tokenizer, + mode=mode, + async_queue_size=async_queue_size, + ) + + def __len__(self) -> int: + raise NotImplementedError( + "Can't get the length of the emulated dataset. " + "Check the `--data-type` CLI parameter." + ) + + def create_item(self) -> TextGenerationRequest: + """ + Create a new text generation request item from the data. + + :return: A new text generation request. + :rtype: TextGenerationRequest + """ + logger.debug("Creating new text generation request") + target_prompt_token_count = self._config.sample_prompt_tokens(self._rng) + prompt = self.sample_prompt(target_prompt_token_count) + images = self.sample_images() + prompt_token_count = len(self.tokenizer.tokenize(prompt)) + output_token_count = self._config.sample_output_tokens(self._rng) + logger.debug("Generated prompt: {}", prompt) + + return TextGenerationRequest( + prompt=prompt, + prompt_token_count=prompt_token_count, + output_token_count=output_token_count, + images=images, + ) + + def sample_prompt(self, tokens: int) -> str: + """ + Sample a prompt with the specified number of tokens. + + :param tokens: Number of tokens for the prompt. + :type tokens: int + :return: Sampled prompt text. 
+        :rtype: str
+        """
+        start_line_index = self._rng.integers(0, len(self._tokens.line_indices))
+
+        # binary search to find the proper number of tokens for the prompt
+        # this is because tokenizers differ in tokenization behavior
+        left = 0
+        right = left + 5 * tokens
+
+        while left < right:
+            mid = (left + right) // 2
+            prompt = self._tokens.create_text(start_line_index, mid)
+            token_count = len(self.tokenizer.tokenize(prompt))
+
+            if token_count == tokens:
+                return prompt
+
+            if token_count < tokens:
+                left = mid + 1
+            else:
+                right = mid
+
+        return self._tokens.create_text(start_line_index, left)
+
+    def sample_images(self):
+        """
+        Sample the configured number of images for a request.
+
+        :return: A list of sampled images (empty if no images are configured).
+        """
+        if not self._config.images:
+            # guard: self._images is only loaded in __init__ when images > 0
+            return []
+
+        image_indices = self._rng.choice(
+            len(self._images),
+            size=self._config.images,
+            replace=False,
+        )
+
+        return [self._images[i] for i in image_indices]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py
new file mode 100644
index 0000000000000000000000000000000000000000..b187f7b46b343311daa32fe465d60fff163beff2
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py
@@ -0,0 +1,83 @@
+from pathlib import Path
+from typing import Optional, Union
+
+from loguru import logger
+from transformers import PreTrainedTokenizer  # type: ignore # noqa: PGH003
+
+from guidellm.config import settings
+from guidellm.core.request import TextGenerationRequest
+from guidellm.request.base import GenerationMode, RequestGenerator
+from guidellm.utils import load_text_lines
+
+__all__ = ["FileRequestGenerator"]
+
+
+class FileRequestGenerator(RequestGenerator):
+    """
+    A request generator implementation for files.
+
+    :param path: The path to the file containing the data.
+    :type path: Optional[Union[str, Path]]
+    :param tokenizer: The tokenizer instance or the name/config to use
+        for tokenizing prompts.
+    :type tokenizer: Union[str, PreTrainedTokenizer]
+    :param mode: The generation mode, either 'async' or 'sync'.
+    :type mode: str
+    :param async_queue_size: The size of the request queue.
+    :type async_queue_size: int
+    """
+
+    def __init__(
+        self,
+        path: Optional[Union[str, Path]],
+        tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+        mode: GenerationMode = "async",
+        async_queue_size: int = 50,
+    ):
+        if not path:
+            raise ValueError("File path must be provided for FileRequestGenerator")
+
+        self._path = path
+        self._data = load_text_lines(
+            path,
+            filters=settings.dataset.preferred_data_columns,
+        )
+        self._iterator = iter(self._data)
+
+        # NOTE: Must be after all the parameters since the queue population
+        # function requires attributes above
+        super().__init__(
+            type_="file",
+            source=str(path),
+            tokenizer=tokenizer,
+            mode=mode,
+            async_queue_size=async_queue_size,
+        )
+
+    def __len__(self) -> int:
+        """
+        Return the number of text lines.
+        """
+
+        return len(self._data)
+
+    def create_item(self) -> TextGenerationRequest:
+        """
+        Create a new result request item from the data.
+
+        :return: A new result request.
+ :rtype: TextGenerationRequest + """ + logger.debug("Creating new request item from file data") + + try: + data = next(self._iterator) + except StopIteration: + self._iterator = iter(self._data) + data = next(self._iterator) + + token_count = len(self.tokenizer.tokenize(data)) + request = TextGenerationRequest(prompt=data, prompt_token_count=token_count) + logger.debug("Created new TextGenerationRequest: {}", request) + + return request diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd24040d3e59a95a69a2b829552bbca83bc5338 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py @@ -0,0 +1,103 @@ +from pathlib import Path +from typing import Optional, Union + +from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict +from loguru import logger +from transformers import PreTrainedTokenizer # type: ignore # noqa: PGH003 + +from guidellm.core.request import TextGenerationRequest +from guidellm.request.base import GenerationMode, RequestGenerator +from guidellm.utils import ( + load_transformers_dataset, + resolve_transformers_dataset_column, +) + +__all__ = ["TransformersDatasetRequestGenerator"] + + +class TransformersDatasetRequestGenerator(RequestGenerator): + """ + A request generator implementation for Hugging Face datasets. + + :param dataset: The name of the Hugging Face dataset to use or the path + to a local dataset. + :type dataset_name: str + :param split: The split of the dataset to use (e.g., 'train', 'test'). + :type split: str + :param column: The column/field to use for generating requests. + :type column: str + :param tokenizer: The tokenizer instance or the name/config to use + for tokenizing prompts. + :type tokenizer: Union[str, PreTrainedTokenizer] + :param mode: The generation mode, either 'async' or 'sync'. + :type mode: str + :param async_queue_size: The size of the request queue. + :type async_queue_size: int + """ + + def __init__( + self, + dataset: Union[ + str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset + ], + split: Optional[str] = None, + column: Optional[str] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + mode: GenerationMode = "async", + async_queue_size: int = 50, + **kwargs, + ): + self._dataset = dataset + self._split = split + self._column = column + self._kwargs = kwargs + + self._hf_dataset: Union[Dataset, IterableDataset] = load_transformers_dataset( + dataset, split=split, **kwargs + ) + self._hf_column = resolve_transformers_dataset_column( + self._hf_dataset, column=column + ) + self._hf_dataset_iterator = iter(self._hf_dataset) + + # NOTE: Must be after all the parameters since the queue population + # function requires attributes above + super().__init__( + type_="transformers_dataset", + source=str(dataset), + tokenizer=tokenizer, + mode=mode, + async_queue_size=async_queue_size, + ) + + def __len__(self) -> int: + if not isinstance(self._hf_dataset, Dataset): + raise ValueError("Can't get dataset size for IterableDataset object") + else: + return len(self._hf_dataset) + + def create_item(self) -> TextGenerationRequest: + """ + Create a new result request item from the dataset. + + :return: A new result request. 
+        :rtype: TextGenerationRequest
+        """
+
+        logger.debug("Creating new request item from dataset")
+
+        try:
+            data = next(self._hf_dataset_iterator)
+        except StopIteration:
+            self._hf_dataset_iterator = iter(self._hf_dataset)
+            data = next(self._hf_dataset_iterator)
+
+        prompt = data[self._hf_column]
+        token_count = len(self.tokenizer.tokenize(prompt))
+        request = TextGenerationRequest(
+            prompt=prompt,
+            prompt_token_count=token_count,
+        )
+        logger.debug("Created new TextGenerationRequest: {}", request)
+
+        return request
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3b4ac50c647ad1adea9ad4368ac58727c11bc19
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py
@@ -0,0 +1,4 @@
+from .base import Scheduler, SchedulerResult
+from .load_generator import LoadGenerationMode, LoadGenerator
+
+__all__ = ["LoadGenerationMode", "LoadGenerator", "Scheduler", "SchedulerResult"]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..602166b01a88d5e9525f3277b2b39602e3a82fd6
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py
@@ -0,0 +1,374 @@
+import asyncio
+import math
+import time
+from dataclasses import dataclass
+from typing import AsyncGenerator, Literal, Optional, Union, get_args
+
+from loguru import logger
+
+from guidellm.backend import Backend
+from guidellm.config import settings
+from guidellm.core import (
+    TextGenerationBenchmark,
+    TextGenerationError,
+    TextGenerationRequest,
+    TextGenerationResult,
+)
+from guidellm.request import RequestGenerator
+from guidellm.scheduler.load_generator import LoadGenerationMode, LoadGenerator
+
+__all__ = ["Scheduler", "SchedulerResult"]
+
+
+@dataclass
+class SchedulerResult:
+    """
+    Represents the result of a single task execution within the Scheduler.
+
+    :param completed: Indicates if the task is completed.
+    :type completed: bool
+    :param count_total: Total number of tasks to be executed.
+    :type count_total: int
+    :param count_completed: Number of tasks that have been completed so far.
+    :type count_completed: int
+    :param benchmark: Benchmark data for the task execution.
+    :type benchmark: TextGenerationBenchmark
+    :param current_result: The result of the current request, if any.
+    :type current_result: Optional[Union[TextGenerationResult, TextGenerationError]]
+    """
+
+    completed: bool
+    count_total: int
+    count_completed: int
+    benchmark: TextGenerationBenchmark
+    current_result: Optional[Union[TextGenerationResult, TextGenerationError]] = None
+
+
+class Scheduler:
+    """
+    Schedules and manages the execution of tasks for text generation requests.
+
+    :param generator: The request generator that produces text generation requests.
+    :type generator: RequestGenerator
+    :param worker: The backend worker that processes the requests.
+    :type worker: Backend
+    :param mode: The mode of load generation (e.g., synchronous, asynchronous).
+    :type mode: LoadGenerationMode
+    :param rate: The rate at which requests are generated, if applicable.
+ :type rate: Optional[float] + :param max_number: Maximum number of requests to be processed. + :type max_number: Optional[int] + :param max_duration: Maximum duration in seconds for which requests + should be processed. + :type max_duration: Optional[float] + + :raises ValueError: If neither max_number nor max_duration is specified or + if they are not positive. + """ + + def __init__( + self, + generator: RequestGenerator, + worker: Backend, + mode: LoadGenerationMode = "synchronous", + rate: Optional[float] = None, + max_number: Optional[int] = None, + max_duration: Optional[float] = None, + ): + logger.info( + "Scheduler initialized with params: generator={}, worker={}, mode={}, " + "rate={}, max_number={}, max_duration={}", + generator, + worker, + mode, + rate, + max_number, + max_duration, + ) + + if mode not in get_args(LoadGenerationMode): + err = ValueError( + f"{mode} is not a valid Load Generation Mode. " + f"Valid options are {get_args(LoadGenerationMode)}" + ) + logger.error(err) + raise err + + if not max_number and not max_duration: + err = ValueError("Either max_number or max_duration must be specified") + logger.error(err) + raise err + + if max_number and max_number <= 0: + err = ValueError(f"max_number must be > 0, given: {max_number}") + logger.error(err) + raise err + + if max_duration and max_duration <= 0: + err = ValueError(f"max_duration must be > 0, given: {max_duration}") + logger.error(err) + raise err + + if mode in ["constant", "poisson"] and not rate: + err = ValueError(f"Rate must be > 0 for mode: {mode}. Given: {rate}") + logger.error(err) + raise err + + self._generator = generator + self._worker = worker + self._mode = mode + self._rate = rate + self._max_number = max_number + self._max_duration = max_duration + + self._load_generator = LoadGenerator(mode, rate) + + @property + def generator(self) -> RequestGenerator: + """ + The request generator that produces text generation requests. + + :return: The request generator instance. + :rtype: RequestGenerator + """ + return self._generator + + @property + def worker(self) -> Backend: + """ + The backend worker that processes the requests. + + :return: The backend worker instance. + :rtype: Backend + """ + return self._worker + + @property + def mode(self) -> LoadGenerationMode: + """ + The mode of load generation (e.g., synchronous, asynchronous). + + :return: The load generation mode. + :rtype: LoadGenerationMode + """ + return self._mode + + @property + def rate(self) -> Optional[float]: + """ + The rate at which requests are generated, if applicable. + + :return: The rate of request generation. + :rtype: Optional[float] + """ + return self._rate + + @property + def max_number(self) -> Optional[int]: + """ + Maximum number of requests to be processed. + + :return: The maximum number of requests. + :rtype: Optional[int] + """ + return self._max_number + + @property + def max_duration(self) -> Optional[float]: + """ + Maximum duration in seconds for which requests should be processed. + + :return: The maximum duration in seconds. + :rtype: Optional[float] + """ + return self._max_duration + + @property + def load_generator(self) -> LoadGenerator: + """ + The load generator responsible for generating load based on mode and rate. + + :return: The load generator instance. + :rtype: LoadGenerator + """ + return self._load_generator + + @property + def benchmark_mode(self) -> Literal["asynchronous", "synchronous", "throughput"]: + """ + The report mode for the scheduler. + + :return: The report mode. 
+ :rtype: Literal["asynchronous", "synchronous", "throughput"] + """ + if self._mode == "synchronous": + return "synchronous" + + if self._mode == "throughput": + return "throughput" + + return "asynchronous" + + async def run(self) -> AsyncGenerator[SchedulerResult, None]: + """ + Run the scheduler to process requests based on the configured mode, rate, + maximum number, and maximum duration. + + :yield: The result of each task executed by the scheduler. + :rtype: Generator[SchedulerResult, None, None] + """ + logger.info("Starting Scheduler run") + + benchmark = TextGenerationBenchmark(mode=self.benchmark_mode, rate=self.rate) + start_time = time.time() + end_time = start_time + self.max_duration if self.max_duration else math.inf + max_number = float(self.max_number) if self.max_number else math.inf + runner = self._run_sync if self._mode == "synchronous" else self._run_async + count_total = ( + self.max_number + if self.max_number + else round(self.max_duration) + if self.max_duration + else 0 + ) + + # yield initial result for progress tracking + yield SchedulerResult( + completed=False, + count_total=count_total, + count_completed=0, + benchmark=benchmark, + ) + + run_count = 0 + async for res in runner(benchmark, end_time, max_number): + run_count += 1 + count_completed = ( + min(run_count, self.max_number) + if self.max_number + else round(time.time() - start_time) + if self.max_duration + else 0 + ) + + yield SchedulerResult( + completed=False, + count_total=count_total, + count_completed=count_completed, + benchmark=benchmark, + current_result=res, + ) + + logger.info("Scheduler run completed") + + yield SchedulerResult( + completed=True, + count_total=count_total, + count_completed=( + benchmark.request_count + benchmark.error_count + if self.max_number + else round(time.time() - start_time) + if self.max_duration + else 0 + ), + benchmark=benchmark, + ) + + async def _run_sync( + self, benchmark: TextGenerationBenchmark, end_time: float, max_number: float + ) -> AsyncGenerator[Union[TextGenerationResult, TextGenerationError], None]: + for index, (request, submit_at) in enumerate( + zip(self.generator, self.load_generator.times()) + ): + if index >= max_number or time.time() >= end_time: + break + + logger.debug( + "Running synchronous request={} at submit_at={}", + request, + submit_at, + ) + benchmark.request_started() + result = await self._submit_task_coroutine(request, submit_at, end_time) + if result is not None: + benchmark.request_completed(result) + logger.debug("Request completed with output: {}", result) + yield result + + async def _run_async( + self, benchmark: TextGenerationBenchmark, end_time: float, max_number: float + ) -> AsyncGenerator[Union[TextGenerationResult, TextGenerationError], None]: + tasks = [] + completed = 0 + + for index, (request, submit_at) in enumerate( + zip(self.generator, self.load_generator.times()) + ): + while (index + 1 - completed) >= settings.max_concurrency: + await asyncio.sleep(0.1) + + if index >= max_number or time.time() >= end_time or submit_at >= end_time: + break + + logger.debug( + "Running asynchronous request={} at submit_at={}", + request, + submit_at, + ) + + def _completed(_task: asyncio.Task) -> None: + nonlocal completed + completed += 1 + _res = _task.result() + + if _res: + benchmark.request_completed(_res) + logger.debug("Request completed: {}", _res) + + benchmark.request_started() + task = asyncio.create_task( + self._submit_task_coroutine(request, submit_at, end_time) + ) + task.add_done_callback(_completed) + 
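+            # NOTE: the done callback above increments `completed`, freeing a slot
+            # in the concurrency window checked at the top of this loop
+            # (index + 1 - completed >= settings.max_concurrency)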
tasks.append(task) + + # release control to the event loop for other tasks + await asyncio.sleep(0.001) + + for compl_task in asyncio.as_completed(tasks): + task_res = await compl_task + if task_res is not None: + yield task_res + + async def _submit_task_coroutine( + self, request: TextGenerationRequest, submit_at: float, end_time: float + ) -> Optional[Union[TextGenerationResult, TextGenerationError]]: + try: + if submit_at > end_time: + logger.info( + "Request {} submission time {} is greater than end time {}", + request, + submit_at, + end_time, + ) + raise asyncio.TimeoutError( + f"Request submission time {submit_at} " + f"is greater than end time {end_time}" + ) + + if submit_at > time.time(): + await asyncio.sleep(submit_at - time.time()) + + timeout = ( + end_time - time.time() if end_time and end_time < math.inf else None + ) + + return await asyncio.wait_for(self._worker.submit(request), timeout=timeout) + except asyncio.TimeoutError as exc: + logger.info("Request {} timed out: {}", request, exc) + + return None + except Exception as exc: # noqa: BLE001 + logger.warning("Request {} failed: {}", request, exc) + + return TextGenerationError(request=request, message=str(exc)) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..f629752ab2e1b961615ece6ba1e90f48274e89a2 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py @@ -0,0 +1,196 @@ +import time +from typing import Generator, Literal, Optional, get_args + +import numpy as np +from loguru import logger + +__all__ = ["LoadGenerationMode", "LoadGenerator"] + +LoadGenerationMode = Literal["synchronous", "constant", "poisson", "throughput"] + + +class LoadGenerator: + """ + Load Generator class that generates timestamps for load generation. + + This class supports multiple load generation modes: "constant", "poisson", + "throughput", and "synchronous". Each mode has its own method for generating + timestamps based on the rate provided during initialization. + + :param mode: The mode of load generation. Valid options are "constant", + "poisson", "throughput", and "synchronous". + :type mode: LoadGenerationMode + :param rate: The rate at which to generate timestamps. This value is + interpreted differently depending on the mode. + :type rate: float + + :raises ValueError: If an invalid mode is provided. + """ + + def __init__(self, mode: LoadGenerationMode, rate: Optional[float] = None): + """ + Initialize the Load Generator with the mode and rate. + + :param mode: The mode of load generation ("constant", "poisson", "throughput", + or "synchronous"). + :type mode: LoadGenerationMode + :param rate: The rate at which to generate timestamps. In the "constant" + mode, this represents the frequency of events. In the "poisson" mode, + it represents the average frequency. + :type rate: Optional[float] + """ + if mode not in get_args(LoadGenerationMode): + error = ValueError( + f"{mode} is not a valid Load Generation Mode. " + f"Valid options are {get_args(LoadGenerationMode)}" + ) + logger.error(error) + raise error + + if mode not in ["synchronous", "throughput"] and (rate is None or rate <= 0): + error = ValueError(f"Rate must be > 0 for mode: {mode}. 
Given: {rate}")
+            logger.error(error)
+            raise error
+
+        self._mode = mode
+        self._rate = rate
+        logger.debug(
+            "Initialized LoadGenerator with mode: {mode}, rate: {rate}",
+            mode=mode,
+            rate=rate,
+        )
+
+    @property
+    def mode(self) -> LoadGenerationMode:
+        """
+        Get the mode of load generation.
+
+        :return: The mode of load generation.
+        :rtype: LoadGenerationMode
+        """
+        return self._mode
+
+    @property
+    def rate(self) -> Optional[float]:
+        """
+        Get the rate of load generation.
+
+        :return: The rate of load generation.
+        :rtype: Optional[float]
+        """
+        return self._rate
+
+    def times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps for load generation based on the selected mode.
+
+        :return: A generator that yields timestamps at which each load
+            should be initiated.
+        :rtype: Generator[float, None, None]
+
+        :raises ValueError: If the mode is invalid.
+        """
+        logger.debug(f"Generating timestamps using mode: {self._mode}")
+
+        if self._mode == "throughput":
+            yield from self.throughput_times()
+        elif self._mode == "constant":
+            yield from self.constant_times()
+        elif self._mode == "poisson":
+            yield from self.poisson_times()
+        elif self._mode == "synchronous":
+            yield from self.synchronous_times()
+        else:
+            logger.error(f"Invalid mode encountered: {self._mode}")
+            raise ValueError(f"Invalid mode: {self._mode}")
+
+    def synchronous_times(self) -> Generator[float, None, None]:
+        """
+        Generate invalid timestamps for the "synchronous" mode.
+
+        :return: A generator that yields a constant invalid timestamp (-1.0).
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating invalid timestamps for synchronous mode")
+        while True:
+            yield -1.0
+
+    def throughput_times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps at the maximum rate possible, returning the current time.
+
+        :return: A generator that yields the current time in seconds.
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating timestamps at throughput rate")
+        while True:
+            yield time.time()
+
+    def constant_times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps at a constant rate based on the specified rate.
+
+        :return: A generator that yields timestamps incremented by 1/rate seconds.
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating constant rate timestamps with rate: {}", self._rate)
+
+        if self._rate is None or self._rate <= 0:
+            raise ValueError(
+                f"Rate must be > 0 for constant mode, given: {self._rate}"
+            )
+
+        start_time = time.time()
+        time_increment = 1.0 / self._rate
+        counter = 0
+
+        while True:
+            yield_time = start_time + time_increment * counter
+            logger.debug(f"Yielding timestamp: {yield_time}")
+            yield yield_time
+            counter += 1
+
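+    # NOTE: the Poisson mode below works in one-second windows: it draws the
+    # number of requests for the window from Poisson(rate), spaces them with
+    # exponentially distributed gaps, and drops any arrivals that spill past
+    # the window boundary before advancing to the next window (a window that
+    # draws zero still yields a single timestamp at its end).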
+    def poisson_times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps based on a Poisson process, where the number
+        of requests to be sent per second is drawn from a Poisson distribution.
+        The inter-arrival time between requests is exponentially distributed.
+
+        :return: A generator that yields timestamps based on a Poisson distribution.
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating Poisson rate timestamps with rate: {}", self._rate)
+
+        if self._rate is None or self._rate <= 0:
+            raise ValueError(f"Rate must be > 0 for poisson mode, given: {self._rate}")
+
+        time_tracker = time.time()
+        rng = np.random.default_rng()
+        time_increment = 1.0
+
+        while True:
+            num_requests = rng.poisson(self._rate)
+
+            if num_requests == 0:
+                yield time_tracker + time_increment
+            else:
+                inter_arrival_times = rng.exponential(1.0 / self._rate, num_requests)
+                logger.debug(
+                    "Calculated new inter-arrival times for poisson process: {}",
+                    inter_arrival_times,
+                )
+                arrival_time_tracker = time_tracker
+
+                for arrival_time in inter_arrival_times:
+                    arrival_time_tracker += arrival_time
+
+                    if arrival_time_tracker > time_tracker + time_increment:
+                        logger.debug(
+                            "Arrival time tracker: {} is greater than current time",
+                            arrival_time_tracker,
+                        )
+                        break
+
+                    yield arrival_time_tracker
+
+            time_tracker += time_increment  # Move on to the next time period
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb4931bdabcddce94d443a809876950424c803f5
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py
@@ -0,0 +1,43 @@
+from .images import ImageDescriptor, load_images
+from .injector import create_report, inject_data
+from .progress import BenchmarkReportProgress
+from .text import (
+    clean_text,
+    filter_text,
+    is_path,
+    is_path_like,
+    is_url,
+    load_text,
+    load_text_lines,
+    parse_text_objects,
+    split_lines_by_punctuation,
+    split_text,
+)
+from .transformers import (
+    load_transformers_dataset,
+    resolve_transformers_dataset,
+    resolve_transformers_dataset_column,
+    resolve_transformers_dataset_split,
+)
+
+__all__ = [
+    "BenchmarkReportProgress",
+    "clean_text",
+    "create_report",
+    "filter_text",
+    "inject_data",
+    "is_path",
+    "is_path_like",
+    "is_url",
+    "load_text",
+    "load_text_lines",
+    "load_transformers_dataset",
+    "parse_text_objects",
+    "resolve_transformers_dataset",
+    "resolve_transformers_dataset_column",
+    "resolve_transformers_dataset_split",
+    "split_lines_by_punctuation",
+    "split_text",
+    "ImageDescriptor",
+    "load_images",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e8800d2abf8df387de691bda21073c643f9129b
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py
@@ -0,0 +1,34 @@
+"""
+This module includes custom CLI parameters for the `click` package.
+"""
+
+from typing import Any, Optional
+
+from click import Context, Parameter, ParamType
+
+__all__ = ["MAX_REQUESTS"]
+
+
+class MaxRequestsType(ParamType):
+    """
+    Catch the `dataset` string parameter to determine the behavior of the Scheduler.
+ """ + + name = "max_requests" + + def convert( + self, value: Any, param: Optional[Parameter], ctx: Optional[Context] + ) -> Any: + if isinstance(value, int): + return value + + try: + return int(value) + except ValueError: + if value == "dataset": + return value + else: + self.fail(f"{value} is not a valid integer or 'dataset'", param, ctx) + + +MAX_REQUESTS = MaxRequestsType() diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py new file mode 100644 index 0000000000000000000000000000000000000000..fb66d4321309c39f92b6e1cf3ce737f7bf5c2f4c --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py @@ -0,0 +1,80 @@ +from io import BytesIO +from typing import List, Optional, Tuple +from urllib.parse import urljoin + +import requests +from bs4 import BeautifulSoup +from loguru import logger +from PIL import Image +from pydantic import ConfigDict, Field, computed_field + +from guidellm.config import settings +from guidellm.core.serializable import Serializable + +__all__ = ["load_images", "ImageDescriptor"] + +class ImageDescriptor(Serializable): + """ + A class to represent image data in serializable format. + """ + model_config = ConfigDict(arbitrary_types_allowed=True) + + url: Optional[str] = Field(description="url address for image.") + image: Image.Image = Field(description="PIL image", exclude=True) + filename: Optional[int] = Field( + default=None, + description="Image filename.", + ) + + @computed_field # type: ignore[misc] + @property + def image_resolution(self) -> Tuple[int, int]: + if self.image is None: + return None + else: + return self.image.size + + +def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]: + """ + Load an HTML file from a path or URL + + :param data: the path or URL to load the HTML file from + :type data: Union[str, Path] + :return: Descriptor containing image url and the data in PIL.Image.Image format + :rtype: ImageDescriptor + """ + + images = [] + if not data: + return None + if isinstance(data, str) and data.startswith("http"): + response = requests.get(data, timeout=settings.request_timeout) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + for img_tag in soup.find_all("img"): + img_url = img_tag.get("src") + + if img_url: + # Handle relative URLs + img_url = urljoin(data, img_url) + + # Download the image + logger.debug("Loading image: {}", img_url) + img_response = requests.get(img_url) + img_response.raise_for_status() + image = Image.open(BytesIO(img_response.content)) + + if image_resolution is not None: + image = image.resize(image_resolution) + + # Load image into Pillow + images.append( + ImageDescriptor( + url=img_url, + image=image, + ) + ) + + return images diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5216aa65fe83328015af1517e049fadd344677 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py @@ -0,0 +1,70 @@ +from pathlib import Path +from typing import Union + +from pydantic import BaseModel + +from guidellm.config import settings +from guidellm.utils.text import load_text + 
+__all__ = ["create_report", "inject_data"] + + +def create_report(model: BaseModel, output_path: Union[str, Path]) -> Path: + """ + Creates a report from the model and saves it to the output path. + + :param model: the model to serialize and inject + :type model: BaseModel + :param output_path: the path, either a file or a directory, + to save the report to. If a directory, the report will be saved + as "report.html" inside of the directory. + :type output_path: str + :return: the path to the saved report + :rtype: str + """ + if not isinstance(output_path, Path): + output_path = Path(output_path) + + html_content = load_text(settings.report_generation.source) + report_content = inject_data( + model, + html_content, + settings.report_generation.report_html_match, + settings.report_generation.report_html_placeholder, + ) + + if not output_path.suffix: + # assume directory, save as report.html + output_path = output_path / "report.html" + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(report_content) + + return output_path + + +def inject_data( + model: BaseModel, + html: str, + match: str, + placeholder: str, +) -> str: + """ + Injects the data from the model into the HTML while replacing the placeholder. + + :param model: the model to serialize and inject + :type model: BaseModel + :param html: the html to inject the data into + :type html: str + :param match: the string to match in the html to find the placeholder + :type match: str + :param placeholder: the placeholder to replace with the model data + inside of the placeholder + :type placeholder: str + :return: the html with the model data injected + :rtype: str + """ + model_str = model.json() + inject_str = match.replace(placeholder, model_str) + + return html.replace(match, inject_str) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py new file mode 100644 index 0000000000000000000000000000000000000000..a1e1e7987e2aaa226e5c38dfbf9a9445aac60b43 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py @@ -0,0 +1,199 @@ +from datetime import datetime +from typing import List + +from loguru import logger +from rich.console import Group +from rich.live import Live +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TaskID, + TaskProgressColumn, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) + +__all__ = ["BenchmarkReportProgress"] + + +class BenchmarkReportProgress: + """ + Manages the progress display for benchmarks and report generation using Rich. + + This class provides a visual representation of the benchmarking process + and report generation using Rich's progress bars and panels. + """ + + def __init__(self): + """ + Initialize the BenchmarkReportProgress with default settings. + + This method sets up the progress displays for both individual benchmarks + and the overall report, as well as initializing internal task management + structures. 
+ """ + logger.info("Initializing BenchmarkReportProgress instance") + + self.benchmarks_progress = Progress( + TextColumn("[{task.fields[start_time_str]}]"), + SpinnerColumn(), + TaskProgressColumn(), + TextColumn("{task.description}"), + TextColumn(" "), + TextColumn( + "[bold cyan]({task.fields[req_per_sec]} req/sec avg)[/bold cyan]" + ), + ) + self.benchmarks_panel = Panel( + self.benchmarks_progress, + title="Benchmarks", + title_align="left", + expand=True, + ) + self.report_progress = Progress( + SpinnerColumn(), + TextColumn("Generating report..."), + BarColumn(bar_width=None), + TextColumn( + "({task.fields[completed_benchmarks]}/{task.fields[total_benchmarks]})" + ), + TextColumn("["), + TimeElapsedColumn(), + TextColumn("<"), + TimeRemainingColumn(), + TextColumn("]"), + ) + self.render_group = Group(self.benchmarks_panel, self.report_progress) + self.live = Live(self.render_group, redirect_stdout=True, redirect_stderr=True) + + self.report_task: TaskID = None # type: ignore # noqa: PGH003 + self.benchmark_tasks: List[TaskID] = [] + self.benchmark_tasks_started: List[bool] = [] + self.benchmark_tasks_completed: List[bool] = [] + self.benchmark_tasks_progress: List[float] = [] + + def start(self, task_descriptions: List[str]) -> None: + """ + Starts the live progress display and initializes benchmark tasks. + + :param task_descriptions: List of descriptions for each benchmark task. + :type task_descriptions: List[str] + """ + logger.info( + "Starting BenchmarkReportProgress with task descriptions: {}", + task_descriptions, + ) + self.live.start() + + for task_description in task_descriptions: + logger.debug("Adding task with description: {}", task_description) + task_id = self.benchmarks_progress.add_task( + task_description, + start=False, + total=None, + start_time_str="--:--:--", + req_per_sec="#.##", + ) + self.benchmark_tasks.append(task_id) + self.benchmark_tasks_started.append(False) + self.benchmark_tasks_completed.append(False) + self.benchmark_tasks_progress.append(0) + + self.report_task = self.report_progress.add_task( + "", + total=len(self.benchmark_tasks) * 100, # 100 points per report + completed_benchmarks=0, + total_benchmarks=len(task_descriptions), + ) + logger.info("Initialized {} benchmark tasks", len(task_descriptions)) + + def update_benchmark( + self, + index: int, + description: str, + completed: bool, + completed_count: int, + completed_total: int, + start_time: float, + req_per_sec: float, + ) -> None: + """ + Updates the progress of a specific benchmark task. + + :param index: Index of the benchmark task to update. + :type index: int + :param description: Description of the current benchmark task. + :type description: str + :param completed: Flag indicating if the benchmark is completed. + :type completed: bool + :param completed_count: Number of completed operations for the task. + :type completed_count: int + :param completed_total: Total number of operations for the task. + :type completed_total: int + :param start_time: Start time of the benchmark in timestamp format. + :type start_time: float + :param req_per_sec: Average requests per second. + :type req_per_sec: float + :raises ValueError: If trying to update a completed benchmark. 
+ """ + + if self.benchmark_tasks_completed[index]: + err = ValueError(f"Benchmark {index} already completed") + logger.error("Error updating benchmark: {}", err) + raise err + + if not self.benchmark_tasks_started[index]: + self.benchmark_tasks_started[index] = True + self.benchmarks_progress.start_task(self.benchmark_tasks[index]) + logger.info("Starting benchmark task at index {}", index) + + if completed: + self.benchmark_tasks_completed[index] = True + self.benchmark_tasks_progress[index] = 100 + self.benchmarks_progress.stop_task(self.benchmark_tasks[index]) + logger.info("Completed benchmark task at index {}", index) + + self.benchmark_tasks_progress[index] = completed_count / completed_total * 100 + self.benchmarks_progress.update( + self.benchmark_tasks[index], + description=description, + total=completed_total, + completed=completed_count if not completed else completed_total, + req_per_sec=(f"{req_per_sec:.2f}" if req_per_sec else "#.##"), + start_time_str=( + datetime.fromtimestamp(start_time).strftime("%H:%M:%S") + if start_time + else "--:--:--" + ), + ) + logger.debug( + "Updated benchmark task at index {}: {}% complete", + index, + self.benchmark_tasks_progress[index], + ) + self.report_progress.update( + self.report_task, + total=len(self.benchmark_tasks) * 100, + completed=sum(self.benchmark_tasks_progress), + completed_benchmarks=sum(self.benchmark_tasks_completed), + total_benchmarks=len(self.benchmark_tasks), + ) + + def finish(self) -> None: + """ + Marks the overall report task as finished and stops the live display. + """ + logger.info("Finishing BenchmarkReportProgress") + self.report_progress.update( + self.report_task, + total=len(self.benchmark_tasks) * 100, + completed=len(self.benchmark_tasks) * 100, + completed_benchmarks=len(self.benchmark_tasks), + total_benchmarks=len(self.benchmark_tasks), + ) + self.report_progress.stop_task(self.report_task) + self.live.stop() + logger.info("BenchmarkReportProgress finished and live display stopped") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c5038c2e8235f02acca0503d773dbce9814e76 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py @@ -0,0 +1,455 @@ +import csv +import json +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse + +import ftfy +import requests +import yaml +from loguru import logger + +from guidellm.config import settings + +__all__ = [ + "clean_text", + "filter_text", + "is_path", + "is_path_like", + "is_url", + "load_text", + "load_text_lines", + "parse_text_objects", + "split_lines_by_punctuation", + "split_text", +] + + +NAME_TITLES = [ + "Mr.", + "Mrs.", + "Ms.", + "Dr.", + "Prof.", + "Jr.", + "Sr.", + "St.", + "Lt.", + "Col.", + "Gen.", + "Rep.", + "Sen.", + "Gov.", + "Pres.", +] +SENTENCE_REGEX = r'[^.!?]*[.!?]["\']?\s*(?=[A-Z])' +MAX_EXTENSION_LENGTH = 8 +MAX_PATH_LENGTH = 4096 +EXTENSION_TYPES = { + "csv": "csv", + "jsonl": "jsonl", + "json": "json", + "yaml": "yaml", + "yml": "yaml", + "txt": "txt", + "text": "txt", +} + + +def filter_text( + text: str, + filter_start: Optional[Union[str, int]] = None, + filter_end: Optional[Union[str, int]] = None, +) -> str: + """ + Filter text by start and end strings or indices + + :param text: the 
text to filter + :param filter_start: the start string or index to filter from + :param filter_end: the end string or index to filter to + :return: the filtered text + """ + filter_start_index = -1 + filter_end_index = -1 + + if filter_start and isinstance(filter_start, str): + filter_start_index = text.index(filter_start) + elif filter_start: + if not isinstance(filter_start, int): + raise ValueError(f"Invalid filter start index: {filter_start}") + filter_start_index = filter_start + + if filter_end and isinstance(filter_end, str): + filter_end_index = text.index(filter_end) + elif filter_end: + if not isinstance(filter_end, int): + raise ValueError(f"Invalid filter end index: {filter_end}") + filter_end_index = filter_end + + if filter_start_index > -1: + text = text[filter_start_index:] + if filter_end_index > -1: + text = text[:filter_end_index] + + return text + + +def clean_text( + text: str, + fix_encoding: bool = True, + clean_whitespace: bool = False, + remove_empty_lines: bool = False, + force_new_line_punctuation: bool = False, +) -> str: + """ + Clean text by fixing encoding, cleaning whitespace, removing empty lines, + and forcing new line punctuation + + :param text: the text to clean + :param fix_encoding: True to fix the encoding of the text, False to leave as is + :param clean_whitespace: True to clean the whitespace in the text + (remove extra spaces, tabs, etc), False to leave as is + :param remove_empty_lines: True to remove empty lines from the text + (lines with only whitespace), False to leave as is + :param force_new_line_punctuation: True to force new lines at punctuation + (line ends in a period, exclamation point, or question mark), + False to leave as is + :return: The cleaned text + """ + + if fix_encoding: + text = ftfy.fix_text(text) + + if clean_whitespace: + text = "\n".join( + [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()] + ) + + if remove_empty_lines: + text = "\n".join([line for line in text.splitlines() if line.strip()]) + + if force_new_line_punctuation: + # first remove any existing new lines + text = " ".join(line for line in text.splitlines() if line.strip()) + lines = split_lines_by_punctuation(text) + text = "\n".join(lines) + + return text + + +def split_lines_by_punctuation(text: str) -> List[str]: + """ + Split text into lines based on punctuation + + :param text: the text to split + :return: the list of lines + """ + + lines = [] + current_line = "" + skip_next = False + + for index, char in enumerate(text): + if skip_next: + skip_next = False + continue + + current_line += char + + if char not in [".", "!", "?"]: + # must match end of sentence punctuation + continue + + # if this is the character for a title, don't split + if any(current_line.endswith(title) for title in NAME_TITLES): + continue + + char_next_1 = text[index + 1] if index + 1 < len(text) else None + char_next_2 = text[index + 2] if index + 2 < len(text) else None + char_next_3 = text[index + 3] if index + 3 < len(text) else None + + next_is_space = char_next_1 and char_next_1.isspace() + next_is_quote_and_space = char_next_1 in ["'", '"'] and char_next_2 == " " + + # next character must be a space or a quote, otherwise skip + if not next_is_space and not next_is_quote_and_space: + continue + + # after this, next character must be an upper case letter + upper_char = char_next_3 if next_is_quote_and_space else char_next_2 + next_is_upper = upper_char and ( + upper_char.isupper() or upper_char in ["'", '"'] + ) + + if not next_is_upper: + continue + + # if 
next char is a quote, add it and skip next + if next_is_quote_and_space: + current_line += text[index + 1] + skip_next = True + + lines.append(current_line.strip()) + current_line = "" + + if current_line: + lines.append(current_line.strip()) + + return lines + + +def is_url(url: str) -> bool: + """ + Check if a string is a URL + + :param url: the string to check + :return: True if the string is a URL, False if not + """ + try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except Exception: # noqa: BLE001 + return False + + +def is_path(path: Any) -> bool: + """ + Check if a string is a path + + :param path: the string to check + :return: True if the string is a path, False if not + """ + if not isinstance(path, (str, Path)): + return False + + if isinstance(path, str): + path = Path(path) + + return path.exists() + + +def is_path_like(path: Any, enforce_file: bool = False) -> bool: + """ + Check if a string has a path like structure where it doesn't need to exist + + :param path: the string to check + :param enforce_file: True if the path should be a file, False if not + :return: True if the string is path like, False if not + """ + # if path isn't a str or Path, it's not a path + if not isinstance(path, (str, Path)): + return False + + if isinstance(path, Path): + path = str(path) + + # if text is too long, it's not a path (4096 for most linux setups) + if len(path) > MAX_PATH_LENGTH: + return False + + # if it starts with a URL scheme, it's not a path + if path.startswith(("http", "ftp")): + return False + + test_path = Path(path) + + # if it's supposed to be a file and there's no extension or + # the extension is too long, it's not a path + return not enforce_file or ( + bool(test_path.suffix) and len(test_path.suffix) <= MAX_EXTENSION_LENGTH + ) + + +def split_text(text: str) -> Tuple[List[str], List[str], List[int]]: + """ + Split text into words / tokens, the white space separators between words, + and the indices for each new line + + :param text: the text to split + :return: the words, the white space separators, and the new line indices + """ + if not text or not text.strip(): + return [], [], [] + + text = text.strip() + tokens = [] # type: List[str] + separators = [] # type: List[str] + new_lines = [0] + buffer = text[0] + is_token = not text[0].isspace() + + for char in text[1:]: + char_whitespace = char.isspace() + + if char == "\n": + new_lines.append(len(tokens) + 1) + + if char_whitespace and is_token: + tokens.append(buffer) + buffer = char + is_token = False + elif char_whitespace: + buffer += char + elif not char_whitespace and not is_token: + separators.append(buffer) + buffer = char + is_token = True + else: + buffer += char + + if buffer and is_token: + tokens.append(buffer) + separators.append(" ") + elif buffer: + separators.append(buffer) + + return tokens, separators, new_lines + + +def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str: + """ + Load an HTML file from a path or URL + + :param data: the path or URL to load the HTML file from + :type data: Union[str, Path] + :param encoding: the encoding to use when reading the file + :type encoding: str + :return: the HTML content + :rtype: str + """ + logger.debug("Loading text: {}", data) + + if not data: + return "" + + # check URLs + if isinstance(data, str) and data.startswith("http"): + response = requests.get(data, timeout=settings.request_timeout) + response.raise_for_status() + return response.text + + # check raw text + if isinstance(data, str) and not 
is_path_like(data, enforce_file=True): + return data + + # assume local file + if not isinstance(data, Path): + data = Path(data) + + if not data.exists(): + raise FileNotFoundError(f"File not found: {data}") + + if not data.is_file(): + raise IsADirectoryError(f"Path is a directory: {data}") + + return data.read_text(encoding=encoding) + + +def parse_text_objects(data: str, format_: str = "txt") -> List[Dict]: + """ + Parse text data into a list of dictionaries based on the format given + (csv, jsonl, json, yaml, txt). + + :param data: the text data to parse + :param format_: the format of the data to parse: + 'csv', 'jsonl', 'json', 'yaml', 'txt' + :return: the list of dictionaries parsed from the data, if text + then each line is a dictionary with a single key 'text' + """ + if not isinstance(data, str): + raise ValueError(f"Unsupported data given of type: {type(data)}") + + if format_ == "csv": + reader = csv.DictReader(data.splitlines()) + columns = reader.fieldnames + return [{col: row[col] for col in columns} for row in reader] # type: ignore # noqa: PGH003 + + if format_ == "jsonl": + return [json.loads(line) for line in data.splitlines() if line] + + if format_ in ("json", "yaml"): + data = json.loads(data) if format_ == "json" else yaml.safe_load(data) + + if not data: + return [] + + if isinstance(data, dict) and len(data) == 1: + logger.debug("Getting first value from JSON/YAML object: {}", data) + data = list(data.values())[0] + elif isinstance(data, dict): + logger.debug("Converting JSON/YAML object to list: {}", data) + data = list(data.values()) + + if not isinstance(data, list) or not isinstance(data[0], dict): + raise ValueError(f"Unsupported data structure given: {data}") + + return data + + if format_ == "txt": + return [{"text": line} for line in data.splitlines() if line] + + raise ValueError(f"Unsupported format given: {format_}") + + +def load_text_lines( + data: Union[str, Path, List[Dict]], + format_: Optional[str] = None, + filters: Optional[List[str]] = None, + encoding: Optional[str] = None, +) -> List[str]: + """ + Load text lines from a file or data object with optional filtering and formatting. + + + :param data: the data to load the text lines from + :param format_: the format of the data to load, if not provided will be inferred. + Supported formats: 'csv', 'jsonl', 'json', 'yaml', 'txt' + :param filters: the keys to filter the data by when loading in order of preference. + If not provided, will use the first key in the data object. + :param encoding: the encoding to use when reading the file + :return: the list of text lines + """ + logger.debug( + "Loading text lines with format {}, filters {}, encoding {} for data: {}", + format_, + filters, + encoding, + data, + ) + + if not data: + return [] + + if not format_ and isinstance(data, (str, Path)) and "." 
in str(data): + extension = str(data).split(".")[-1] + format_ = EXTENSION_TYPES.get(extension, "txt") + elif not format_: + format_ = "txt" + + # load the data if it's a path or URL + if isinstance(data, (Path, str)): + data = load_text(data, encoding=encoding) + data = clean_text(data) + + # parse the data into a list of dictionaries based on the format + if isinstance(data, str): + data = parse_text_objects(data, format_) + + if not isinstance(data, list): + raise ValueError(f"Unsupported data given of type: {type(data)}") + + if not isinstance(data[0], dict): + raise ValueError(f"Unsupported data item type given: {type(data[0])}") + + # grab the first available filter key to use if preference order as provided + filter_ = list(data[0].keys())[0] + for filt in filters or []: + if filt not in data[0]: + continue + + filter_ = filt + break + + # extract the lines from the data + return [row[filter_] for row in data] if filter_ else [str(row) for row in data] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..540572994eb692ddcaeced0055feb6a1c932f7f2 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py @@ -0,0 +1,151 @@ +from pathlib import Path +from typing import List, Optional, Union + +from datasets import ( # type: ignore # noqa: PGH003 + Dataset, + DatasetDict, + IterableDataset, + IterableDatasetDict, + load_dataset, +) +from loguru import logger + +from guidellm.config import settings + +__all__ = [ + "load_transformers_dataset", + "resolve_transformers_dataset", + "resolve_transformers_dataset_column", + "resolve_transformers_dataset_split", +] + + +def load_transformers_dataset( + dataset: Union[ + str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset + ], + split: Optional[str] = None, + preferred_splits: Optional[List[str]] = settings.dataset.preferred_data_splits, + **kwargs, +) -> Union[Dataset, IterableDataset]: + """ + Load a dataset from a file or a script and resolve the preferred split. + + :param dataset: the dataset file or script to load + :param split: the dataset split to use + (overrides preferred_splits, must be in dataset) + :param preferred_splits: the preferred dataset splits to use + :param kwargs: additional keyword arguments to pass to the dataset loader + :return: the loaded dataset + """ + dataset = resolve_transformers_dataset(dataset, **kwargs) + + return resolve_transformers_dataset_split(dataset, split, preferred_splits) + + +def resolve_transformers_dataset( + dataset: Union[ + str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset + ], + **kwargs, +) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]: + """ + Resolve the dataset from a file (csv, json, script) or a dataset name. 
+ + :param dataset: the dataset file or script to load + :param kwargs: additional keyword arguments to pass to the dataset loader + :return: the loaded dataset + """ + if isinstance( + dataset, (DatasetDict, Dataset, IterableDatasetDict, IterableDataset) + ): + return dataset + + if not isinstance(dataset, (str, Path)): + raise ValueError(f"Invalid dataset type: {type(dataset)}") + + dataset = str(dataset) + + if dataset.endswith((".csv", ".json")): + logger.debug("Loading dataset from local path: {}", dataset) + extension = dataset.split(".")[-1] + + return load_dataset(extension, data_files=dataset, **kwargs) + + if dataset.endswith(".py"): + logger.debug("Loading dataset from local script: {}", dataset) + + return load_dataset(dataset, **kwargs) + + logger.debug("Loading dataset: {}", dataset) + + return load_dataset(dataset, **kwargs) + + +def resolve_transformers_dataset_split( + dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], + split: Optional[str] = None, + preferred_splits: Optional[List[str]] = settings.dataset.preferred_data_splits, +) -> Union[Dataset, IterableDataset]: + """ + Resolve the preferred split from a dataset dictionary. + + :param dataset: the dataset to resolve the split from + :param split: the dataset split to use + (overrides preferred_splits, must be in dataset) + :param preferred_splits: the preferred dataset splits to use + :return: the resolved dataset split + """ + if not isinstance(dataset, (DatasetDict, IterableDatasetDict)): + logger.debug("Dataset is not a dictionary, using default split") + return dataset + + if split: + if split not in dataset: + raise ValueError(f"Split '{split}' not found in dataset") + + return dataset[split] + + if preferred_splits: + for spl in preferred_splits: + if spl not in dataset: + continue + return dataset[spl] + + return list(dataset.values())[0] + + +def resolve_transformers_dataset_column( + dataset: Union[Dataset, IterableDataset], + column: Optional[str] = None, + preferred_columns: Optional[List[str]] = settings.dataset.preferred_data_columns, +) -> str: + """ + Resolve the preferred column from a dataset. 
+ + :param dataset: the dataset to resolve the column from + :param column: the dataset column to use + (overrides preferred_columns, must be in dataset) + :param preferred_columns: the preferred dataset columns to use + :return: the resolved dataset column + """ + column_names = dataset.column_names + + if not column_names: + # grab from the first item + first_item = next(iter(dataset)) + column_names = list(first_item.keys()) + + if column: + if column not in column_names: + raise ValueError(f"Column '{column}' not found in dataset") + + return column + + if preferred_columns: + for col in preferred_columns: + if col not in column_names: + continue + return col + + return list(column_names)[0] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py new file mode 100644 index 0000000000000000000000000000000000000000..74000dd8d3acdde3a539c1efb01f1de9b640f9db --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py @@ -0,0 +1,79 @@ +import os +import re +from datetime import datetime +from pathlib import Path + +import toml +from loguru import logger + + +def get_build_type(): + return os.getenv("GUIDELLM_BUILD_TYPE", "dev") + + +def get_build_number(): + return os.getenv("GUIDELLM_BUILD_NUMBER", "0") + + +def construct_project_name_and_version(build_type, build_number, current_version): + if not re.match(r"^\d+\.\d+\.\d+$", current_version): + raise ValueError( + f"Version '{current_version}' does not match the " + f"semantic versioning pattern '#.#.#'", + ) + + if build_type == "dev": + project_name = "guidellm_dev" + version = f"{current_version}.dev{build_number}" + elif build_type == "nightly": + project_name = "guidellm_nightly" + date_str = datetime.now().strftime("%Y%m%d") + version = f"{current_version}.{date_str}" + elif build_type == "release": + project_name = "guidellm" + version = current_version + else: + raise ValueError(f"Unknown build type: {build_type}") + + return project_name, version + + +def update_pyproject_toml(project_name, version): + try: + with Path("pyproject.toml").open() as file: + data = toml.load(file) + + data["project"]["name"] = project_name + data["project"]["version"] = version + + with Path("pyproject.toml").open("w") as file: + toml.dump(data, file) + + logger.info( + f"Updated project name to: {project_name} and version to: {version}", + ) + except (FileNotFoundError, toml.TomlDecodeError) as e: + logger.error(f"Error reading or writing pyproject.toml: {e}") + raise + + +def main(): + build_type = get_build_type() + build_number = get_build_number() + + with Path("pyproject.toml").open() as file: + pyproject_data = toml.load(file) + + current_version = pyproject_data["project"]["version"] + project_name, version = construct_project_name_and_version( + build_type, + build_number, + current_version, + ) + + if build_type != "release": + update_pyproject_toml(project_name, version) + + +if __name__ == "__main__": + main() diff --git a/models/multimodal/vision_language_model/step3/vllm/README.md b/models/multimodal/vision_language_model/step3/vllm/README.md index 
ce1df9df1d573642f834fbb7e3a0c1732d34e627..11266ddc300f18172ed7b6027ac787e5fce02820 100644 --- a/models/multimodal/vision_language_model/step3/vllm/README.md +++ b/models/multimodal/vision_language_model/step3/vllm/README.md @@ -9,6 +9,7 @@ Step3 is cutting-edge multimodal reasoning model—built on a Mixture-of-Experts | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | | :----: | :----: | :----: | | MR-V100 | dev-only | 25.12 | +| MR-V100 | 4.4.0 | 26.03 | ## Model Preparation @@ -33,6 +34,51 @@ pip3 install -r requirements.txt ## Model Inference +### Inference with W4A8 + +#### Performance Test + +1. Set environment variables: +```bash +export VLLM_W8A8_MOE_USE_W4A8=1 +export VLLM_ENFORCE_CUDA_GRAPH=1 +``` + +2. Start server: +```bash +vllm serve /path/to/model --limit-mm-per-prompt '{"image":5}' --gpu-memory-utilization 0.92 --port 12347 --trust-remote-code --disable-cascade-attn --no-enable-prefix-caching --max-model-len 65536 --tensor-parallel-size 4 --pipeline-parallel-size 4 --max-num-seqs 1024 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' +``` + +3. Run client (Input1024, Output1024, BS10): +```bash +vllm bench serve --num-prompts 4*[max-concurrency] --model /path/to/model --dataset-name random --random-input-len 1024 --random-output-len 1024 --max-concurrency 10 --host 0.0.0.0 --port 12347 --disable-tqdm --ignore-eos +``` + +#### Accuracy Test + +4. The evaluation scripts are already included in this directory: +```bash +# eval_dataset.py and eval_dataset_w8a8.py are in the current directory +pip install fire +``` + +5. Set environment variables: +```bash +export VLLM_W8A8_MOE_USE_W4A8=1 +export VLLM_ENFORCE_CUDA_GRAPH=1 +``` + +6. Start server: +```bash +vllm serve /path/to/model --limit-mm-per-prompt '{"image":5}' --gpu-memory-utilization 0.92 --port 12347 --trust-remote-code --disable-cascade-attn --no-enable-prefix-caching --max-model-len 65536 --tensor-parallel-size 4 --pipeline-parallel-size 4 --max-num-seqs 1024 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' +``` + +7. Run client (MMMU dataset): +```bash +pip install fire +python3 eval_dataset.py --dataset_name MMMU_BETA --model /path/to/model --ip 127.0.0.1 --port 12347 --num_workers 8 +``` + ### Inference with w8a8 #### Starting w8a8 server ```bash diff --git a/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md b/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md index b59e7d8aa673153fe16a725047904f7c035453e6..8b8a596f303b60acd61cbcc50dce7e1ecfd2cd78 100644 --- a/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md +++ b/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md @@ -10,6 +10,7 @@ based on Qwen2.5 and Llama3 series to the community. | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | | :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.03 | | MR-V100 | 4.3.0 | 25.09 | | MR-V100 | 4.2.0 | 25.03 | @@ -49,6 +50,31 @@ python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-7B --max-to vllm serve data/DeepSeek-R1-Distill-Qwen-7B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code ``` +### Inference with W4A8 + +#### Performance Test + +1. Use the pre-copied ``llm-benchmark``: +```bash +cd ../../llm-benchmark +pip3 install -r requirements.txt +``` + +2. 
Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server (DeepSeek-R1-Distill-Qwen-7B BF16):
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=1 --max-model-len 20480 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input2048, Output1024, BS8):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 8 --random-input 2048 --max-concurrency 8 --tokenize-prompt --random-range-ratio 1 --random-output 1024
+```
+
 ## Model Results
 
 ### Benchmarking vLLM
diff --git a/models/nlp/llm/deepseek-v3.1/vllm/README.md b/models/nlp/llm/deepseek-v3.1/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..17d6d265bab98fefed187c0bbd99c19671058566
--- /dev/null
+++ b/models/nlp/llm/deepseek-v3.1/vllm/README.md
@@ -0,0 +1,79 @@
+# DeepSeek-V3.1 (vLLM)
+
+## Model Description
+
+DeepSeek-V3 is a powerful Mixture-of-Experts (MoE) language model with 671B total parameters and 37B activated parameters. It achieves excellent performance on math, code, and reasoning tasks, comparable to leading models like GPT-4 and Claude-3.5.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_W8A8_MOE_USE_W4A8=1
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_PP_LAYER_PARTITION="16,16,16,13"
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=4 --tensor-parallel-size=4 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS8):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 8 --random-input 128 --max-concurrency 8 --tokenize-prompt --random-range-ratio 1 --random-output 128
+```
+
+#### Accuracy Test
+
+5. Install evalscope:
+```bash
+pip3 install 'evalscope[app,perf]' -U
+```
+
+6. Set environment variables:
+```bash
+export VLLM_USE_MODELSCOPE=True
+```
+
+7. Start server:
+```bash
+vllm serve /path/to/model --max-num-seqs 4 --max-model-len 95600 --served-model-name DeepSeek-v3.1-int4-pack8 --trust-remote-code --disable-cascade-attn --tensor-parallel-size 8 --pipeline-parallel-size 2 --compilation-config '{"level":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' --port 9989
+```
+
+8. Run client (MATH-500 dataset):
+```bash
+evalscope eval --model DeepSeek-v3.1-int4-pack8 --dataset-args '{"math_500": {"few_shot_num": 0}}' --generation-config '{"do_sample": true, "temperature": 0.6, "max_tokens": 32768, "n": 1, "top_p": 0.95}' --datasets math_500 --eval-type openai_api --eval-batch-size 4 --api-url http://127.0.0.1:9989/v1 --timeout 12000000 --api-key EMPTY
+```
+
+## References
+
+- [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/llm-benchmark/README.md b/models/nlp/llm/llm-benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..33aa05bf03d0c21fc64da72b190b64c586ac98a8
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/README.md
@@ -0,0 +1,308 @@
+# Installation
+
+```bash
+# install dependencies with pip
+pip3 install -r requirements.txt
+```
+
+# Accuracy Evaluation
+
+## Quick evaluation
+
+Assuming the model is served with SGLang at IP 127.0.0.1 and port 30000, run the `eval` command from any directory to evaluate a DeepSeek model on the chosen datasets with the default configuration:
+```bash
+./iluvatar_bench eval \
+    --model /data/DeepSeek-R1-AWQ \
+    --datasets gsm8k \
+    --limit 4 \
+    --eval-batch-size 8
+```
+
+### Basic parameters
+
+- `--model`: the model_id on ModelScope (downloaded automatically), or a local path such as /path/to/model.
+- `--datasets`: dataset names; multiple datasets can be given, separated by spaces, and are downloaded from ModelScope automatically. See the [supported dataset list](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/index.html).
+- `--limit`: the maximum number of samples to evaluate per dataset subset; if omitted, the full dataset is evaluated. Useful for quick validation.
+- `--eval-batch-size`: evaluation batch size, i.e. the number of concurrent requests; default 1.
+
+
+## Evaluating a model API service
+
+To evaluate a deployed model API service, set the service address (api_url) and API key (api_key) and set the eval-type parameter to server. The defaults are:
+```bash
+--api-key='EMPTY' \
+--api-url='http://127.0.0.1:30000/v1' \
+--eval-type='server'
+```
+
+# Performance Test 1
+
+## Basic usage
+
+The example below stress-tests the DeepSeek-R1-AWQ model served with SGLang on Bi150, with the input fixed at 1024 tokens and the output at 1024 tokens. Adjust the parameters to your needs.
+
+```bash
+./iluvatar_bench perf \
+    --parallel 1 10 50 100 200 \
+    --number 10 20 100 200 400 \
+    --model /data/DeepSeek-R1-AWQ \
+    --url http://127.0.0.1:30000/v1/completions \
+    --api openai \
+    --dataset random \
+    --max-tokens 1024 \
+    --min-tokens 1024 \
+    --prefix-length 0 \
+    --min-prompt-length 1024 \
+    --max-prompt-length 1024 \
+    --tokenizer-path /data/DeepSeek-R1-AWQ \
+    --extra-args '{"ignore_eos": true}'
+```
+
+### Parameters
+
+- `parallel`: number of concurrent requests; multiple values can be given, separated by spaces.
+- `number`: total number of requests to send; multiple values can be given, separated by spaces (paired one-to-one with `parallel`).
+- `url`: the request URL.
+- `model`: the model name to use.
+- `api`: the API flavor to use; default `openai`.
+- `dataset`: dataset name; `random` here means a randomly generated dataset. More available (multimodal) datasets are described in the dataset configuration docs.
+- `tokenizer-path`: path to the model tokenizer, used to count tokens (required for the random dataset).
+- `extra-args`: extra request parameters as a JSON string, e.g. `{"ignore_eos": true}` to ignore the end-of-sequence token.
+
+**Default parameters**
+
+The following parameters have these defaults:
+```bash
+--max-tokens=1024,
+--min-tokens=1024,
+--min-prompt-length=1024,
+--max-prompt-length=1024,
+--api='openai',
+--url='http://127.0.0.1:30000/v1/completions'
+```
+
+`max-tokens` and `min-tokens` are the maximum and minimum generation lengths.
+`max-prompt-length` and `min-prompt-length` are the maximum and minimum prompt lengths.
+
+# Performance Test 2
+
+This mode uses SGLang's own bench_serving.py (originally from vLLM) to measure online serving throughput and latency.
+
+bench_serving.py file info:
+```bash
+git log --oneline -- ./python/sglang/bench_serving.py
+88a6f9dab bench_serving support PD Disaggregation (#11542)
+```
+
+## Basic usage
+
+Note: the `./iluvatar_bench sgl-perf` command is equivalent to `python3 bench_serving.py`; either can be run.
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --num-prompts 1000
+```
+If the model name or path is not set, the client asks /v1/models for the default model configuration.
+
+## Common parameters
+
+* `--backend backend`: the backend, e.g. sglang or vllm.
+* `--model`: model name or path.
+* Connection parameters: `--host` and `--port`, or `--base-url`.
+* `--dataset-name`: sharegpt, random, random-ids, generated-shared-prefix, etc.; each dataset has its own configuration parameters.
+* `--request-rate`: requests arriving per second (default inf, meaning all requests arrive at once). Arrival times are modeled with a **Poisson process**: the gaps between requests are random, but the average rate matches the configured value, which is a more realistic model of randomly arriving user traffic. For example, to send 6 requests every 3.5 seconds, `Request rate = 6 requests / 3.5 seconds ≈ 1.71 requests/second` (see the sketch after this list).
+* `--request-interval`: a fixed interval in seconds. If set, it overrides `--request-rate` and uses deterministic (fixed-time) interval scheduling.
+* `--max-concurrency`: maximum number of concurrent requests, i.e. the number of workers actually processing requests. While `--request-rate` controls how fast requests are issued, this parameter caps how many are allowed to execute at the same time.
+* `--warmup-requests`: number of warmup requests before the benchmark.
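+For intuition, the arrival schedule implied by `--request-rate` can be reproduced in a few lines. This is a minimal sketch of the idea (`poisson_arrival_times` is an illustrative helper, not the benchmark's actual implementation):
+
+```python
+import random
+
+def poisson_arrival_times(request_rate: float, num_requests: int, seed: int = 0):
+    """Yield absolute send times (in seconds) for a Poisson arrival process."""
+    rng = random.Random(seed)
+    t = 0.0
+    for _ in range(num_requests):
+        # exponential inter-arrival gaps give Poisson arrivals at `request_rate` req/s
+        t += rng.expovariate(request_rate)
+        yield t
+
+# the 6-requests-per-3.5-seconds example above: ~1.71 req/s on average
+print(list(poisson_arrival_times(6 / 3.5, 6)))
+```
+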
+## The sharegpt dataset
+
+`sharegpt` is a real conversation dataset (the default). Its parameters:
+* `--num-prompts`: total number of requests.
+* `--sharegpt-output-len`: output length; if not specified, it is determined by the sample lengths in the dataset.
+* `--sharegpt-context-len`: the overall context length; when set, a request is skipped if `input + output > maximum context length`.
+
+In short, the input length cannot be specified; the output length and the maximum context length can.
+
+Note that a request is also skipped when:
+* `prompt_len < 2` or `output_len < 2`
+
+## The random / random-ids datasets
+
+* `random`: **real text** from the ShareGPT dataset. A random target input length is chosen (say 500 tokens), then a real prompt is randomly selected from ShareGPT. If the prompt is too long (say 1000 tokens) it is truncated to 500 tokens; if it is too short (say 100 tokens) its tokens are repeated until the 500 tokens are filled. Use this to simulate a random-length workload whose prompt content is **real natural language**.
+* `random-ids`: **fully random token IDs**. A random target input length is chosen (say 500 tokens); no external dataset is loaded, and 500 token IDs are drawn directly at random from the tokenizer vocabulary. The resulting text has **no linguistic meaning** (i.e. "gibberish"). Use this to simulate a random-length workload of random, meaningless data; it is useful for stress-testing the tokenizer and the model's handling of unusual input.
+
+The parameters:
+* `--num-prompts`: total number of requests to process.
+* `--random-input-len` (default: 1024): maximum input token length per request. The script samples a length from `[random-input-len * random-range-ratio, random-input-len + 1)`.
+* `--random-output-len` (default: 1024): maximum output token length per request. The script samples a length from `[random-output-len * random-range-ratio, random-output-len + 1)`.
+* `--random-range-ratio` (default: 0.0): a float between 0.0 and 1.0 that sets the lower bound of the random lengths. Set it to 1.0 to pin the input/output length to exactly 1024.
+* `--tokenize-prompt`: mainly for the `random` and `random-ids` datasets, so that with the `sglang` and `vllm` backends the benchmark sends token-ID lists of the exact length. For example, the client builds a list of 1024 token IDs, skips the decoding step, and sends the integer ID list directly to the server; the server in turn skips tokenization and uses the list as-is. The benefit is that the input the server processes is **exactly** the 1024 tokens we asked for.
+
+The min/max input and output lengths are computed as follows; skip the "formula" and "example" below if you do not need them.
+
+Formula:
+* `actual input length = [random-input-len * random-range-ratio, random-input-len + 1)`
+* `actual output length = [random-output-len * random-range-ratio, random-output-len + 1)`
+
+Example:
+```bash
+--dataset-name random \
+--random-input-len 1024 \
+--random-output-len 1024 \
+--random-range-ratio 0.8
+```
+The input/output lengths are then drawn from the interval `[819, 1025) = [1024 * 0.8, 1024+1)`, so a request might end up with lengths `833/955`.
+To pin the input/output length to exactly 1024, set `--random-range-ratio` to 1.0.
+
+## The generated-shared-prefix dataset
+
+Unlike `sharegpt`, `generated-shared-prefix` is not a static dataset loaded from an external file; it is generated on the fly.
+
+Its core purpose is to simulate a very common and important LLM serving scenario: a large number of requests sharing one long prefix. This is typical of multi-tenant deployments, RAG (retrieval-augmented generation), and applications with a complex system prompt.
+
+Each generated request has two parts (see the workload sketch after the parameter list below):
+
+1. **A shared system prompt**: a long block of text shared within a group.
+2. **A unique question**: a shorter block of text unique to each request.
+
+The full prompt is constructed roughly as `"{system_prompt}\n\n{question}"`.
+
+The parameters:
+* `--gsp-num-groups` (default: 64): the number of **unique system prompts** to generate, i.e. how many "shared prefix" groups the benchmark has.
+* `--gsp-prompts-per-group` (default: 16): how many unique requests (questions) each group (each system prompt) contains. This determines how many times each shared prefix is reused. The total request count is `gsp-num-groups * gsp-prompts-per-group`.
+* `--gsp-system-prompt-len` (default: 2048): the target token length of each generated system prompt, simulating a long prefix (e.g. a complex instruction set or a large context document).
+* `--gsp-question-len` (default: 128): the target token length of each generated unique question, simulating the non-shared part of the prompt.
+* `--gsp-output-len` (default: 256): the target number of output tokens per request, i.e. how much the model should generate after receiving `system_prompt + question`.
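+As a rough illustration of the workload shape (character placeholders stand in for token-length targets; `build_gsp_prompts` is a hypothetical helper, not the generator's real code):
+
+```python
+def build_gsp_prompts(num_groups: int, prompts_per_group: int) -> list[str]:
+    prompts = []
+    for g in range(num_groups):
+        system_prompt = f"<shared system prompt for group {g}>"  # ~gsp-system-prompt-len tokens
+        for q in range(prompts_per_group):
+            question = f"<unique question {q}>"  # ~gsp-question-len tokens
+            # every request in the group repeats the same long prefix
+            prompts.append(f"{system_prompt}\n\n{question}")
+    return prompts
+
+assert len(build_gsp_prompts(64, 16)) == 1024  # gsp-num-groups * gsp-prompts-per-group
+```
+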
+## Examples
+
+1. `sharegpt` (simulating real conversations)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name sharegpt \
+    --num-prompts 1000
+```
+
+2. `random` (synthetic load with specific lengths)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --num-prompts 1000 \
+    --random-input 2048 \
+    --random-output 128 \
+    --random-range-ratio 0.5
+```
+This command runs 1000 requests. Each request's input length is random between `(2048 * 0.5)` and `2048` (i.e. 1024 to 2048 tokens), and its output length is random between `(128 * 0.5)` and `128` (i.e. 64 to 128 tokens). The prompt content is padded from ShareGPT text.
+
+3. `random-ids` (a pure stress test)
+
+The most extreme stress test. It does not care about the linguistic meaning of the prompt; it simply generates fully random token IDs to fill the requested input length.
+It can be combined with `--tokenize-prompt` to send ID lists like `[1024, 512, 300, ...]` instead of decoded gibberish strings. This gives **100% precise control over the input length** and is the best way to measure raw hardware and system throughput.
+
+```bash
+# stress test: 1000 requests, each with *exactly* 1024 input IDs
+# and 1024 requested output IDs
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random-ids \
+    --num-prompts 1000 \
+    --random-input-len 1024 \
+    --random-output-len 1024 \
+    --random-range-ratio 1.0 \
+    --tokenize-prompt
+```
+`--random-range-ratio 1.0` keeps the input/output lengths from being randomized: they are exactly 1024. `--tokenize-prompt` makes the client send an `input_ids` list instead of `text`. This command measures the server's raw performance on "1024-in, 1024-out" requests.
+
+4. Rate control + output file
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --random-input-len 1024 --random-output-len 1024 --random-range-ratio 1.0 \
+    --num-prompts 2000 \
+    --request-rate 100 \
+    --max-concurrency 512 \
+    --output-file sglang_random.jsonl --output-details
+```
+
+5. `generated-shared-prefix` (testing KV cache performance)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name generated-shared-prefix \
+    --gsp-num-groups 64 --gsp-prompts-per-group 16 \
+    --gsp-system-prompt-len 4096 --gsp-question-len 128 --gsp-output-len 256 \
+    --num-prompts 1024
+```
+This command generates `64 * 16 = 1024` total requests: 64 distinct 4096-token "system prompts" (shared prefixes), and, for each system prompt, 16 distinct 128-token "questions". While serving these 1024 (shuffled) requests, a server with an efficient KV cache should pay the cost of processing each 4096-token prefix only 64 times rather than 1024 times.
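+To put a number on that, here is a back-of-the-envelope prefill-token count for this exact configuration (illustrative arithmetic only; it assumes perfect prefix reuse and ignores scheduling effects):
+
+```python
+num_groups, prompts_per_group = 64, 16
+prefix_len, question_len = 4096, 128
+total = num_groups * prompts_per_group  # 1024 requests
+
+no_cache = total * (prefix_len + question_len)                # 4,325,376 prefill tokens
+ideal_cache = num_groups * prefix_len + total * question_len  # 393,216 prefill tokens
+print(no_cache / ideal_cache)                                 # 11.0x fewer with perfect reuse
+```
+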
+
+## random / random-ids Datasets
+
+* `random`: **real text** taken from the ShareGPT dataset. A random target input length is chosen (say 500 tokens), then a real prompt is picked at random from ShareGPT. If the prompt is too long (e.g. 1000 tokens) it is truncated to 500 tokens; if it is too short (e.g. 100 tokens) its tokens are repeated until the 500-token target is filled. This simulates a random-length workload whose prompt contents are **real natural language**.
+* `random-ids`: **completely random token IDs**. A random target input length is chosen first (say 500 tokens); no external dataset is loaded. Instead, 500 token IDs are drawn at random from the tokenizer's vocabulary. The resulting text has **no linguistic meaning** (i.e. gibberish). This simulates a random-length workload of random, meaningless data, which is useful for stress-testing the tokenizer and the model's handling of unusual input.
+
+Related options:
+
+* `--num-prompts`: total number of requests to process.
+* `--random-input-len` (default: 1024): maximum input token length per request. A length is sampled from `[random-input-len * random-range-ratio, random-input-len + 1)`.
+* `--random-output-len` (default: 1024): maximum output token length per request. A length is sampled from `[random-output-len * random-range-ratio, random-output-len + 1)`.
+* `--random-range-ratio` (default: 0.0): a float between 0.0 and 1.0 that sets the lower bound of the random lengths. To pin the input/output lengths to exactly 1024, set it to 1.0.
+* `--tokenize-prompt`: mainly for the `random` and `random-ids` datasets with the `sglang` and `vllm` backends; it benchmarks by sending a token-ID list of exact length. For example, the client builds a list of 1024 IDs, skips detokenization, and sends the integer-ID list to the server directly. The server, in turn, skips tokenization and uses the list as-is. The benefit: the input the server processes is **exactly** the 1024 tokens we asked for.
+
+The following shows how the minimum/maximum input/output lengths are computed; if you do not need this, skip the "formulas" and "worked example" below.
+
+Formulas:
+* `actual input length ∈ [random-input-len * random-range-ratio, random-input-len + 1)`
+* `actual output length ∈ [random-output-len * random-range-ratio, random-output-len + 1)`
+
+Worked example:
+```bash
+--dataset-name random \
+--random-input-len 1024 \
+--random-output-len 1024 \
+--random-range-ratio 0.8
+```
+The input/output lengths are then drawn from the interval `[819, 1025) = [1024 * 0.8, 1024 + 1)`, so a given request might end up with input/output lengths of, say, `833/955`.
+To fix the input/output lengths at exactly 1024, set `--random-range-ratio` to 1.0.
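+
+To make the two rules concrete, here is a minimal sketch (illustrative names and payload shape, not the benchmark's actual code) of how a `random-ids` request with `--tokenize-prompt` could be constructed:
+
+```python
+import random
+
+def build_random_ids_request(input_len: int, output_len: int,
+                             range_ratio: float, vocab_size: int) -> dict:
+    # Sample lengths from [len * range_ratio, len + 1), as documented above.
+    in_len = random.randint(int(input_len * range_ratio), input_len)
+    out_len = random.randint(int(output_len * range_ratio), output_len)
+    # Draw token IDs uniformly from the tokenizer vocabulary: gibberish by
+    # construction, but with an exactly controlled length.
+    input_ids = [random.randrange(vocab_size) for _ in range(in_len)]
+    # With --tokenize-prompt the payload carries IDs instead of text, so the
+    # server skips tokenization and sees exactly in_len input tokens.
+    return {"input_ids": input_ids, "max_tokens": out_len}
+
+req = build_random_ids_request(1024, 1024, 1.0, vocab_size=32000)
+print(len(req["input_ids"]), req["max_tokens"])  # 1024 1024
+```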
+
+## generated-shared-prefix Dataset
+
+Unlike `sharegpt`, which is loaded from an external file, `generated-shared-prefix` is not a static dataset: it is generated on the fly.
+
+Its core purpose is to simulate a very important and common LLM serving scenario: a large number of requests sharing one long prefix. This typically arises in multi-tenant services, RAG (retrieval-augmented generation), or applications with elaborate system prompts.
+
+Each generated request consists of two parts:
+
+1. **A shared system prompt**: a long block of text shared within a group.
+2. **A unique question**: a shorter block of text unique to each request.
+
+The full prompt is constructed roughly as `"{system_prompt}\n\n{question}"`.
+
+Related options (a construction sketch follows this list):
+
+* `--gsp-num-groups` (default: 64): the number of **unique system prompts** to generate, i.e. how many "shared-prefix" groups the benchmark contains.
+* `--gsp-prompts-per-group` (default: 16): how many unique requests (questions) each group (i.e. each system prompt) contains. This determines how often each shared prefix is reused. The total number of requests is `gsp-num-groups * gsp-prompts-per-group`.
+* `--gsp-system-prompt-len` (default: 2048): target token length of each generated system prompt. This models a very long prefix (e.g. a complex instruction set or a large context document).
+* `--gsp-question-len` (default: 128): target token length of each generated unique question. This models the non-shared, user-supplied part of the prompt.
+* `--gsp-output-len` (default: 256): target number of output tokens per request, i.e. how much the model should generate after receiving `system_prompt + question`.
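+
+A minimal sketch of how such a dataset could be assembled. The `random_text` helper is a hypothetical stand-in (the real generator targets token lengths, not word counts), and the structure, not the exact code, is the point:
+
+```python
+import random
+import string
+
+def random_text(num_words: int) -> str:
+    """Hypothetical filler-text helper standing in for length-targeted generation."""
+    return " ".join("".join(random.choices(string.ascii_lowercase, k=5))
+                    for _ in range(num_words))
+
+def build_gsp_dataset(num_groups: int = 64, prompts_per_group: int = 16):
+    prompts = []
+    for _ in range(num_groups):
+        system_prompt = random_text(2048)        # shared within the group
+        for _ in range(prompts_per_group):
+            question = random_text(128)          # unique per request
+            prompts.append(f"{system_prompt}\n\n{question}")
+    random.shuffle(prompts)                      # requests arrive interleaved
+    return prompts
+
+dataset = build_gsp_dataset()
+print(len(dataset))  # 64 * 16 = 1024 requests
+```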
+
+## Examples
+
+1. `sharegpt` (simulating real conversations)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name sharegpt \
+    --num-prompts 1000
+```
+
+2. `random` (simulating a synthetic workload with specific lengths)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --num-prompts 1000 \
+    --random-input-len 2048 \
+    --random-output-len 128 \
+    --random-range-ratio 0.5
+```
+This command tests 1000 requests. Each request's input length is drawn between `(2048 * 0.5)` and `2048` (i.e. 1024 to 2048 tokens), and the output length between `(128 * 0.5)` and `128` (i.e. 64 to 128 tokens). The prompt contents are padded from real ShareGPT text.
+
+3. `random-ids` (pure stress test)
+
+The most extreme stress test. It does not care about the linguistic meaning of the prompt; it simply generates completely random token IDs to fill the requested input length.
+It can be combined with `--tokenize-prompt` to send ID lists such as `[1024, 512, 300, ...]` instead of decoded gibberish strings. This gives **100% precise control over the input length** and is the best way to measure raw hardware and system throughput.
+
+```bash
+# Stress test: 1000 requests, each with *exactly* 1024 input IDs
+# and 1024 requested output IDs
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random-ids \
+    --num-prompts 1000 \
+    --random-input-len 1024 \
+    --random-output-len 1024 \
+    --random-range-ratio 1.0 \
+    --tokenize-prompt
+```
+`--random-range-ratio 1.0` ensures the input/output lengths are not randomized but exactly 1024. `--tokenize-prompt` ensures the client sends an `input_ids` list rather than `text`. This command measures the server's raw performance on "1024-in, 1024-out" requests.
+
+4. Rate control + output file
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --random-input-len 1024 --random-output-len 1024 --random-range-ratio 1.0 \
+    --num-prompts 2000 \
+    --request-rate 100 \
+    --max-concurrency 512 \
+    --output-file sglang_random.jsonl --output-details
+```
+
+5. `generated-shared-prefix` (testing KV cache performance)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name generated-shared-prefix \
+    --gsp-num-groups 64 --gsp-prompts-per-group 16 \
+    --gsp-system-prompt-len 4096 --gsp-question-len 128 --gsp-output-len 256 \
+    --num-prompts 1024
+```
+This command generates `64 * 16 = 1024` total requests. It creates 64 distinct "system prompts" (shared prefixes), each 4096 tokens long.
+
+Then, for each system prompt, it generates 16 distinct 128-token "questions". When the server processes these 1024 (shuffled) requests, an efficient KV cache should pay the cost of the 4096-token prefix only 64 times, not 1024 times.
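+
+An idealized back-of-the-envelope check of that claim (it ignores scheduling effects and partial prefix hits):
+
+```python
+groups, per_group = 64, 16
+sys_len, q_len = 4096, 128
+
+total = groups * per_group                       # 1024 requests
+no_reuse = total * (sys_len + q_len)             # every request prefills the full prefix
+ideal_reuse = groups * sys_len + total * q_len   # each prefix prefilled once per group
+
+print(total, no_reuse, ideal_reuse)              # 1024 4325376 393216
+print(f"prefill reduction: {no_reuse / ideal_reuse:.1f}x")  # 11.0x
+```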
+
+## PD Disaggregation Mode Performance Profiling
+
+**Key options**
+
+* `--pd-separated`: enable PD (prefill/decode disaggregation) mode.
+* `--profile-prefill-url`: URL(s) of the prefill worker(s) to profile.
+* `--profile-decode-url`: URL(s) of the decode worker(s) to profile.
+
+`--profile-prefill-url` and `--profile-decode-url` are mutually exclusive: set only one of them.
+
+Start server
+```bash
+# set trace path
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# start prefill and decode servers (see PD disaggregation docs for setup)
+python3 -m sglang.launch_server --model-path /home/data/qwen3/Qwen3-32B --disaggregation-mode prefill
+python3 -m sglang.launch_server --model-path /home/data/qwen3/Qwen3-32B --disaggregation-mode decode --port 30001 --base-gpu-id 1
+
+# start router
+python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
+```
+
+Profile Prefill Workers
+```bash
+# send profiling request targeting prefill workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000
+```
+
+Profile Decode Workers
+```bash
+# send profiling request targeting decode workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001
+```
+
+Note:
+* Both options accept multiple worker URLs for multi-instance setups:
+```bash
+# Profile multiple prefill workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000 http://127.0.0.1:30002
+
+# Profile multiple decode workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001 http://127.0.0.1:30003
+```
+
+## References
+
+- [evalscope](https://github.com/modelscope/evalscope)
\ No newline at end of file
diff --git a/models/nlp/llm/llm-benchmark/iluvatar_bench b/models/nlp/llm/llm-benchmark/iluvatar_bench
new file mode 100644
index 0000000000000000000000000000000000000000..fa6ab398bea7a1006c26b2acb6e0e5560b55fe05
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/iluvatar_bench
@@ -0,0 +1,181 @@
+#!/usr/local/bin/python3
+
+import argparse
+import sys
+from argparse import ArgumentParser
+
+from evalscope import __version__
+from evalscope.cli.base import CLICommand
+
+# bench_serving.py ships alongside this script; fall back to None so the
+# other subcommands keep working when it is missing.
+try:
+    from bench_serving import define_sgl_bench_args, run_benchmark
+except ImportError:
+    define_sgl_bench_args = None
+    run_benchmark = None
+
+
+class PerfBenchCMD(CLICommand):
+    name = 'perf'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        This method creates an instance of PerfBenchCMD from parsed arguments.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the perf command."""
+        from evalscope.perf.arguments import add_argument
+
+        parser = parsers.add_parser(PerfBenchCMD.name)
+        add_argument(parser)
+        parser.set_defaults(
+            max_tokens=1024,
+            min_tokens=1024,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            api='openai',
+            url='http://127.0.0.1:30000/v1/completions'
+        )
+        parser.set_defaults(func=PerfBenchCMD.subparser_func)
+
+    def execute(self):
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
+
+        run_perf_benchmark(self.args)
+
+
+class EvalCMD(CLICommand):
+    name = 'eval'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        This method creates an instance of EvalCMD from parsed arguments.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the eval command."""
+        from evalscope.arguments import add_argument
+
+        parser = parsers.add_parser(EvalCMD.name)
+        add_argument(parser)
+        parser.set_defaults(
+            api_key='EMPTY',
+            api_url='http://127.0.0.1:30000/v1',
+            eval_type='server'
+        )
+        parser.set_defaults(func=EvalCMD.subparser_func)
+
+    def execute(self):
+        from evalscope.run import run_task
+
+        run_task(self.args)
+
+
+class StartAppCMD(CLICommand):
+    name = 'app'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        This method creates an instance of StartAppCMD from parsed arguments.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the app command."""
+        from evalscope.app import add_argument
+
+        parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
+        parser.set_defaults(func=StartAppCMD.subparser_func)
+
+    def execute(self):
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
+
+        create_app(self.args)
+
+
+class SGLPerfCMD(CLICommand):
+    name = 'sgl-perf'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the sgl-perf command."""
+        parser = parsers.add_parser(SGLPerfCMD.name,
+                                    help='Run SGLang performance benchmark (bench_serving.py)')
+
+        # Only register the benchmark arguments when bench_serving is importable.
+        if define_sgl_bench_args is not None:
+            define_sgl_bench_args(parser)
+
+        parser.set_defaults(func=SGLPerfCMD.subparser_func)
+
+    def execute(self):
+        if run_benchmark is None:
+            raise ImportError(
+                "Failed to import 'run_benchmark' from 'bench_serving'. "
+                "Command 'sgl-perf' cannot execute."
+            )
+
+        run_benchmark(self.args)
+
+
+def run_cmd():
+    parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='evalscope <command> [<args>]')
+    parser.add_argument('-v', '--version', action='version', version=f'evalscope {__version__}')
+    subparsers = parser.add_subparsers(help='EvalScope command line helper.')
+
+    PerfBenchCMD.define_args(subparsers)
+    EvalCMD.define_args(subparsers)
+    StartAppCMD.define_args(subparsers)
+
+    # sgl-perf
+    SGLPerfCMD.define_args(subparsers)
+
+    args = parser.parse_args()
+
+    if not hasattr(args, 'func'):
+        parser.print_help()
+        sys.exit(1)
+
+    cmd = args.func(args)
+    cmd.execute()
+
+
+if __name__ == '__main__':
+    run_cmd()
diff --git a/models/nlp/llm/llm-benchmark/requirements.txt b/models/nlp/llm/llm-benchmark/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d25d9047746399f23453ec9a29e29b92aa9db271
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/requirements.txt
@@ -0,0 +1,3 @@
+evalscope==1.0.2
+evalscope[perf]==1.0.2
+evalscope[app]==1.0.2
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md b/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d790fe79907a37f5ef61f0c8ab81f3e91c77ed82
--- /dev/null
+++ b/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md
@@ -0,0 +1,56 @@
+# Qwen3-235B-A22B-Thinking-2507 (vLLM)
+
+## Model Description
+
+Qwen3-235B-A22B is a large Mixture-of-Experts (MoE) language model with 235B total parameters and 22B activated parameters. The "Thinking" version is optimized for complex logical reasoning, math, and coding tasks with enhanced reasoning capabilities.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_W8A8_MOE_USE_W4A8=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=4 --tensor-parallel-size=4 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable-chunked-prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input-len 128 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output-len 128
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md b/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d4b23188d6132f6fb1d89e3cbf245802fc98be1
--- /dev/null
+++ b/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md
@@ -0,0 +1,56 @@
+# Qwen3-30B-A3B-Thinking-2507 (vLLM)
+
+## Model Description
+
+Qwen3-30B-A3B is a Mixture-of-Experts (MoE) large language model with 30B total parameters and 3B activated parameters. The "Thinking" version is optimized for complex logical reasoning, math, and coding tasks.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_W8A8_MOE_USE_W4A8=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=2 --max-model-len 4096 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable-chunked-prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input-len 128 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output-len 128
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-32b/vllm/README.md b/models/nlp/llm/qwen3-32b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8494587317da66ad862415d8ffd10a1c16142277
--- /dev/null
+++ b/models/nlp/llm/qwen3-32b/vllm/README.md
@@ -0,0 +1,55 @@
+# Qwen3-32B (vLLM)
+
+## Model Description
+
+Qwen3-32B is a dense large language model with 32B parameters, offering excellent performance on reasoning, instruction-following, and multilingual tasks. It supports seamless switching between thinking mode (for complex logical reasoning, math, and coding) and non-thinking mode (for efficient, general-purpose dialogue).
+
+This version supports W8A8 (Weight-8bit, Activation-8bit) and W4A16 (Weight-4bit, Activation-16bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W8A8/W4A16
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=2 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable-chunked-prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input2048, Output1024, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input-len 2048 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output-len 1024
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md b/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f462989e4b6f662e1a4f02f374157fb3e9c033eb
--- /dev/null
+++ b/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md
@@ -0,0 +1,55 @@
+# Qwen3-Next-80B-A3B-Instruct (vLLM)
+
+## Model Description
+
+Qwen3-Next-80B-A3B-Instruct is a Mixture-of-Experts (MoE) large language model with 80B total parameters and 3B activated parameters. This is the next-generation Qwen model with enhanced reasoning capabilities and instruction following.
+
+This version runs in BF16 precision for maximum accuracy.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with BF16
+
+#### Accuracy Test
+
+1. Install evalscope:
+```bash
+pip3 install 'evalscope[app,perf]' -U
+```
+
+2. Set environment variables:
+```bash
+export VLLM_USE_MODELSCOPE=True
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --served-model-name Qwen3-Next-80B-A3B-Instruct --trust-remote-code --pipeline-parallel-size 1 --tensor-parallel-size 8 --max-num-seqs 64 --max-model-len 40960 --disable-cascade-attn --gpu-memory-utilization 0.90 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' --port 9989
+```
+
+4. Run client (MMLU-Pro dataset):
+```bash
+evalscope eval --model Qwen3-Next-80B-A3B-Instruct --dataset-args '{"mmlu_pro": {"few_shot_num": 0}}' --generation-config '{"do_sample": true, "temperature": 0.7, "max_tokens": 32768, "n": 1, "top_p": 0.8, "top_k": 20}' --datasets mmlu_pro --eval-type openai_api --eval-batch-size 64 --api-url http://127.0.0.1:9989/v1 --timeout 12000000 --api-key EMPTY
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/tests/model_info.json b/tests/model_info.json
index 5fa94159b24ceded3fe8dcc876e4313390de496b..1eb8591fa3a56ccd51211c20095b16493a1344d7 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -10060,6 +10060,370 @@
         "type": "inference",
         "hasDemo": false,
         "demoType": ""
+      },
+      {
+        "display_name": "DeepSeek V3.1",
+        "model_name": "deepseek-v3.1",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": "4.4.0",
+        "latest_gpgpu": "",
+        "category": "nlp/llm",
+        "toolbox": "",
+        "mdims": "",
+        "dataset": "",
+        "license": "",
+        "model_path": "models/nlp/llm/deepseek-v3.1/vllm",
+        "readme_file": "models/nlp/llm/deepseek-v3.1/vllm/README.md",
+        "bitbucket_repo": "",
+        "bitbucket_branch": "",
+        "bitbucket_path": "",
+        "develop_owner": "",
+        "github_repo": "",
+        "github_branch": "",
+        "github_path": "",
+        "datasets": "",
+        "download_url": "https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3",
+        "need_third_part": false,
+        "precisions": [
+            "w4a8"
+        ],
+        "type": "inference",
+        "hasDemo": false,
+        "demoType": ""
+      },
+      {
+        "display_name": "Qwen3 32B",
+        "model_name": "qwen3-32b",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": "4.4.0",
+        "latest_gpgpu": "",
+        "category": "nlp/llm",
+        "toolbox": "",
+        "mdims": "",
+        "dataset": "",
+        "license": "",
+        "model_path": "models/nlp/llm/qwen3-32b/vllm",
+        "readme_file": "models/nlp/llm/qwen3-32b/vllm/README.md",
+        "bitbucket_repo": "",
+        "bitbucket_branch": "",
+        "bitbucket_path": "",
+        "develop_owner": "",
+        "github_repo": "",
+        "github_branch": "",
+        "github_path": "",
+        "datasets": "",
+        "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-32B",
+        "need_third_part": false,
+        "precisions": [
+            "w8a8",
+            "w4a16"
+        ],
+        "type": "inference",
+        "hasDemo": false,
+        "demoType": ""
+      },
+      {
+        "display_name": "Qwen3 30B A3B Thinking",
+        "model_name": "qwen3-30b-a3b-thinking",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": "4.4.0",
+        "latest_gpgpu": "",
+        "category": "nlp/llm",
+        "toolbox": "",
+        "mdims": "",
+        "dataset": "",
+        "license": "",
+        "model_path": "models/nlp/llm/qwen3-30b-a3b-thinking/vllm",
+        "readme_file": "models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md",
+        "bitbucket_repo": "",
+        "bitbucket_branch": "",
+        "bitbucket_path": "",
+        "develop_owner": "",
+        "github_repo": "",
+        "github_branch": "",
+        "github_path": "",
+        "datasets": "",
+        "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-30B-A3B",
+        "need_third_part": false,
+        "precisions": [
+            "w4a8"
+        ],
+        "type": "inference",
+        "hasDemo": false,
+        "demoType": ""
+      },
+      {
+        "display_name": "Qwen3 235B A22B Thinking",
+        "model_name": "qwen3-235b-a22b-thinking",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": 
"4.4.0", + "latest_gpgpu": "", + "category": "nlp/llm", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/nlp/llm/qwen3-235b-a22b-thinking/vllm", + "readme_file": "models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-235B-A22B", + "need_third_part": false, + "precisions": [ + "w4a8" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Qwen3 Next 80B A3B", + "model_name": "qwen3-next-80b-a3b", + "framework": "vllm", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "nlp/llm", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/nlp/llm/qwen3-next-80b-a3b/vllm", + "readme_file": "models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Instruct", + "need_third_part": false, + "precisions": [ + "bf16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "FLUX.1 Dev", + "model_name": "flux.1-dev", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/flux.1-dev/xdit", + "readme_file": "models/multimodal/diffusion_model/flux.1-dev/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/black-forest-labs/FLUX.1-dev", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "HunyuanVideo", + "model_name": "hunyuan_video", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/hunyuan_video/xdit", + "readme_file": "models/multimodal/diffusion_model/hunyuan_video/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/Tencent-Hunyuan/HunyuanVideo", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Wan2.1 T2V 14B", + "model_name": "wan2.1-t2v-14b", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + 
"license": "", + "model_path": "models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit", + "readme_file": "models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B-Diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Wan2.2 TI2V 5B", + "model_name": "wan2.2-ti2v-5b", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit", + "readme_file": "models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://www.modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B-Diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "HunyuanDiT v1.2", + "model_name": "hunyuandit-v1.2", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit", + "readme_file": "models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/dengcao/HunyuanDiT-v1.2-Diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "SD3 Medium", + "model_name": "stable-diffusion-3-medium", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit", + "readme_file": "models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/stabilityai/stable-diffusion-3-medium-diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" } ] } \ No newline at end of file