diff --git a/README.md b/README.md index 0af03f6635e001599e5e3944d23ac88d3e49a2ce..3f571e07c09f495845f52a4a0827a5b23ec12238 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ | DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 | -| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.3.0 | +| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.4.0 | | DeepSeek-R1-Distill-Qwen-14B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-14b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-32B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-32b/vllm) | 4.3.0 | | DeepSeek-OCR | `Transformers` | [✅](models/multimodal/vision_language_model/deepseek-ocr/transformers) | 4.3.0 | @@ -61,7 +61,7 @@ | Qwen-7B | `vLLM` | [✅](models/nlp/llm/qwen-7b/vllm) | 4.3.0 | | Qwen-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen_vl/vllm) | 4.3.0 | | Qwen2-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_vl/vllm) | 4.3.0 | -| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.3.0 | +| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.4.0 | | Qwen1.5-7B | `vLLM` | [✅](models/nlp/llm/qwen1.5-7b/vllm) | 4.3.0 | | Qwen1.5-7B | `TGI` | [✅](models/nlp/llm/qwen1.5-7b/tgi) | 4.3.0 | | Qwen1.5-14B | `vLLM` | [✅](models/nlp/llm/qwen1.5-14b/vllm) | 4.3.0 | @@ -70,9 +70,14 @@ | Qwen2-7B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-7b/vllm) | 4.3.0 | | Qwen2-72B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-72b/vllm) | 4.3.0 | | Qwen3_Moe | `vLLM` | [✅](models/nlp/llm/qwen3-235b/vllm) | dev-only | -| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3/vllm) | 4.4.0 | +| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3-8b/vllm) | 4.4.0 | +| Qwen3-32B | `vLLM` | [✅](models/nlp/llm/qwen3-32b/vllm) | 4.4.0 | +| Qwen3-30B-A3B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-30b-a3b-thinking/vllm) | 4.4.0 | +| Qwen3-235B-A22B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-235b-a22b-thinking/vllm) | 4.4.0 | +| Qwen3-Next-80B-A3B | `vLLM` | [✅](models/nlp/llm/qwen3-next-80b-a3b/vllm) | 4.4.0 | +| DeepSeek-V3.1 | `vLLM` | [✅](models/nlp/llm/deepseek-v3.1/vllm) | 4.4.0 | | StableLM2-1.6B | `vLLM` | [✅](models/nlp/llm/stablelm/vllm) | 4.3.0 | -| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | dev-only | +| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | 4.4.0 | | Ultravox | `vLLM` | [✅](models/speech/asr/ultravox/vllm) | 4.3.0 | | Whisper | `vLLM` | [✅](models/speech/asr/whisper/vllm/) | 4.3.0 | | XLMRoberta | `vLLM` | [✅](models/multimodal/vision_language_model/xlmroberta/vllm) | 4.3.0 | @@ -323,6 +328,12 @@ | Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 | | Stable Diffusion 2.1 | ixRT | [✅](models/multimodal/diffusion_model/stable-diffusion-2.1/diffusers) | 4.4.0 | | Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | dev-only | +| FLUX.1-Dev | xDiT | [✅](models/multimodal/diffusion_model/flux.1-dev/xdit) | 4.4.0 | +| HunyuanVideo | xDiT | [✅](models/multimodal/diffusion_model/hunyuan_video/xdit) | 4.4.0 | +| Wan2.1-T2V-14B | 
xDiT | [✅](models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit) | 4.4.0 | +| Wan2.2-TI2V-5B | xDiT | [✅](models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit) | 4.4.0 | +| HunyuanDiT-v1.2 | xDiT | [✅](models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit) | 4.4.0 | +| SD3-Medium | xDiT | [✅](models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit) | 4.4.0 | ### 自然语言处理(NLP) diff --git a/README_en.md b/README_en.md index 631736f9d5a0c0605ce14cca93b394a21bafc037..08f98040ca51db95758a53295ae68950232a3970 100644 --- a/README_en.md +++ b/README_en.md @@ -46,7 +46,7 @@ inference to be expanded in the future. | DeepSeek-R1-Distill-Llama-8B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-8b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Llama-70B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-llama-70b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-1.5B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-1.5b/vllm) | 4.3.0 | -| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.3.0 | +| DeepSeek-R1-Distill-Qwen-7B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm) | 4.4.0 | | DeepSeek-R1-Distill-Qwen-14B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-14b/vllm) | 4.3.0 | | DeepSeek-R1-Distill-Qwen-32B | `vLLM` | [✅](models/nlp/llm/deepseek-r1-distill-qwen-32b/vllm) | 4.3.0 | | DeepSeek-OCR | `Transformers` | [✅](models/multimodal/vision_language_model/deepseek-ocr/transformers) | 4.3.0 | @@ -71,7 +71,7 @@ inference to be expanded in the future. | Qwen-7B | `vLLM` | [✅](models/nlp/llm/qwen-7b/vllm) | 4.3.0 | | Qwen-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen_vl/vllm) | 4.3.0 | | Qwen2-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_vl/vllm) | 4.3.0 | -| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.3.0 | +| Qwen2.5-VL | `vLLM` | [✅](models/multimodal/vision_language_model/qwen2_5_vl/vllm) | 4.4.0 | | Qwen1.5-7B | `vLLM` | [✅](models/nlp/llm/qwen1.5-7b/vllm) | 4.3.0 | | Qwen1.5-7B | `TGI` | [✅](models/nlp/llm/qwen1.5-7b/tgi) | 4.3.0 | | Qwen1.5-14B | `vLLM` | [✅](models/nlp/llm/qwen1.5-14b/vllm) | 4.3.0 | @@ -80,9 +80,14 @@ inference to be expanded in the future. 
| Qwen2-7B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-7b/vllm) | 4.3.0 |
| Qwen2-72B Instruct | `vLLM` | [✅](models/nlp/llm/qwen2-72b/vllm) | 4.3.0 |
| Qwen3_Moe | `vLLM` | [✅](models/nlp/llm/qwen3-235b/vllm) | dev-only |
-| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3/vllm) | 4.4.0 |
+| Qwen3-8B | `vLLM` | [✅](models/nlp/llm/qwen3-8b/vllm) | 4.4.0 |
+| Qwen3-32B | `vLLM` | [✅](models/nlp/llm/qwen3-32b/vllm) | 4.4.0 |
+| Qwen3-30B-A3B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-30b-a3b-thinking/vllm) | 4.4.0 |
+| Qwen3-235B-A22B-Thinking | `vLLM` | [✅](models/nlp/llm/qwen3-235b-a22b-thinking/vllm) | 4.4.0 |
+| Qwen3-Next-80B-A3B | `vLLM` | [✅](models/nlp/llm/qwen3-next-80b-a3b/vllm) | 4.4.0 |
+| DeepSeek-V3.1 | `vLLM` | [✅](models/nlp/llm/deepseek-v3.1/vllm) | 4.4.0 |
| StableLM2-1.6B | `vLLM` | [✅](models/nlp/llm/stablelm/vllm) | 4.3.0 |
-| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | dev-only |
+| Step3 | `vLLM` | [✅](models/multimodal/vision_language_model/step3/vllm) | 4.4.0 |
| Ultravox | `vLLM` | [✅](models/speech/asr/ultravox/vllm) | 4.3.0 |
| Whisper | `vLLM` | [✅](models/speech/asr/whisper/vllm/) | 4.3.0 |
| XLMRoberta | `vLLM` | [✅](models/multimodal/vision_language_model/xlmroberta/vllm) | 4.3.0 |
@@ -332,6 +337,12 @@ inference to be expanded in the future.
| Stable Diffusion 1.5 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-1.5/diffusers) | 4.3.0 |
| Stable Diffusion 2.1 | ixRT | [✅](models/multimodal/diffusion_model/stable-diffusion-2.1/diffusers) | 4.4.0 |
| Stable Diffusion 3 | Diffusers | [✅](models/multimodal/diffusion_model/stable-diffusion-3/diffusers) | dev-only |
+| FLUX.1-Dev | xDiT | [✅](models/multimodal/diffusion_model/flux.1-dev/xdit) | 4.4.0 |
+| HunyuanVideo | xDiT | [✅](models/multimodal/diffusion_model/hunyuan_video/xdit) | 4.4.0 |
+| Wan2.1-T2V-14B | xDiT | [✅](models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit) | 4.4.0 |
+| Wan2.2-TI2V-5B | xDiT | [✅](models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit) | 4.4.0 |
+| HunyuanDiT-v1.2 | xDiT | [✅](models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit) | 4.4.0 |
+| SD3-Medium | xDiT | [✅](models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit) | 4.4.0 |

### NLP

#### PLM (Pre-trained Language Model)

diff --git a/RELEASE.md b/RELEASE.md
index 866a42207844a63ba4c95562f2fcfa0d5017717b..b0cf8a3be90c196bdb1f94c60d4823b4d081218b 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,7 @@

| Release Date | Release Version | IXUCA SDK |
|--------------|-----------------|-----------|
+| Mar 2026 | 26.03 | v4.4.0 |
| Dec 2025 | 25.12 | v4.3.0 |
| Sep 2025 | 25.09 | v4.3.0 |
| Jun 2025 | 25.06 | v4.2.0 |
@@ -20,6 +21,104 @@

## Release Notes

+### DeepSparkInference 26.03
+
+#### 模型与算法
+
+* 新增了 16 个推理小模型示例,其中支持 IGIE 推理引擎的 10 个,支持 ixRT 推理引擎的 6 个。
+* 新增了 18 个大模型推理示例,其中 10 个使用 [vLLM](https://github.com/vllm-project/vllm),6 个使用 [xDiT](https://github.com/xdit-team/xDiT),2 个使用 ixRT。
+
+<table>
+  <tr><th colspan="3">IGIE</th></tr>
+  <tr><td>MobileViT-S</td><td>ViT-B-32</td><td>ViT-L-14</td></tr>
+  <tr><td>DETR</td><td>RT-DETR</td><td>YOLOv11m</td></tr>
+  <tr><td>YOLOv11s</td><td>YOLOv26n</td><td>YOLOv5s</td></tr>
+  <tr><td>DenseNet121 (int8)</td><td></td><td></td></tr>
+  <tr><th colspan="3">ixRT</th></tr>
+  <tr><td>Swin Transformer</td><td>RepNet</td><td>Grounding DINO</td></tr>
+  <tr><td>RT-DETR</td><td>CRNN</td><td>UNet</td></tr>
+  <tr><th colspan="3">LLM</th></tr>
+  <tr><td>DeepSeek-V3.1 (vLLM)</td><td>DeepSeek-VL2 (vLLM)</td><td>DeepSeek-OCR (vLLM)</td></tr>
+  <tr><td>InternLM3 (vLLM)</td><td>MiniCPM-V-4 (vLLM)</td><td>Qwen3-8B (vLLM)</td></tr>
+  <tr><td>Qwen3-32B (vLLM)</td><td>Qwen3-30B-A3B (vLLM)</td><td>Qwen3-235B-A22B (vLLM)</td></tr>
+  <tr><td>Qwen3-Next-80B (vLLM)</td><td>FLUX.1-Dev (xDiT)</td><td>HunyuanVideo (xDiT)</td></tr>
+  <tr><td>Wan2.1-T2V-14B (xDiT)</td><td>Wan2.2-TI2V-5B (xDiT)</td><td>HunyuanDiT-v1.2 (xDiT)</td></tr>
+  <tr><td>SD3-Medium (xDiT)</td><td>CosyVoice (ixRT)</td><td>Stable Diffusion 2.1 (ixRT)</td></tr>
+</table>
+
+#### 修复更新
+
+* 适配了 IXUCA SDK 4.4.0 版本的 CI 测试流程
+* 修复了 IGIE MViTv2-base 模型运行时缺少 pkg_resources 模块的问题
+* 修复了 vLLM 推理模型的 deprecated 参数错误并升级为离线推理模式
+* 修复了 DeepSeek-R1-Distill-Llama-8B 模型在 vLLM 0.11.2 版本上的兼容性问题
+* 修复了 Qwen-VL、Qwen2-VL、Qwen2.5-VL、Whisper 等模型的参数错误问题
+* 修复了 Pixtral 模型在 vLLM 0.11.2 版本上的兼容性问题
+* 修复了 ixRT RT-DETR 模型在 batchsize 为 64 时运行报错的问题
+* 修复了多个模型的 trust_remote_code 参数配置问题
+* 修复了 IGIE YOLOv8n 模型与 ultralytics 版本的兼容性问题
+* 修复了 YOLOx 模型的数据集路径问题
+* 修复了 IGIE ResNet 和 VGG16 模型在安装 TensorFlow 时的报错问题
+* 修复了 protobuf 版本导致的兼容性问题
+* 新增了 ARM 架构的核心绑定命令支持
+* 新增了 YOLOv8n ixRT 模型的 batchsize 参数支持
+
+#### 版本关联
+
+DeepSparkInference 26.03 对应天数软件栈 4.4.0 版本。
+
+#### 感谢以下社区贡献者
+
+YoungPeng,honglyua,majorli6,shengyan.zhao,yougouda,jinrui.zhang,tianyu,anders。
+
### DeepSparkInference 25.12

#### 模型与算法
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md b/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..95bc45a8f5472ee4b248fd2d2ff41b15f6ce92b7
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/README.md
@@ -0,0 +1,61 @@
+# FLUX.1-Dev (xDiT)
+
+## Model Description
+
+FLUX.1-Dev is a state-of-the-art text-to-image diffusion model developed by Black Forest Labs. It excels at generating high-quality, detailed images from text prompts with exceptional prompt adherence.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+- Model: 
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. Modify the model path in ``run.sh``:
+```bash
+# The run.sh script is provided in this directory
+# Point MODEL_CONFIGS at your local FLUX.1-Dev weights (the shipped default references a FLUX.1-schnell copy)
+vim run.sh
+# Update: MODEL_CONFIGS=(["Flux"]="flux_example.py /home/data/flux___1-schnell/ 28")
+```
+
+2. Run script:
+```bash
+bash run.sh
+```
+
+3. The model supports 512×512 and 1024×1024 image sizes.
To modify:
+```bash
+vim run.sh
+# Modify TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+# to TASK_ARGS="--height 512 --width 512 --no_use_resolution_binning --guidance_scale 3.5"
+```
+
+## References
+
+- [FLUX.1](https://github.com/black-forest-labs/flux)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py b/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a6a91e9a35cc2f4433c9303a9fbaffb0321f408
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/flux_example.py
@@ -0,0 +1,124 @@
+import logging
+import os
+import time
+import torch
+import torch.distributed
+from transformers import T5EncoderModel
+from xfuser import xFuserFluxPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_data_parallel_rank,
+    get_data_parallel_world_size,
+    get_runtime_state,
+    is_dp_last_group,
+    get_pipeline_parallel_world_size,
+    get_classifier_free_guidance_world_size,
+    get_tensor_model_parallel_world_size,
+)
+from xfuser.model_executor.cache.diffusers_adapters import apply_cache_on_transformer
+# if os.environ.get("ENABLE_IXFORMER_CONV2D", "0") == "1":
+#     import ixformer as ixff
+#     torch.nn.functional.conv2d = ixff.conv2d
+
+
+def main():
+    torch.backends.cudnn.benchmark = False
+    parser = FlexibleArgumentParser(description="xFuser Arguments")
+    args = xFuserArgs.add_cli_args(parser).parse_args()
+    engine_args = xFuserArgs.from_cli_args(args)
+    engine_config, input_config = engine_args.create_config()
+    runtime_dtype = torch.bfloat16
+    engine_config.runtime_config.dtype = runtime_dtype
+    local_rank = get_world_group().local_rank
+    torch.cuda.set_device(local_rank)
+    text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16)
+
+    if args.use_fp8_t5_encoder:
+        from optimum.quanto import freeze, qfloat8, quantize
+        logging.info(f"rank {local_rank} quantizing text encoder 2")
+        quantize(text_encoder_2, weights=qfloat8)
+        freeze(text_encoder_2)
+
+    cache_args = {
+        "use_teacache": engine_args.use_teacache,
+        "use_fbcache": engine_args.use_fbcache,
+        "rel_l1_thresh": 0.12,
+        "return_hidden_states_first": False,
+        "num_steps": input_config.num_inference_steps,
+    }
+    pipe = xFuserFluxPipeline.from_pretrained(
+        pretrained_model_name_or_path=engine_config.model_config.model,
+        engine_config=engine_config,
+        cache_args=cache_args,
+        torch_dtype=runtime_dtype,
+        text_encoder_2=text_encoder_2,
+    )
+    # keep the VAE in fp32 for numerical stability
+    pipe.vae = pipe.vae.to(dtype=torch.float32)
+
+    if args.enable_sequential_cpu_offload:
+        pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} sequential CPU offload enabled")
+    elif args.enable_model_cpu_offload:
+        pipe.enable_model_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} model CPU offload enabled")
+    else:
+        pipe = pipe.to(local_rank)
+
+    parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+    if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1":
+        from w8a8_linear import apply_quant_linear_i8w8o16
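+        # Note: w8a8_linear.py (the dynamic int8 W8A8 linear swap) ships with the
+        # HunyuanVideo xDiT example in this repo; copy it next to this script or add it
+        # to PYTHONPATH before running with ENABLE_IXFORMER_W8A8LINEAR=1 (the default in run.sh).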
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+    # pipe.transformer.fuse_qkv_projections()
+    pipe.prepare_run(input_config, steps=input_config.num_inference_steps)
+
+    torch.cuda.reset_peak_memory_stats()
+    start_time = time.time()
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        prompt=input_config.prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        output_type=input_config.output_type,
+        max_sequence_length=256,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+    )
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"tp{engine_args.tensor_parallel_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    if input_config.output_type == "pil":
+        dp_group_index = get_data_parallel_rank()
+        num_dp_groups = get_data_parallel_world_size()
+        dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
+        if pipe.is_dp_last_group():
+            for i, image in enumerate(output.images):
+                image_rank = dp_group_index * dp_batch_size + i
+                image_name = f"flux_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png"
+                image.save(f"./{image_name}")
+                print(f"image {i} saved to ./{image_name}")
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(
+            f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB"
+        )
+    get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt b/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..92e80ab221aa39de5448471b3bc63121b1dbbd37
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/requirements.txt
@@ -0,0 +1,8 @@
+#diffusers
+yunchang
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh b/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..163e088888c3ec0cd88b0d5e3d50bdd8a1a00990
--- /dev/null
+++ b/models/multimodal/diffusion_model/flux.1-dev/xdit/run.sh
@@ -0,0 +1,72 @@
+# set -x
+export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2 # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use SageAttention (xdit == 0.4.4)
+export ENABLE_IXFORMER_W8A8LINEAR=1
+
+# Select the model type
+export MODEL_TYPE="Flux"
+# Configuration for different model types
+# script, model_id, inference_step
+declare -A MODEL_CONFIGS=(
+    ["Flux"]="flux_example.py /home/data/flux___1-schnell/ 28"
+)
+
+echo ${MODEL_CONFIGS[$MODEL_TYPE]}
+
+# if [ -v MODEL_CONFIGS[$MODEL_TYPE] ] ; then
+if [ -n "${MODEL_CONFIGS[$MODEL_TYPE]+_}" ]; then
+    IFS=' ' read -r SCRIPT MODEL_ID INFERENCE_STEP <<< "${MODEL_CONFIGS[$MODEL_TYPE]}"
+    export SCRIPT MODEL_ID INFERENCE_STEP
+else
+    echo "Invalid MODEL_TYPE: $MODEL_TYPE"
+    exit 1
+fi
+
+# task args
+TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+
+# cache args
+# CACHE_ARGS="--use_teacache"
+# CACHE_ARGS="--use_fbcache"
+
+# 2 GPUs: pipefusion_parallel_degree=2, ulysses=1, ring=1 (CFG parallel disabled)
+N_GPUS=2
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1"
+
+# CFG_ARGS="--use_cfg_parallel"
+
+# By default, num_pipeline_patch = pipefusion_degree; tune this parameter for optimal performance.
+# PIPEFUSION_ARGS="--num_pipeline_patch 8 "
+
+# For high-resolution images, use the latent output type to avoid running the VAE module (for measuring speed only).
+# OUTPUT_ARGS="--output_type latent"
+
+# PARALLEL_VAE="--use_parallel_vae"
+
+# Another compile option is `--use_onediff`, which uses onediff's compiler.
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 1 \
+--prompt "brown dog laying on the ground with a metal bowl in front of him." \
+$CFG_ARGS \
+$PARALLEL_VAE \
+$COMPILE_FLAG \
+$QUANTIZE_FLAG \
+$CACHE_ARGS \
+
diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..25717b05019a5509b3a2a1e032adac21f3713df0
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md
@@ -0,0 +1,58 @@
+# HunyuanDiT-v1.2-Diffusers (xDiT)
+
+## Model Description
+
+HunyuanDiT-v1.2 is Tencent's advanced text-to-image diffusion model, featuring improved architecture and training for high-quality image generation. It excels at generating detailed, photorealistic images from text descriptions.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install Iluvatar CoreX adapted framework:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are provided in this directory. Modify the model path:
+```bash
+vim run_hunyuandit.sh
+# Update MODEL_ID to your actual model path
+```
+
+2. Run script:
+```bash
+bash run_hunyuandit.sh
+```
+
+3. The model supports batch size 1 and 2; the prompt format differs by batch size:
+```bash
+# BS=1 (default) prompt format
+#--prompt "brown dog laying on the ground with a metal bowl in front of him."
+# BS=2 prompt format
+--prompt "brown dog laying on the ground with a metal bowl in front of him." "brown dog laying on the ground with a metal bowl in front of him."
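+# Each quoted string is one prompt; the number of prompts passed sets the batch size.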
+``` + +## References + +- [HunyuanDiT](https://github.com/Tencent/HunyuanDiT) +- [xDiT](https://github.com/xdit-team/xDiT) \ No newline at end of file diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py new file mode 100644 index 0000000000000000000000000000000000000000..b26a4503848bf371699693406f04ebf9e052a746 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/hunyuandit_example.py @@ -0,0 +1,92 @@ +import time +import os +import torch +import torch.distributed +from transformers import T5EncoderModel +from xfuser import xFuserHunyuanDiTPipeline, xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + is_dp_last_group, + get_data_parallel_world_size, + get_runtime_state, + get_data_parallel_rank, +) + +def main(): + + # torch.backends.cudnn.benchmark=False + + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + torch.cuda.set_device(local_rank) + text_encoder_2 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_2", torch_dtype=torch.bfloat16) + if args.use_fp8_t5_encoder: + from optimum.quanto import freeze, qfloat8, quantize + print(f"rank {local_rank} quantizing text encoder 2") + quantize(text_encoder_2, weights=qfloat8) + freeze(text_encoder_2) + + pipe = xFuserHunyuanDiTPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + engine_config=engine_config, + torch_dtype=torch.float16, + text_encoder_2=text_encoder_2, + ).to(f"cuda:{local_rank}") + pipe.vae.to(memory_format=torch.channels_last) + parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + import os + if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1": + from w8a8_linear import apply_quant_linear_i8w8o16 + pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer) + pipe.prepare_run(input_config) + + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + output = pipe( + height=input_config.height, + width=input_config.width, + prompt=input_config.prompt, + num_inference_steps=input_config.num_inference_steps, + output_type=input_config.output_type, + use_resolution_binning=input_config.use_resolution_binning, + guidance_scale=input_config.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed(input_config.seed), + ) + end_time = time.time() + elapsed_time = end_time - start_time + peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + + parallel_info = ( + f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" + f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" + f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" + ) + if input_config.output_type == "pil": + dp_group_index = get_data_parallel_rank() + num_dp_groups = get_data_parallel_world_size() + dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups + if pipe.is_dp_last_group(): + if not os.path.exists("results"): + os.mkdir("results") + for i, image in enumerate(output.images): + image_rank = dp_group_index * dp_batch_size + i + image.save( + 
f"./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png" + ) + print( + f"image {i} saved to ./results/hunyuandit_result_{parallel_info}_{image_rank}_tc_{engine_args.use_torch_compile}.png" + ) + + if get_world_group().rank == get_world_group().world_size - 1: + print( + f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9:.2f} GB" + ) + get_runtime_state().destroy_distributed_env() + + +if __name__ == "__main__": + main() diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2969a4385d913c98a2cb13adfa2bb29f3d3f0938 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/requirements.txt @@ -0,0 +1,9 @@ +#diffusers +yunchang +ftfy +transformers>=4.55 +numpy==1.26.4 +imageio +imageio-ffmpeg +distvae + diff --git a/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh new file mode 100644 index 0000000000000000000000000000000000000000..c136150e280f7b1f260977707f062515974491fc --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/run_hunyuandit.sh @@ -0,0 +1,45 @@ +# set -x +export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 +export PYTHONPATH=$PWD:$PYTHONPATH + +#多ring 没提升 +# export NCCL_USE_HIGHPRIORITYWARP=1 + +export ENABLE_IXFORMER_INFERENCE=1 +# export ATTN_OPT_LEVEL=2 +export ENABLE_IXFORMER_W8A8LINEAR=0 + +# Select the model type +SCRIPT=hunyuandit_example.py +MODEL_ID=/data/nlp/HunyuanDiT-v1.2-Diffusers/ +INFERENCE_STEP=20 + +mkdir -p ./results + +# task args +TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5" + +# cache args +# CACHE_ARGS="--use_teacache" +# CACHE_ARGS="--use_fbcache" + +N_GPUS=2 +PARALLEL_ARGS="--pipefusion_parallel_degree 1 --ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 1 --data_parallel_degree 1" +CFG_ARGS="--use_cfg_parallel" + +torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \ +--model $MODEL_ID \ +$PARALLEL_ARGS \ +$TASK_ARGS \ +$PIPEFUSION_ARGS \ +$OUTPUT_ARGS \ +--num_inference_steps $INFERENCE_STEP \ +--warmup_steps 1 \ +--prompt "brown dog laying on the ground with a metal bowl in front of him." \ +$CFG_ARGS \ +$PARALLLEL_VAE \ +$COMPILE_FLAG \ +$QUANTIZE_FLAG \ +$CACHE_ARGS \ + diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md b/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1a9dd1c9e23f18807bc564b6e2efd811f10d0937 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/README.md @@ -0,0 +1,50 @@ +# HunyuanVideo (xDiT) + +## Model Description + +HunyuanVideo is Tencent's advanced text-to-video diffusion model capable of generating high-quality videos from text descriptions. It features excellent motion coherence, visual quality, and text understanding capabilities. + +This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs. 
+ +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.03 | + +## Model Preparation + +### Prepare Resources + +- Model: + +### Install Dependencies + +1. Install Iluvatar CoreX adapted framework: +```bash +pip install diffusers-{version}-py3-none-any.whl +pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl +``` + +2. Install dependencies: +```bash +pip install -r requirements.txt +``` + +## Model Inference + +1. The scripts are pre-copied in this directory. Modify model path in ``run_hunyuan_video_usp_teacache.sh``: +```bash +vim run_hunyuan_video_usp_teacache.sh +# Update: MODEL_ID="/data/nlp/HunyuanVideo/" to your actual path +``` + +2. Run script: +```bash +bash run_hunyuan_video_usp_teacache.sh +``` + +## References + +- [HunyuanVideo](https://github.com/Tencent/HunyuanVideo) +- [xDiT](https://github.com/xdit-team/xDiT) \ No newline at end of file diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py new file mode 100644 index 0000000000000000000000000000000000000000..8d1c549f223d0d9906ae55129ec3f4413fbd322a --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example.py @@ -0,0 +1,333 @@ +# from https://github.com/chengzeyi/ParaAttention/blob/main/examples/run_hunyuan_video.py +import functools +from typing import Any, Dict, Union, Optional +import logging +import time + +import torch + +from diffusers import DiffusionPipeline, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.utils import scale_lora_layers, unscale_lora_layers, USE_PEFT_BACKEND +from diffusers.utils import export_to_video +from xfuser.model_executor.models.customized.hunyuan_video.tp_applicator import TensorParallelApplicator +from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from xfuser import xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + get_data_parallel_world_size, + get_data_parallel_rank, + get_runtime_state, + get_classifier_free_guidance_world_size, + get_classifier_free_guidance_rank, + get_cfg_group, + get_sequence_parallel_world_size, + get_sequence_parallel_rank, + get_sp_group, + is_dp_last_group, + initialize_runtime_state, + get_pipeline_parallel_world_size, +) + +from xfuser.model_executor.layers.attention_processor import xFuserHunyuanVideoAttnProcessor2_0 + +assert xFuserHunyuanVideoAttnProcessor2_0 is not None + + +def parallelize_transformer(pipe: DiffusionPipeline): + transformer = pipe.transformer + + @functools.wraps(transformer.__class__.forward) + def new_forward( + self, + hidden_states: torch.Tensor, + timestep: torch.LongTensor, + encoder_hidden_states: torch.Tensor, + encoder_attention_mask: torch.Tensor, + pooled_projections: torch.Tensor, + guidance: torch.Tensor = None, + attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = True, + ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]: + if attention_kwargs is not None: + attention_kwargs = attention_kwargs.copy() + lora_scale = attention_kwargs.pop("scale", 1.0) + else: + lora_scale = 1.0 + + if USE_PEFT_BACKEND: + # weight the lora layers by 
setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + else: + if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None: + logging.warning("Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective.") + + batch_size, num_channels, num_frames, height, width = hidden_states.shape + assert batch_size % get_classifier_free_guidance_world_size( + ) == 0, f"Cannot split dim 0 of hidden_states ({batch_size}) into {get_classifier_free_guidance_world_size()} parts." + + p, p_t = self.config.patch_size, self.config.patch_size_t + post_patch_num_frames = num_frames // p_t + post_patch_height = height // p + post_patch_width = width // p + + # 1. RoPE + image_rotary_emb = self.rope(hidden_states) + + # 2. Conditional embeddings + # temb = self.time_text_embed(timestep, guidance, pooled_projections) + temb, token_replace_emb = self.time_text_embed(timestep,pooled_projections, guidance) + hidden_states = self.x_embedder(hidden_states) + encoder_hidden_states = self.context_embedder(encoder_hidden_states, + timestep, + encoder_attention_mask) + + hidden_states = hidden_states.reshape(batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1) + hidden_states = hidden_states.flatten(1, 3) + + hidden_states = torch.chunk(hidden_states, + get_classifier_free_guidance_world_size(), + dim=0)[get_classifier_free_guidance_rank()] + hidden_states = torch.chunk(hidden_states, + get_sequence_parallel_world_size(), + dim=-2)[get_sequence_parallel_rank()] + + encoder_attention_mask = encoder_attention_mask[0].to(torch.bool) + encoder_hidden_states_indices = torch.arange( + encoder_hidden_states.shape[1], + device=encoder_hidden_states.device) + encoder_hidden_states_indices = encoder_hidden_states_indices[ + encoder_attention_mask] + encoder_hidden_states = encoder_hidden_states[ + ..., encoder_hidden_states_indices, :] + if encoder_hidden_states.shape[-2] % get_sequence_parallel_world_size( + ) != 0: + get_runtime_state().split_text_embed_in_sp = False + else: + get_runtime_state().split_text_embed_in_sp = True + + encoder_hidden_states = torch.chunk( + encoder_hidden_states, + get_classifier_free_guidance_world_size(), + dim=0)[get_classifier_free_guidance_rank()] + if get_runtime_state().split_text_embed_in_sp: + encoder_hidden_states = torch.chunk( + encoder_hidden_states, + get_sequence_parallel_world_size(), + dim=-2)[get_sequence_parallel_rank()] + + freqs_cos, freqs_sin = image_rotary_emb + + def get_rotary_emb_chunk(freqs): + freqs = torch.chunk(freqs, get_sequence_parallel_world_size(), dim=0)[get_sequence_parallel_rank()] + return freqs + + freqs_cos = get_rotary_emb_chunk(freqs_cos) + freqs_sin = get_rotary_emb_chunk(freqs_sin) + image_rotary_emb = (freqs_cos, freqs_sin) + + # 4. 
Transformer blocks + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} + + for block in self.transformer_blocks: + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + temb, + None, + image_rotary_emb, + **ckpt_kwargs, + ) + + for block in self.single_transformer_blocks: + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + temb, + None, + image_rotary_emb, + **ckpt_kwargs, + ) + + else: + for block in self.transformer_blocks: + hidden_states, encoder_hidden_states = block( + hidden_states, encoder_hidden_states, temb, None, + image_rotary_emb) + + for block in self.single_transformer_blocks: + hidden_states, encoder_hidden_states = block( + hidden_states, encoder_hidden_states, temb, None, + image_rotary_emb) + + # 5. Output projection + hidden_states = self.norm_out(hidden_states, temb) + hidden_states = self.proj_out(hidden_states) + + hidden_states = get_sp_group().all_gather(hidden_states, dim=-2) + hidden_states = get_cfg_group().all_gather(hidden_states, dim=0) + + hidden_states = hidden_states.reshape(batch_size, + post_patch_num_frames, + post_patch_height, + post_patch_width, -1, p_t, p, p) + + hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7) + hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + if not return_dict: + return (hidden_states, ) + + return Transformer2DModelOutput(sample=hidden_states) + + new_forward = new_forward.__get__(transformer) + transformer.forward = new_forward + + for block in transformer.transformer_blocks + transformer.single_transformer_blocks: + block.attn.processor = xFuserHunyuanVideoAttnProcessor2_0() + + +def main(): + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + + assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." 
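+    # This example shards the video token sequence across ranks with unified sequence
+    # parallelism (Ulysses attention + ring attention), optionally splitting CFG across
+    # ranks as well, which is why PipeFusion is rejected above.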
+ assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for HunyuanVideo" + + transformer = HunyuanVideoTransformer3DModel.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + subfolder="transformer", + torch_dtype=torch.bfloat16, + revision="refs/pr/18", + ) + pipe = HunyuanVideoPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + transformer=transformer, + torch_dtype=torch.float16, + revision="refs/pr/18", + ) + + initialize_runtime_state(pipe, engine_config) + get_runtime_state().set_video_input_parameters( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + batch_size=1, + num_inference_steps=input_config.num_inference_steps, + split_text_embed_in_sp=get_pipeline_parallel_world_size() == 1, + ) + + + if args.tensor_parallel_degree > 1: + tp_applicator = TensorParallelApplicator(get_tensor_model_parallel_world_size(), get_tensor_model_parallel_rank()) + tp_applicator.apply_to_model(pipe.transformer) + tp_applicator.apply_to_llamamodel(pipe.text_encoder) + + parallelize_transformer(pipe) + if args.enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} sequential CPU offload enabled") + elif args.enable_model_cpu_offload: + pipe.enable_model_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} model CPU offload enabled") + else: + device = torch.device(f"cuda:{local_rank}") + pipe = pipe.to(device) + + if args.enable_tiling: + pipe.vae.enable_tiling( + # Make it runnable on GPUs with 48GB memory + # tile_sample_min_height=128, + # tile_sample_stride_height=96, + # tile_sample_min_width=128, + # tile_sample_stride_width=96, + # tile_sample_min_num_frames=32, + # tile_sample_stride_num_frames=24, + ) + + if args.enable_slicing: + pipe.vae.enable_slicing() + + parameter_peak_memory = torch.cuda.max_memory_allocated( + device=f"cuda:{local_rank}") + + if engine_config.runtime_config.use_torch_compile: + torch._inductor.config.reorder_for_compute_comm_overlap = True + pipe.transformer.compile() + + # one step to warmup the torch compiler + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=1, + guidance_scale=input_config.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=input_config.num_inference_steps, + guidance_scale=input_config.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + + end_time = time.time() + elapsed_time = end_time - start_time + peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + + parallel_info = ( + f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" + f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" + f"tp{engine_args.tensor_parallel_degree}_" + f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" + ) + if is_dp_last_group(): + resolution = f"{input_config.width}x{input_config.height}" + output_filename = f"results/hunyuan_video_{parallel_info}_{resolution}.mp4" + export_to_video(output, 
output_filename, fps=15) + print(f"output saved to {output_filename}") + + if get_world_group().rank == get_world_group().world_size - 1: + print( + f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9} GB" + ) + get_runtime_state().destroy_distributed_env() + + +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 320 --width 512 --num_frames 61 --enable_tiling --enable_model_cpu_offload +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 544 --width 960 --num_frames 129 --enable_tiling --enable_model_cpu_offload +if __name__ == "__main__": + main() diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py new file mode 100644 index 0000000000000000000000000000000000000000..ffca2c6eb0de7a1cfc9610b1aa3b32b406b6e44b --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/hunyuan_video_usp_example_teacache.py @@ -0,0 +1,180 @@ +# from https://github.com/chengzeyi/ParaAttention/blob/main/examples/run_hunyuan_video.py +import functools +from typing import Any, Dict, Union, Optional +import logging +import time + +import torch + +from diffusers import DiffusionPipeline, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel +from xfuser import xFuserHunyuanVideoPipeline +from diffusers.models.modeling_outputs import Transformer2DModelOutput +from diffusers.utils import scale_lora_layers, unscale_lora_layers, USE_PEFT_BACKEND +from diffusers.utils import export_to_video +from xfuser.model_executor.models.customized.hunyuan_video.tp_applicator import TensorParallelApplicator +from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from xfuser import xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + get_data_parallel_world_size, + get_data_parallel_rank, + get_runtime_state, + get_classifier_free_guidance_world_size, + get_classifier_free_guidance_rank, + get_cfg_group, + get_sequence_parallel_world_size, + get_sequence_parallel_rank, + get_sp_group, + is_dp_last_group, + initialize_runtime_state, + get_pipeline_parallel_world_size, +) + +from xfuser.model_executor.layers.attention_processor import xFuserHunyuanVideoAttnProcessor2_0 + +assert xFuserHunyuanVideoAttnProcessor2_0 is not None +from w8a8_linear import apply_quant_linear_i8w8o16 + + +def main(): + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + + assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." 
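+    # TeaCache / first-block cache reuse transformer outputs on denoising steps whose
+    # timestep embeddings change little; rel_l1_thresh (set below) controls how much
+    # accumulated change forces a full recompute (higher = faster but lossier).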
+ # assert engine_args.use_parallel_vae is False, "parallel VAE not implemented for HunyuanVideo" + + transformer = HunyuanVideoTransformer3DModel.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + subfolder="transformer", + torch_dtype=torch.bfloat16, + revision="refs/pr/18", + ) + rel_l1_thresh =0.12 + if engine_args.use_fbcache: + rel_l1_thresh = 0.06 + cache_args = { + "use_teacache": engine_args.use_teacache, + "use_fbcache": engine_args.use_fbcache, + "rel_l1_thresh": rel_l1_thresh, + "return_hidden_states_first": True, + "num_steps": input_config.num_inference_steps, + } + # pipe = HunyuanVideoPipeline.from_pretrained( + pipe = xFuserHunyuanVideoPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + transformer=transformer, + torch_dtype=torch.float16, + revision="refs/pr/18", + engine_config=engine_config, + cache_args=cache_args, + ) + + # initialize_runtime_state(pipe, engine_config) + get_runtime_state().set_video_input_parameters( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + batch_size=1, + num_inference_steps=input_config.num_inference_steps, + split_text_embed_in_sp=get_pipeline_parallel_world_size() == 1, + ) + + + if args.tensor_parallel_degree > 1: + tp_applicator = TensorParallelApplicator(get_tensor_model_parallel_world_size(), get_tensor_model_parallel_rank()) + tp_applicator.apply_to_model(pipe.transformer) + tp_applicator.apply_to_llamamodel(pipe.text_encoder) + + pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer) + pipe.text_encoder=apply_quant_linear_i8w8o16(pipe.text_encoder) + + if args.enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} sequential CPU offload enabled") + elif args.enable_model_cpu_offload: + pipe.enable_model_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} model CPU offload enabled") + else: + device = torch.device(f"cuda:{local_rank}") + pipe = pipe.to(device) + + if args.enable_tiling: + pipe.vae.enable_tiling() + + if args.enable_slicing: + pipe.vae.enable_slicing() + + parameter_peak_memory = torch.cuda.max_memory_allocated( + device=f"cuda:{local_rank}") + + if engine_config.runtime_config.use_torch_compile: + torch._inductor.config.reorder_for_compute_comm_overlap = True + pipe.transformer = torch.compile(pipe.transformer, + mode="max-autotune-no-cudagraphs") + + # one step to warmup the torch compiler + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=1, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + warmup =False + if warmup: + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=1, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + output = pipe( + height=input_config.height, + width=input_config.width, + num_frames=input_config.num_frames, + prompt=input_config.prompt, + num_inference_steps=input_config.num_inference_steps, + generator=torch.Generator(device="cuda").manual_seed( + input_config.seed), + ).frames[0] + + end_time = time.time() + elapsed_time = end_time - start_time + peak_memory = 
torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}") + + parallel_info = ( + f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_" + f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_" + f"tp{engine_args.tensor_parallel_degree}_" + f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}" + ) + if is_dp_last_group(): + resolution = f"{input_config.width}x{input_config.height}" + output_filename = f"results/hunyuan_video_{parallel_info}_{resolution}.mp4" + export_to_video(output, output_filename, fps=15) + print(f"output saved to {output_filename}") + + if get_world_group().rank == get_world_group().world_size - 1: + print( + f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, memory: {peak_memory/1e9} GB" + ) + get_runtime_state().destroy_distributed_env() + + +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 320 --width 512 --num_frames 61 --enable_tiling --enable_model_cpu_offload +# mkdir -p results && torchrun --nproc_per_node=2 examples/hunyuan_video_usp_example.py --model tencent/HunyuanVideo --ulysses_degree 2 --num_inference_steps 30 --warmup_steps 0 --prompt "A cat walks on the grass, realistic" --height 544 --width 960 --num_frames 129 --enable_tiling --enable_model_cpu_offload +if __name__ == "__main__": + main() diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt b/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a81dd7eef2aec1fab733810b1ed8531ac1515a4 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/requirements.txt @@ -0,0 +1,9 @@ +yunchang +diffusers +ftfy +transformers>=4.55 +numpy==1.26.4 +imageio +imageio-ffmpeg +distvae + diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh new file mode 100644 index 0000000000000000000000000000000000000000..ba4dd3ed3735cae1084a6130987c3e41d8bd93f7 --- /dev/null +++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -x + +export PYTHONPATH=$PWD:$PYTHONPATH +export NCCL_USE_HIGHPRIORITYWARP=1 +export ENABLE_IXFORMER_INFERENCE=1 +export ATTN_OPT_LEVEL=2 #xdit >=0.4.5 +export ENABLE_IXFORMER_SAGEATTN=1 #使用 sageattention,#xdit ==0.4.4 + +SCRIPT="hunyuan_video_usp_example.py" +MODEL_ID="/data/nlp/HunyuanVideo/" +INFERENCE_STEP=50 +mkdir -p ./results + +TASK_ARGS="--height 720 --width 1280 --num_frames 133 --guidance_scale 5.0" + +N_GPUS=8 +PARALLEL_ARGS="--ulysses_degree 4 --ring_degree 2" +ENABLE_TILING="--enable_tiling" +ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload" +COMPILE_FLAG="--use_torch_compile" + +torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \ +--model $MODEL_ID \ +$PARALLEL_ARGS \ +$TASK_ARGS \ +$PIPEFUSION_ARGS \ +$OUTPUT_ARGS \ +--num_inference_steps $INFERENCE_STEP \ +--warmup_steps 0 \ +--prompt "A cat walks on the grass, realistic" \ +$CFG_ARGS \ +$PARALLLEL_VAE \ +$ENABLE_TILING \ +$ENABLE_MODEL_CPU_OFFLOAD \ +$COMPILE_FLAG diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh 
b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh
new file mode 100644
index 0000000000000000000000000000000000000000..fab6e8fceeb13793e330cf90b4e9c21b1543df30
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/run_hunyuan_video_usp_teacache.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+set -x
+
+export PYTHONPATH=$PWD:$PYTHONPATH
+export NCCL_USE_HIGHPRIORITYWARP=1
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2 # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use SageAttention (xdit == 0.4.4)
+
+SCRIPT="hunyuan_video_usp_example_teacache.py"
+MODEL_ID="/data/nlp/HunyuanVideo/"
+
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+TASK_ARGS="--height 720 --width 1280 --num_frames 129 --seed 24"
+
+# HunyuanVideo parallel configuration
+N_GPUS=8
+PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 8"
+ENABLE_TILING="--enable_tiling"
+ENABLE_MODEL_CPU_OFFLOAD="--enable_model_cpu_offload"
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A cat walks on the grass, realistic" \
+$CFG_ARGS \
+$PARALLEL_VAE \
+$ENABLE_TILING \
+$ENABLE_MODEL_CPU_OFFLOAD \
+$COMPILE_FLAG \
+--use_teacache
+
diff --git a/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py b/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py
new file mode 100644
index 0000000000000000000000000000000000000000..d320f03d0eae209c68db65e1f2ea438cb79a708c
--- /dev/null
+++ b/models/multimodal/diffusion_model/hunyuan_video/xdit/w8a8_linear.py
@@ -0,0 +1,106 @@
+import torch
+from typing import Optional
+from torch.nn.parameter import Parameter
+from ixformer.inference.functions.w8a8 import w8a8, dynamic_scaled_int8_quant
+
+
+def perchannel_quantize_weight_int8(weight: torch.Tensor):
+    # symmetric per-output-channel int8 quantization: scale = max|w| / 127
+    weight = weight.cpu().to(torch.float32)
+    n_bit = 8
+    eps = 1e-5
+    max_int = 2**(n_bit - 1) - 1
+    min_int = -(2**(n_bit - 1) - 1)
+    max_val = weight.abs().amax(dim=-1, keepdim=True)
+    # max_val = max_val.clamp(min=eps)
+    qscale = max_val / max_int
+    qweight = torch.clamp(torch.round(weight * (1.0 / qscale)), min_int,
+                          max_int).to(torch.int8)
+    qscale = qscale.squeeze().to(torch.float32)
+    return qweight, qscale
+
+
+class DynamicQuantizeLinear(torch.nn.Module):
+    """Drop-in replacement for nn.Linear: int8 weights, dynamically quantized int8 activations."""
+
+    def __init__(self,
+                 unquantized: torch.nn.Module,
+                 output_dtype: Optional[torch.dtype] = None,
+                 ):
+        super().__init__()
+        assert isinstance(unquantized, torch.nn.Linear)
+        self.in_features = unquantized.in_features
+        self.out_features = unquantized.out_features
+
+        self.device = unquantized.weight.device
+        self.output_dtype = output_dtype
+
+        qweight, qscale = perchannel_quantize_weight_int8(unquantized.weight)
+        self.weight = Parameter(qweight.to(self.device), requires_grad=False)
+        self.scale = Parameter(qscale.to(self.device), requires_grad=False)
+
+        if unquantized.bias is not None:
+            self.bias = unquantized.bias.to(self.device)
+        else:
+            self.register_parameter("bias", None)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        device = self.weight.device
+        assert x.device == device
+        output_dtype = x.dtype if self.output_dtype is None else self.output_dtype
+        # quantize activations on the fly (per-token scales), then run the int8 GEMM
+        inputs = torch.empty(x.shape, dtype=torch.int8, device=device)
+        i_scales = torch.empty(x.shape[:-1], dtype=torch.float32, device=device)
+        dynamic_scaled_int8_quant(inputs, x.contiguous(), i_scales)
+
+        output = torch.empty(
+            (inputs.shape[:-1] +
(self.weight.shape[0],)), + dtype=output_dtype, + device=device, + ) + + out = w8a8(inputs, self.weight, i_scales, self.scale,self.bias, output) + # if self.bias is not None: + # out =out +self.bias + return out + +def _is_linear(mod, *args): + # return isinstance(mod, torch.nn.Linear) and args[0] in ["to_qkv", "to_added_qkv", "proj"] + # if isinstance(mod, torch.nn.Linear): + # print(args[0]) + return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ("attn1" in args[0] or "attn" in args[0] or "ff" in args[0] or "proj_mlp" in args[0] or "proj_out" in args[0]) + +def _is_linear_flux(mod, *args): + # return isinstance(mod, torch.nn.Linear) and args[0] in ["to_qkv", "to_added_qkv", "proj"] + # if isinstance(mod, torch.nn.Linear): + # print(args[0]) + return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ( "attn" in args[0] or "ff" in args[0] or "proj_out" in args[0] ) + +def _is_linear_sd3(mod, *args): + return isinstance(mod, torch.nn.Linear) and "transformer" in args[0] and ("attn" in args[0] or "ff" in args[0] or "proj_out" in args[0]) + +def _is_linear_hunyuandit(mod, *args): + return isinstance(mod, torch.nn.Linear) and "blocks" in args[0] + +def _is_wan_linear(mod, *args): + return isinstance(mod, torch.nn.Linear) and ("attn1" in args[0] or "attn" in args[0] or "attn2" in args[0] or "ffn" in args[0] or "proj_out" in args[0]) + + +def apply_quant_linear_i8w8o16(model, cls=DynamicQuantizeLinear, filter_fn = None): + if filter_fn is None: + filter_fn = _is_linear + if type(model).__name__ == "FluxTransformer2DModel" or type(model).__name__ == "xFuserFluxTransformer2DWrapper": + filter_fn = _is_linear_flux + elif type(model).__name__ == "HunyuanDiT2D" or type(model).__name__ == "xFuserHunyuanDiT2DWrapper": + filter_fn = _is_linear_hunyuandit + # elif type(model).__name__ == "SD3Transformer2DModel" or type(model).__name__ == "xFuserSD3Transformer2DWrapper": + # filter_fn = _is_linear_sd3 + elif type(model).__name__ == "WanTransformer3DModel" or type(model).__name__ == "xFuserWanTransformer3DModelWrapper": + filter_fn = _is_wan_linear + # for name, child in model.named_children(): + # if filter_fn(child, name): + # setattr(model, name, cls(child)) + # else: + # apply_quant_linear_i8w8o16(child, cls, filter_fn) + for name, m in model.named_modules(): + if filter_fn(m,name): + parent_module_name, child_name = name.rsplit('.', 1) if '.' in name else ('', name) + parent_module = model.get_submodule(parent_module_name) + # print(parent_module_name,name) + setattr(parent_module, child_name, cls(m)) + return model \ No newline at end of file diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4dc2cd9c081086e96d65253f691053f126db1a5b --- /dev/null +++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md @@ -0,0 +1,50 @@ +# Stable Diffusion 3 Medium (xDiT) + +## Model Description + +Stable Diffusion 3 Medium is Stability AI's latest text-to-image diffusion model, featuring significant improvements in image quality, prompt adherence, and typography rendering. It uses a new Multimodal Diffusion Transformer (MMDiT) architecture with separate sets of weights for text and image encoders. + +This version runs on the xDiT framework, optimized for Iluvatar CoreX GPUs. 
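+
+As a rule of thumb in xDiT, the product of the configured parallel degrees (times 2 when --use_cfg_parallel is on) should equal the number of ranks launched by torchrun; the provided run_sd3.sh pairs pipefusion_parallel_degree=2 with ring_degree=2 on N_GPUS=4. A minimal 2-GPU variant (an illustrative sketch, not a shipped configuration):
+
+```bash
+N_GPUS=2
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 1 --data_parallel_degree 1"
+```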
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install the Iluvatar CoreX adapted frameworks:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install the remaining dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify the model path:
+```bash
+vim run_sd3.sh
+# Update MODEL_ID to your actual model path
+```
+
+2. Run the script:
+```bash
+bash run_sd3.sh
+```
+
+## References
+
+- [Stable Diffusion 3](https://github.com/Stability-AI/stable-diffusion)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2969a4385d913c98a2cb13adfa2bb29f3d3f0938
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers>=4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh
new file mode 100644
index 0000000000000000000000000000000000000000..98f21c9fd429500abc55db01ac637626bbe7f546
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/run_sd3.sh
@@ -0,0 +1,45 @@
+# set -x
+export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# multiple rings showed no improvement here
+# export NCCL_USE_HIGHPRIORITYWARP=1
+
+export ENABLE_IXFORMER_INFERENCE=1
+export ATTN_OPT_LEVEL=2           # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # use sageattention; xdit == 0.4.4
+export ENABLE_IXFORMER_W8A8LINEAR=1
+
+SCRIPT=sd3_example.py
+MODEL_ID=/data/nlp/stable-diffusion-3-medium-diffusers
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+# task args
+TASK_ARGS="--height 1024 --width 1024 --no_use_resolution_binning --guidance_scale 3.5"
+
+N_GPUS=4
+PARALLEL_ARGS="--pipefusion_parallel_degree 2 --ulysses_degree 1 --ring_degree 2 --tensor_parallel_degree 1 --data_parallel_degree 1"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 1 \
+--prompt "brown dog laying on the ground with a metal bowl in front of him." \
+$CFG_ARGS \
+$PARALLEL_VAE \
+$COMPILE_FLAG \
+$QUANTIZE_FLAG \
+$CACHE_ARGS
+
diff --git a/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dc86bd64ff213a6884a1750f22e0aac67bef071
--- /dev/null
+++ b/models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/sd3_example.py
@@ -0,0 +1,132 @@
+import time
+import os
+import torch
+import torch.distributed
+import torch.nn as nn
+from transformers import T5EncoderModel
+from xfuser import xFuserStableDiffusion3Pipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_data_parallel_rank,
+    get_runtime_state,
+)
+from xfuser.core.distributed.parallel_state import get_data_parallel_world_size
+
+from apex.normalization.fused_layer_norm import FusedRMSNorm
+
+
+class T5LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        # Compute variance without subtracting the mean (RMSNorm).
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+
+        # Cast back to half precision if needed.
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+
+        return self.weight * hidden_states
+
+
+# Work around an apex FusedRMSNorm issue
+# (https://github.com/huggingface/transformers/issues/20287) by swapping in a
+# plain T5-style RMSNorm.
+def replace_fused_rmsnorm_with_t5(module):
+    for name, child in module.named_children():
+        if isinstance(child, FusedRMSNorm):
+            hidden_size = child.weight.shape[0]
+            eps = getattr(child, "eps", 1e-6)
+            new_ln = T5LayerNorm(hidden_size, eps=eps)
+            new_ln.weight.data = child.weight.data.clone()
+            setattr(module, name, new_ln)
+        else:
+            replace_fused_rmsnorm_with_t5(child)
+
+
+def main():
+    parser = FlexibleArgumentParser(description="xFuser Arguments")
+    args = xFuserArgs.add_cli_args(parser).parse_args()
+    engine_args = xFuserArgs.from_cli_args(args)
+    engine_config, input_config = engine_args.create_config()
+    local_rank = get_world_group().local_rank
+    torch.cuda.set_device(local_rank)
+    text_encoder_3 = T5EncoderModel.from_pretrained(engine_config.model_config.model, subfolder="text_encoder_3", torch_dtype=torch.float16)
+    if args.use_fp8_t5_encoder:
+        from optimum.quanto import freeze, qfloat8, quantize
+        print(f"rank {local_rank} quantizing text encoder 3")
+        quantize(text_encoder_3, weights=qfloat8)
+        freeze(text_encoder_3)
+
+    pipe = xFuserStableDiffusion3Pipeline.from_pretrained(
+        pretrained_model_name_or_path=engine_config.model_config.model,
+        engine_config=engine_config,
+        torch_dtype=torch.float16,
+        text_encoder_3=text_encoder_3,
+    ).to(f"cuda:{local_rank}")
+
+    replace_fused_rmsnorm_with_t5(text_encoder_3)
+    parameter_peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+    if os.environ.get("ENABLE_IXFORMER_W8A8LINEAR", "0") == "1":
+        from w8a8_linear import apply_quant_linear_i8w8o16
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+
+    pipe.prepare_run(input_config)
+
+    torch.cuda.reset_peak_memory_stats()
+    start_time = time.time()
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        prompt=input_config.prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        output_type=input_config.output_type,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+    )
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_allocated(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    if input_config.output_type == "pil":
+        dp_group_index = get_data_parallel_rank()
+        num_dp_groups = get_data_parallel_world_size()
+        dp_batch_size = (input_config.batch_size + num_dp_groups - 1) // num_dp_groups
+        if pipe.is_dp_last_group():
+            if not os.path.exists("results"):
+                os.mkdir("results")
+            for i, image in enumerate(output.images):
+                image_rank = dp_group_index * dp_batch_size + i
+                image.save(
+                    f"./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
+                )
+                print(
+                    f"image {i} saved to ./results/stable_diffusion_3_result_{parallel_info}_{image_rank}.png"
+                )
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(
+            f"epoch time: {elapsed_time:.2f} sec, parameter memory: {parameter_peak_memory/1e9:.2f} GB, peak memory: {peak_memory/1e9:.2f} GB"
+        )
+
+    get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..01a4357fba4260239c00a819d7408237777602d2
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md
@@ -0,0 +1,61 @@
+# Wan2.1-T2V-14B-Diffusers (xDiT)
+
+## Model Description
+
+Wan2.1-T2V-14B is Wan AI's large-scale text-to-video diffusion model with 14B parameters. It generates high-quality, cinematic videos from text prompts with excellent motion dynamics and visual fidelity.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install the Iluvatar CoreX adapted frameworks:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install the remaining dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify the model path:
+```bash
+vim run_wan_2.1_t2v_14b.sh
+# Update MODEL_ID to your actual model path
+# Modify TASK_ARGS if needed (see the example after step 3)
+```
+
+2. Run the script:
+```bash
+bash run_wan_2.1_t2v_14b.sh
+```
+
+3. The script supports batch size 1 or 2 (BS=1/BS=2). The prompt format for each:
+```bash
+# BS=1 (default) prompt format
+--prompt "一个虎虎生威的老虎" \
+--negative_prompt "畸形,光照不好" \
+# BS=2 prompt format
+--prompt "一个虎虎生威的老虎" "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
+--negative_prompt "畸形,光照不好" "畸形,光照不好" \
+```
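+
+4. (Optional) Adjust the resolution and clip length through `TASK_ARGS`. The flags below are the ones the script already uses; the larger values are illustrative only, not a validated configuration, and must fit your GPU memory:
+```bash
+# Hypothetical override inside run_wan_2.1_t2v_14b.sh
+TASK_ARGS="--height 720 --width 1280 --num_frames 81 --seed 33"
+```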
+
+## References
+
+- [Wan2.1](https://github.com/Wan-Video/Wan2.1)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fa5b1c5f7e309593a6e11bde46979d4c4255b4b
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers==4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..133c5fd10a7c2520a5bf27710d3b5e5850616537
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/run_wan_2.1_t2v_14b.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -x
+export WORD_RANK_SUPPORT_TP=1
+export ATTN_OPT_LEVEL=2           # xdit >= 0.4.5
+export ENABLE_IXFORMER_SAGEATTN=1 # xdit == 0.4.4
+export TOKENIZERS_PARALLELISM=true
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Wan2.1 configuration
+SCRIPT="wan2.1_t2v_example.py"
+MODEL_ID="/data/nlp/Wan2.1-T2V-14B-Diffusers/"
+INFERENCE_STEP=20
+
+mkdir -p ./results
+
+# Wan2.1 task args
+TASK_ARGS="--height 480 --width 832 --num_frames 33 --seed 33 "
+
+# Wan2.1 parallel configuration
+N_GPUS=4
+PARALLEL_ARGS="--ulysses_degree 1 --ring_degree 1 --tensor_parallel_degree 2"
+CFG_ARGS="--use_cfg_parallel"
+
+# Uncomment and modify these as needed
+# PIPEFUSION_ARGS="--num_pipeline_patch 8"
+# OUTPUT_ARGS="--output_type latent"
+# PARALLEL_VAE="--use_parallel_vae"
+# ENABLE_TILING="--enable_tiling"
+# MODEL_OFFLOAD="--enable_model_cpu_offload"
+ENABLE_CACHE="--use_teacache"
+COMPILE_FLAG="--use_torch_compile"
+# ENABLE_W8A8="--use_w8a8_linear"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$ENABLE_W8A8 \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "A rainy night in a dense cyberpunk market, neon kanji signs flicker overhead. The camera starts shoulder-height behind a hooded courier, steadily tracking forward as he weaves through crowds of holographic umbrellas. Volumetric pink-blue backlight cuts through steam vents, puddles mirror the glow. Lens flare, shallow depth of field. Moody, Blade-Runner vibe." \
+--negative_prompt "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
+$ENABLE_TILING \
+$ENABLE_CACHE \
+$COMPILE_FLAG \
+$CFG_ARGS
diff --git a/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..73da9b094935591b99a81d05dca9d17809f0ec97
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/wan2.1_t2v_example.py
@@ -0,0 +1,131 @@
+import logging
+import time
+import torch
+import torch.distributed
+from xfuser import xFuserWanPipeline, xFuserArgs
+from xfuser.config import FlexibleArgumentParser
+from xfuser.core.distributed import (
+    get_world_group,
+    get_runtime_state,
+)
+
+from xfuser.model_executor.cache.teacache.backend import TeaCacheBackend
+from xfuser.model_executor.cache.data import DiffusionCacheConfig
+from diffusers.utils import export_to_video
+
+
+def main():
+    parser = FlexibleArgumentParser(description="xFuser Arguments")
+    args = xFuserArgs.add_cli_args(parser).parse_args()
+    engine_args = xFuserArgs.from_cli_args(args)
+
+    engine_config, input_config = engine_args.create_config()
+    local_rank = get_world_group().local_rank
+
+    assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion."
+
+    pipe = xFuserWanPipeline.from_pretrained(
+        pretrained_model_name_or_path=engine_config.model_config.model,
+        engine_config=engine_config,
+        torch_dtype=torch.bfloat16,
+    )
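+    # TeaCache note: rel_l1_thresh trades quality for speed -- the larger the
+    # threshold, the more diffusion steps reuse cached transformer outputs.
+    # The 0.2 value and the polynomial coefficients below are taken from the
+    # upstream TeaCache4Wan2.1 reference linked underneath.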
+    # https://github.com/ali-vilab/TeaCache/blob/main/TeaCache4Wan2.1/teacache_generate.py#L892
+    if engine_args.use_teacache:
+        config = DiffusionCacheConfig(
+            rel_l1_thresh=0.2,
+            coefficients=[-5784.54975374, 5449.50911966, -1811.16591783, 256.27178429, -13.02252404],
+        )
+        backend = TeaCacheBackend(config)
+        backend.enable(pipe, transformer_key="transformer")
+        backend.refresh(pipe, input_config.num_inference_steps, transformer_key="transformer")
+
+    if args.enable_sequential_cpu_offload:
+        pipe.enable_sequential_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} sequential CPU offload enabled")
+    elif args.enable_model_cpu_offload:
+        pipe.enable_model_cpu_offload(gpu_id=local_rank)
+        logging.info(f"rank {local_rank} model CPU offload enabled")
+    else:
+        device = torch.device(f"cuda:{local_rank}")
+        pipe = pipe.to(device)
+
+    if args.enable_tiling:
+        pipe.vae.enable_tiling()
+
+    if args.enable_slicing:
+        pipe.vae.enable_slicing()
+
+    if args.use_easycache:
+        cache_kwargs = {
+            "use_easycache": True,
+            "cache_thresh": 0.02,  # EasyCache threshold
+        }
+    else:
+        cache_kwargs = None
+
+    if engine_args.use_w8a8_linear:
+        from w8a8_linear import apply_quant_linear_i8w8o16
+        pipe.transformer = apply_quant_linear_i8w8o16(pipe.transformer)
+
+    # warmup (optional)
+    # output = pipe(
+    #     height=input_config.height,
+    #     width=input_config.width,
+    #     num_frames=input_config.num_frames,
+    #     prompt=input_config.prompt,
+    #     num_inference_steps=1,
+    #     generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+    # ).frames
+
+    torch.cuda.reset_peak_memory_stats()
+    start_time = time.time()
+
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        num_frames=input_config.num_frames,
+        prompt=input_config.prompt,
+        negative_prompt=input_config.negative_prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+        cache_kwargs=cache_kwargs,
+    )
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_reserved(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"tp{engine_args.tensor_parallel_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    resolution = f"{input_config.width}x{input_config.height}"
+    for i, frames in enumerate(output.frames):
+        output_filename = f"results/wan2.1_t2v_14b_{i}_{parallel_info}_{resolution}.mp4"
+        export_to_video(frames, output_filename, fps=16)
+        print(f"output saved to {output_filename}")
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9:.2f} GB")
+    # get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad0ac17341e34ab57603a858950e5a6f3547b2dd
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md
@@ -0,0 +1,51 @@
+# Wan2.2-TI2V-5B-Diffusers (xDiT)
+
+## Model Description
+
+Wan2.2-TI2V-5B is Wan AI's text- and image-to-video (TI2V) diffusion model with 5B parameters. It generates smooth, high-quality videos from text prompts or input images, maintaining visual consistency and adding natural motion.
+
+This model runs on the xDiT framework, optimized for Iluvatar CoreX GPUs.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+1. Install the Iluvatar CoreX adapted frameworks:
+```bash
+pip install diffusers-{version}-py3-none-any.whl
+pip install xfuser-{version}+corex.{v.r.m}-py3-none-any.whl
+```
+
+2. Install the remaining dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Model Inference
+
+1. The scripts are pre-copied in this directory. Modify the model path:
+```bash
+vim run_wan_2.2_t2v_5b.sh
+# Update MODEL_ID to your actual model path
+# Modify TASK_ARGS if needed
+```
+
+2. Run the script:
+```bash
+bash run_wan_2.2_t2v_5b.sh
+```
+
+## References
+
+- [Wan2.2](https://github.com/Wan-Video/Wan2.2)
+- [xDiT](https://github.com/xdit-team/xDiT)
\ No newline at end of file
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fa5b1c5f7e309593a6e11bde46979d4c4255b4b
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/requirements.txt
@@ -0,0 +1,9 @@
+#diffusers
+yunchang
+ftfy
+transformers==4.55
+numpy==1.26.4
+imageio
+imageio-ffmpeg
+distvae
+
diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh
new file mode 100644
index 0000000000000000000000000000000000000000..9bea267c7861522207d604e3c115da59b6da9ff4
--- /dev/null
+++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/run_wan_2.2_t2v_5b.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+set -x
+# export ATTN_OPT_LEVEL=2 # xdit >= 0.4.5
+export PYTHONPATH=$PWD:$PYTHONPATH
+
+# Wan2.2 configuration
+SCRIPT="wan2.2_t2v_example.py"
+MODEL_ID="/data/nlp/Wan2.2-TI2V-5B-Diffusers/"
+INFERENCE_STEP=50
+
+mkdir -p ./results
+
+# Wan2.2 task args
+TASK_ARGS="--height 704 --width 1280 --num_frames 33 --seed 32 "
+
+# Wan2.2 parallel configuration
+N_GPUS=4
+PARALLEL_ARGS="--ulysses_degree 2 --ring_degree 1 --tensor_parallel_degree 1"
+CFG_ARGS="--use_cfg_parallel"
+
+# Uncomment and modify these as needed
+# PIPEFUSION_ARGS="--num_pipeline_patch 8"
+# OUTPUT_ARGS="--output_type latent"
+# PARALLEL_VAE="--use_parallel_vae"
+# ENABLE_TILING="--enable_tiling"
+# MODEL_OFFLOAD="--enable_model_cpu_offload"
+# ENABLE_CACHE="--use_teacache"
+# COMPILE_FLAG="--use_torch_compile"
+
+torchrun --nproc_per_node=$N_GPUS ./$SCRIPT \
+--model $MODEL_ID \
+$PARALLEL_ARGS \
+$TASK_ARGS \
+$PIPEFUSION_ARGS \
+$OUTPUT_ARGS \
+--num_inference_steps $INFERENCE_STEP \
+--warmup_steps 0 \
+--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage" \
"色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \ +$ENABLE_TILING \ +$ENABLE_CACHE \ +$COMPILE_FLAG \ +$CFG_ARGS diff --git a/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py new file mode 100644 index 0000000000000000000000000000000000000000..346b9e937cba7a0c4add5b860732fab7225e2818 --- /dev/null +++ b/models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/wan2.2_t2v_example.py @@ -0,0 +1,131 @@ +import logging +import time +import torch +import torch.distributed +from diffusers import AutoencoderKLTemporalDecoder +from xfuser import xFuserWanPipeline, xFuserArgs +from xfuser.config import FlexibleArgumentParser +from xfuser.core.distributed import ( + get_world_group, + get_data_parallel_rank, + get_data_parallel_world_size, + get_runtime_state, + is_dp_last_group, + get_world_group +) +from diffusers import WanPipeline + +from xfuser.core.distributed.parallel_state import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from diffusers.utils import export_to_video +from xfuser.model_executor.cache.teacache.backend import TeaCacheBackend +from xfuser.model_executor.cache.data import DiffusionCacheConfig + + +def main(): + parser = FlexibleArgumentParser(description="xFuser Arguments") + args = xFuserArgs.add_cli_args(parser).parse_args() + engine_args = xFuserArgs.from_cli_args(args) + + engine_config, input_config = engine_args.create_config() + local_rank = get_world_group().local_rank + + assert engine_args.pipefusion_parallel_degree == 1, "This script does not support PipeFusion." + + pipe = xFuserWanPipeline.from_pretrained( + pretrained_model_name_or_path=engine_config.model_config.model, + engine_config=engine_config, + torch_dtype=torch.bfloat16, + ) + + if engine_args.use_teacache: + config = DiffusionCacheConfig(rel_l1_thresh = 0.2, + coefficients = [ + 6.85271205e+04, + -9.88214072e+03, + 5.08858742e+02, + -7.39731467e+00, + 1.22746295e-01,]) + backend = TeaCacheBackend(config) + backend.enable(pipe,transformer_key = "transformer_2") + backend.refresh(pipe, input_config.num_inference_steps, transformer_key = "transformer_2") + + + if args.enable_sequential_cpu_offload: + pipe.enable_sequential_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} sequential CPU offload enabled") + elif args.enable_model_cpu_offload: + pipe.enable_model_cpu_offload(gpu_id=local_rank) + logging.info(f"rank {local_rank} model CPU offload enabled") + else: + device = torch.device(f"cuda:{local_rank}") + pipe = pipe.to(device) + + if args.enable_tiling: + pipe.vae.enable_tiling() + + if args.enable_slicing: + pipe.vae.enable_slicing() + + if engine_args.use_w8a8_linear: + from w8a8_linear import apply_quant_linear_i8w8o16 + pipe.transformer=apply_quant_linear_i8w8o16(pipe.transformer) + + # warmup + # output = pipe( + # height=input_config.height, + # width=input_config.width, + # num_frames=input_config.num_frames, + # prompt=input_config.prompt, + # num_inference_steps=1, + # generator=torch.Generator(device="cuda").manual_seed(input_config.seed), + # ).frames + + torch.cuda.reset_peak_memory_stats() + start_time = time.time() + + + if args.use_easycache: + cache_kwargs = { + "use_easycache":True, + "cache_thresh":0.02, #easy eacch thresh + #"ret_steps":10 + } + else: + cache_kwargs = None + + output = pipe( + height=input_config.height, + width=input_config.width, + 
+    output = pipe(
+        height=input_config.height,
+        width=input_config.width,
+        num_frames=input_config.num_frames,
+        prompt=input_config.prompt,
+        negative_prompt=input_config.negative_prompt,
+        num_inference_steps=input_config.num_inference_steps,
+        guidance_scale=input_config.guidance_scale,
+        generator=torch.Generator(device="cuda").manual_seed(input_config.seed),
+        cache_kwargs=cache_kwargs,
+    )
+
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    peak_memory = torch.cuda.max_memory_reserved(device=f"cuda:{local_rank}")
+
+    parallel_info = (
+        f"dp{engine_args.data_parallel_degree}_cfg{engine_config.parallel_config.cfg_degree}_"
+        f"ulysses{engine_args.ulysses_degree}_ring{engine_args.ring_degree}_"
+        f"tp{engine_args.tensor_parallel_degree}_"
+        f"pp{engine_args.pipefusion_parallel_degree}_patch{engine_args.num_pipeline_patch}"
+    )
+    resolution = f"{input_config.width}x{input_config.height}"
+    for i, frames in enumerate(output.frames):
+        output_filename = f"results/wan2.2_t2v_{i}_{parallel_info}_{resolution}.mp4"
+        export_to_video(frames, output_filename, fps=16)
+        print(f"output saved to {output_filename}")
+
+    if get_world_group().rank == get_world_group().world_size - 1:
+        print(f"epoch time: {elapsed_time:.2f} sec, memory: {peak_memory/1e9:.2f} GB")
+    get_runtime_state().destroy_distributed_env()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
index bf11ca80b8e0699868b7a002d488cfb1b50f5938..be0e86e64ef652cd7c502375791ce5181fe4c286 100644
--- a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md
@@ -9,6 +9,7 @@ Qwen2.5-VL is not only proficient in recognizing common objects such as flowers,
 | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
 | :----: | :----: | :----: |
 | MR-V100 | 4.3.0 | 25.09 |
+| MR-V100 | 4.4.0 | 26.03 |
 
 ## Model Preparation
 
@@ -32,6 +33,49 @@ export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1
 python3 offline_inference_vision_language.py --model /path/to/Qwen2.5-VL-3B-Instruct/ -tp 4 --trust-remote-code --temperature 0.0 --max-token 256
 ```
+
+### Qwen2.5-VL-32B-Instruct (W8A8/W4A16)
+
+#### Performance Test
+
+1. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+2. Start server:
+```bash
+vllm serve /path/to/model --max-num-seqs 1 --max-model-len 98304 --limit_mm_per_prompt '{"image": 5}' --disable-cascade-attn --tensor-parallel-size 4 --gpu_memory_utilization 0.9 --pipeline-parallel-size 1 --host 0.0.0.0 --port 8000 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+3. Run client:
+```bash
+# Use the pre-copied guidellm
+cd guidellm && pip install .
+pip install beautifulsoup4
+cd ..
+guidellm --data "prompt_tokens=512,generated_tokens=512,images=1,width=1770,height=1180" --data-type emulated --model /path/to/model --target "http://localhost:8000/v1" --max-requests 1
+```
+
+### Qwen2.5-VL-72B-Instruct (W4A16)
+
+#### Performance Test
+
+1. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+2.
Start server: +```bash +vllm serve /path/to/model --max-num-seqs 1 --max-model-len 98304 --limit_mm_per_prompt '{"image": 5}' --disable-cascade-attn --tensor-parallel-size 8 --gpu_memory_utilization 0.9 --pipeline-parallel-size 1 --host 0.0.0.0 --port 8000 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' +``` + +3. Run client: +```bash +# Same as 32B version +guidellm --data "prompt_tokens=512,generated_tokens=512,images=1,width=1770,height=1180" --data-type emulated --model /path/to/model --target "http://localhost:8000/v1" --max-requests 1 +``` + ## Model Results ### Benchmarking vLLM diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..6ab2c6e9940c580355ebf34c530ffa4fb6b5ce83 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/pyproject.toml @@ -0,0 +1,212 @@ +[build-system] +requires = ["setuptools >= 61.0", "wheel", "build"] +build-backend = "setuptools.build_meta" + + +[tool.setuptools.packages.find] +where = ["src"] +include = ["*"] + +[tool.setuptools.package-data] +guidellm = ["*"] + + +# ************************************************ +# ********** Project Metadata ********** +# ************************************************ + +[project] +name = "guidellm" +version = "0.1.0" +description = "Guidance platform for deploying and managing large language models." +readme = { file = "README.md", content-type = "text/markdown" } +requires-python = ">=3.8.0,<4.0" +license = { file = "LICENSE" } +authors = [ { name = "Neuralmagic, Inc." } ] +urls = { homepage = "https://github.com/neuralmagic/guidellm" } +dependencies = [ + "click", + "datasets", + "ftfy>=6.0.0", + "loguru", + "numpy", + "openai", + "pydantic>=2.0.0", + "pydantic-settings>=2.0.0", + "pyyaml>=6.0.0", + "requests", + "rich", + "transformers", +] + +[project.optional-dependencies] +dev = [ + # general and configurations + "pre-commit~=3.5.0", + "scipy~=1.10", + "sphinx~=7.1.2", + "tox~=4.16.0", + + # testing + "pytest~=8.2.2", + "pytest-asyncio~=0.23.8", + "pytest-cov~=5.0.0", + "pytest-mock~=3.14.0", + "pytest-rerunfailures~=14.0", + "requests-mock~=1.12.1", + + # code quality + "mypy~=1.10.1", + "ruff~=0.5.2", + + # docs quality + "mdformat~=0.7.17", + "mdformat-footnote~=0.1.1", + "mdformat-frontmatter~=2.0.8", + "mdformat-gfm~=0.3.6", + + # type-checking + "types-click~=7.1.8", + "types-PyYAML~=6.0.1", + "types-requests~=2.32.0", + "types-toml", +] + + +[project.entry-points.console_scripts] +guidellm = "guidellm.main:generate_benchmark_report_cli" +guidellm-config = "guidellm.config:print_config" + + +# ************************************************ +# ********** Code Quality Tools ********** +# ************************************************ + +[tool.black] +line-length = 88 +target-version = ['py38'] + + +[tool.isort] +profile = "black" + + +[tool.mypy] +files = ["src/guidellm", "tests"] +python_version = '3.8' +warn_redundant_casts = true +warn_unused_ignores = false +show_error_codes = true +namespace_packages = true +exclude = ["venv", ".tox"] + +# Silence "type import errors" as our 3rd-party libs does not have types +# Check: https://mypy.readthedocs.io/en/latest/config_file.html#import-discovery +follow_imports = 'silent' + +[[tool.mypy.overrides]] +module = ["datasets.*"] +ignore_missing_imports=true + + +[tool.ruff] +line-length = 88 +indent-width = 4 +exclude 
= ["build", "dist", "env", ".venv"] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" + +[tool.ruff.lint] +ignore = [ + "PLR0913", + "TCH001", + "COM812", + "ISC001", + "TCH002", + "PLW1514", # allow Path.open without encoding + "RET505", # allow `else` blocks + "RET506" # allow `else` blocks + +] +select = [ + # Rules reference: https://docs.astral.sh/ruff/rules/ + + # Code Style / Formatting + "E", # pycodestyle: checks adherence to PEP 8 conventions including spacing, indentation, and line length + "W", # pycodestyle: checks adherence to PEP 8 conventions including spacing, indentation, and line length + "A", # flake8-builtins: prevents shadowing of Python built-in names + "C", # Convention: ensures code adheres to specific style and formatting conventions + "COM", # flake8-commas: enforces the correct use of trailing commas + "ERA", # eradicate: detects commented-out code that should be removed + "I", # isort: ensures imports are sorted in a consistent manner + "ICN", # flake8-import-conventions: enforces import conventions for better readability + "N", # pep8-naming: enforces PEP 8 naming conventions for classes, functions, and variables + "NPY", # NumPy: enforces best practices for using the NumPy library + "PD", # pandas-vet: enforces best practices for using the pandas library + "PT", # flake8-pytest-style: enforces best practices and style conventions for pytest tests + "PTH", # flake8-use-pathlib: encourages the use of pathlib over os.path for file system operations + "Q", # flake8-quotes: enforces consistent use of single or double quotes + "TCH", # flake8-type-checking: enforces type checking practices and standards + "TID", # flake8-tidy-imports: enforces tidy and well-organized imports + "RUF022", # flake8-ruff: enforce sorting of __all__ in modules + + # Code Structure / Complexity + "C4", # flake8-comprehensions: improves readability and performance of list, set, and dict comprehensions + "C90", # mccabe: checks for overly complex code using cyclomatic complexity + "ISC", # flake8-implicit-str-concat: prevents implicit string concatenation + "PIE", # flake8-pie: identifies and corrects common code inefficiencies and mistakes + "R", # Refactor: suggests improvements to code structure and readability + "SIM", # flake8-simplify: simplifies complex expressions and improves code readability + + # Code Security / Bug Prevention + "ARG", # flake8-unused-arguments: detects unused function and method arguments + "ASYNC", # flake8-async: identifies incorrect or inefficient usage patterns in asynchronous code + "B", # flake8-bugbear: detects common programming mistakes and potential bugs + "BLE", # flake8-blind-except: prevents blind exceptions that catch all exceptions without handling + "E", # Error: detects and reports errors in the code + "F", # Pyflakes: detects unused imports, shadowed imports, undefined variables, and various formatting errors in string operations + "INP", # flake8-no-pep420: prevents implicit namespace packages by requiring __init__.py + "PGH", # pygrep-hooks: detects deprecated and dangerous code patterns + "PL", # Pylint: comprehensive source code analyzer for enforcing coding standards and detecting errors + "RSE", # flake8-raise: ensures exceptions are raised correctly + "S", # flake8-bandit: detects security issues and vulnerabilities in the code + "SLF", # flake8-self: prevents incorrect usage of the self argument in class methods + "T10", # flake8-debugger: detects the presence of debugging tools such as pdb + "T20", # flake8-print: 
detects print statements left in the code + "UP", # pyupgrade: automatically upgrades syntax for newer versions of Python + "W", # Warning: provides warnings about potential issues in the code + "YTT", # flake8-2020: identifies code that will break with future Python releases + + # Code Documentation + "FIX", # flake8-fixme: detects FIXMEs and other temporary comments that should be resolved +] + +[tool.ruff.lint.extend-per-file-ignores] +"tests/**/*.py" = [ + "S101", # asserts allowed in tests + "ARG", # Unused function args allowed in tests + "PLR2004", # Magic value used in comparison + "TCH002", # No import only type checking in tests + "SLF001", # enable private member access in tests + "S105", # allow hardcoded passwords in tests + "S311", # allow standard pseudo-random generators in tests + "PT011", # allow generic exceptions in tests + "N806", # allow uppercase variable names in tests + "PGH003", # allow general ignores in tests + "S106", # allow hardcoded passwords in tests + "PLR0915", # allow complext statements in tests +] + +[tool.ruff.lint.isort] +known-first-party = ["guidellm", "tests"] + + +[tool.pytest.ini_options] +addopts = '-s -vvv --cache-clear' +markers = [ + "smoke: quick tests to check basic functionality", + "sanity: detailed tests to ensure major functions work correctly", + "regression: tests to ensure that new changes do not break existing functionality" +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b10b4455ae29b9476829955b81bdfef07f515b25 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/__init__.py @@ -0,0 +1,20 @@ +""" +Guidellm is a package that provides an easy and intuitive interface for +evaluating and benchmarking large language models (LLMs). 
+""" + +# flake8: noqa + +import os + +import transformers # type: ignore + +os.environ["TOKENIZERS_PARALLELISM"] = "false" # Silence warnings for tokenizers +transformers.logging.set_verbosity_error() # Silence warnings for transformers + + +from .config import settings +from .logger import configure_logger, logger +from .main import generate_benchmark_report + +__all__ = ["configure_logger", "logger", "settings", "generate_benchmark_report"] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..13910180a77e3958a18da428932bce45aeff538a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/__init__.py @@ -0,0 +1,12 @@ +from .base import Backend, BackendEngine, BackendEnginePublic, GenerativeResponse +from .openai import OpenAIBackend +from .aiohttp import AiohttpBackend + +__all__ = [ + "Backend", + "BackendEngine", + "BackendEnginePublic", + "GenerativeResponse", + "OpenAIBackend", + "AiohttpBackend" +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py new file mode 100644 index 0000000000000000000000000000000000000000..fbbd97158fab0f547a812534eb06152e83366328 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/aiohttp.py @@ -0,0 +1,180 @@ +import base64 +import io +from typing import AsyncGenerator, Dict, List, Optional +from loguru import logger + +import aiohttp +import json + +from guidellm.backend.base import Backend, GenerativeResponse +from guidellm.config import settings +from guidellm.core import TextGenerationRequest + +__all__ = ["AiohttpBackend"] + +@Backend.register("aiohttp_server") +class AiohttpBackend(Backend): + """ + An aiohttp-based backend implementation for LLM requests. + + This class provides an interface to communicate with a server hosting + an LLM API using aiohttp for asynchronous requests. + """ + + def __init__( + self, + openai_api_key: Optional[str] = None, + target: Optional[str] = None, + model: Optional[str] = None, + timeout: Optional[float] = None, + **request_args, + ): + self._request_args: Dict = request_args + self._api_key: str = openai_api_key or settings.aiohttp.api_key + + if not self._api_key: + err = ValueError( + "`GUIDELLM__AIOHTTP__API_KEY` environment variable or " + "--openai-api-key CLI parameter must be specified for the " + "aiohttp backend." + ) + logger.error("{}", err) + raise err + + base_url = target or settings.aiohttp.base_url + self._api_url = f"{base_url}/chat/completions" + + if not base_url: + err = ValueError( + "`GUIDELLM__AIOHTTP__BASE_URL` environment variable or " + "target parameter must be specified for the OpenAI backend." + ) + logger.error("{}", err) + raise err + + self._timeout = aiohttp.ClientTimeout(total=timeout or settings.request_timeout) + self._model = model + + super().__init__(type_="aiohttp_backend", target=base_url, model=self._model) + logger.info("aiohttp {} Backend listening on {}", self._model, base_url) + + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Make a request to the aiohttp backend. 
+ + Sends a prompt to the LLM server and streams the response tokens. + + :param request: The text generation request to submit. + :type request: TextGenerationRequest + :yield: A stream of GenerativeResponse objects. + :rtype: AsyncGenerator[GenerativeResponse, None] + """ + + async with aiohttp.ClientSession(timeout=self._timeout) as session: + logger.debug("Making request to aiohttp backend with prompt: {}", request.prompt) + + request_args = {} + if request.output_token_count is not None: + request_args.update( + { + "max_completion_tokens": request.output_token_count, + "stop": None, + "ignore_eos": True, + } + ) + elif settings.aiohttp.max_gen_tokens and settings.aiohttp.max_gen_tokens > 0: + request_args.update( + { + "max_tokens": settings.aiohttp.max_gen_tokens, + } + ) + + request_args.update(self._request_args) + + messages = self._build_messages(request) + + payload = { + "model": self._model, + "messages": messages, + "stream": True, + **request_args, + } + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self._api_key}", + } + + try: + async with session.post(url=self._api_url, json=payload, headers=headers) as response: + if response.status != 200: + error_message = await response.text() + logger.error("Request failed: {} - {}", response.status, error_message) + raise Exception(f"Failed to generate response: {error_message}") + + token_count = 0 + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") + if chunk == "[DONE]": + # Final response + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + else: + # Intermediate token response + token_count += 1 + data = json.loads(chunk) + delta = data["choices"][0]["delta"] + token = delta["content"] + yield GenerativeResponse( + type_="token_iter", + add_token=token, + prompt=request.prompt, + output_token_count=token_count, + prompt_token_count=request.prompt_token_count, + ) + except Exception as e: + logger.error("Error while making request: {}", e) + raise + + def available_models(self) -> List[str]: + """ + Retrieve a list of available models from the server. + """ + # This could include an API call to `self._api_url/models` if the server supports it. + logger.warning("Fetching available models is not implemented for aiohttp backend.") + return [] + + def validate_connection(self): + """ + Validate the connection to the backend server. 
+ """ + logger.info("Connection validation is not explicitly implemented for aiohttp backend.") + + def _build_messages(self, request: TextGenerationRequest) -> Dict: + if request.number_images == 0: + messages = [{"role": "user", "content": request.prompt}] + else: + content = [] + for image in request.images: + stream = io.BytesIO() + im_format = image.image.format or "PNG" + image.image.save(stream, format=im_format) + im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8") + image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"} + content.append({"type": "image_url", "image_url": image_url}) + + content.append({"type": "text", "text": request.prompt}) + messages = [{"role": "user", "content": content}] + + return messages diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py new file mode 100644 index 0000000000000000000000000000000000000000..a165859454ac462a8dfedade0240a7e118acf50a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/base.py @@ -0,0 +1,320 @@ +import asyncio +import functools +from abc import ABC, abstractmethod +from typing import AsyncGenerator, Dict, List, Literal, Optional, Type, Union + +from loguru import logger +from pydantic import BaseModel +from transformers import ( # type: ignore # noqa: PGH003 + AutoTokenizer, + PreTrainedTokenizer, +) + +from guidellm.core import TextGenerationRequest, TextGenerationResult + +__all__ = ["Backend", "BackendEngine", "BackendEnginePublic", "GenerativeResponse"] + + +BackendEnginePublic = Literal["openai_server", "aiohttp_server"] +BackendEngine = Union[BackendEnginePublic, Literal["test"]] + + +class GenerativeResponse(BaseModel): + """ + A model representing a response from a generative AI backend. + + :param type_: The type of response, either 'token_iter' for intermediate + token output or 'final' for the final result. + :type type_: Literal["token_iter", "final"] + :param add_token: The token to add to the output + (only applicable if type_ is 'token_iter'). + :type add_token: Optional[str] + :param prompt: The original prompt sent to the backend. + :type prompt: Optional[str] + :param output: The final generated output (only applicable if type_ is 'final'). + :type output: Optional[str] + :param prompt_token_count: The number of tokens in the prompt. + :type prompt_token_count: Optional[int] + :param output_token_count: The number of tokens in the output. + :type output_token_count: Optional[int] + """ + + type_: Literal["token_iter", "final"] + add_token: Optional[str] = None + prompt: Optional[str] = None + output: Optional[str] = None + prompt_token_count: Optional[int] = None + output_token_count: Optional[int] = None + + +class Backend(ABC): + """ + Abstract base class for generative AI backends. + + This class provides a common interface for creating and interacting with different + generative AI backends. Subclasses should implement the abstract methods to + define specific backend behavior. + + :cvar _registry: A dictionary that maps BackendEngine types to backend classes. + :type _registry: Dict[BackendEngine, Type[Backend]] + :param type_: The type of the backend. + :type type_: BackendEngine + :param target: The target URL for the backend. + :type target: str + :param model: The model used by the backend. 
+ :type model: str + """ + + _registry: Dict[BackendEngine, "Type[Backend]"] = {} + + @classmethod + def register(cls, backend_type: BackendEngine): + """ + A decorator to register a backend class in the backend registry. + + :param backend_type: The type of backend to register. + :type backend_type: BackendEngine + :return: The decorated backend class. + :rtype: Type[Backend] + """ + + def inner_wrapper(wrapped_class: Type["Backend"]): + cls._registry[backend_type] = wrapped_class + logger.info("Registered backend type: {}", backend_type) + return wrapped_class + + return inner_wrapper + + @classmethod + def create(cls, backend_type: BackendEngine, **kwargs) -> "Backend": + """ + Factory method to create a backend instance based on the backend type. + + :param backend_type: The type of backend to create. + :type backend_type: BackendEngine + :param kwargs: Additional arguments for backend initialization. + :return: An instance of a subclass of Backend. + :rtype: Backend + :raises ValueError: If the backend type is not registered. + """ + + logger.info("Creating backend of type {}", backend_type) + + if backend_type not in cls._registry: + err = ValueError(f"Unsupported backend type: {backend_type}") + logger.error("{}", err) + raise err + + return Backend._registry[backend_type](**kwargs) + + def __init__(self, type_: BackendEngine, target: str, model: str): + """ + Base constructor for the Backend class. + Calls into test_connection to ensure the backend is reachable. + Ensure all setup is done in the subclass constructor before calling super. + + :param type_: The type of the backend. + :param target: The target URL for the backend. + :param model: The model used by the backend. + """ + self._type = type_ + self._target = target + self._model = model + + self.test_connection() + + @property + def default_model(self) -> str: + """ + Get the default model for the backend. + + :return: The default model. + :rtype: str + :raises ValueError: If no models are available. + """ + return _cachable_default_model(self) + + @property + def type_(self) -> BackendEngine: + """ + Get the type of the backend. + + :return: The type of the backend. + :rtype: BackendEngine + """ + return self._type + + @property + def target(self) -> str: + """ + Get the target URL for the backend. + + :return: The target URL. + :rtype: str + """ + return self._target + + @property + def model(self) -> str: + """ + Get the model used by the backend. + + :return: The model name. + :rtype: str + """ + return self._model + + def model_tokenizer(self) -> PreTrainedTokenizer: + """ + Get the tokenizer for the backend model. + + :return: The tokenizer instance. + """ + return AutoTokenizer.from_pretrained(self.model) + + def test_connection(self) -> bool: + """ + Test the connection to the backend by running a short text generation request. + If successful, returns True, otherwise raises an exception. + + :return: True if the connection is successful. + :rtype: bool + :raises ValueError: If the connection test fails. 
+ """ + try: + asyncio.get_running_loop() + is_async = True + except RuntimeError: + is_async = False + + if is_async: + logger.warning("Running in async mode, cannot test connection") + return True + + try: + request = TextGenerationRequest( + prompt="Test connection", output_token_count=5 + ) + + asyncio.run(self.submit(request)) + return True + except Exception as err: + raise_err = RuntimeError( + f"Backend connection test failed for backend type={self.type_} " + f"with target={self.target} and model={self.model} with error: {err}" + ) + logger.error(raise_err) + raise raise_err from err + + async def submit(self, request: TextGenerationRequest) -> TextGenerationResult: + """ + Submit a text generation request and return the result. + + This method handles the request submission to the backend and processes + the response in a streaming fashion if applicable. + + :param request: The request object containing the prompt + and other configurations. + :type request: TextGenerationRequest + :return: The result of the text generation request. + :rtype: TextGenerationResult + :raises ValueError: If no response is received from the backend. + """ + + logger.debug("Submitting request with prompt: {}", request.prompt) + + result = TextGenerationResult(request=request) + result.start(request.prompt) + received_final = False + + async for response in self.make_request(request): + logger.debug("Received response: {}", response) + if response.type_ == "token_iter": + result.output_token(response.add_token if response.add_token else "") + elif response.type_ == "final": + if received_final: + err = ValueError( + "Received multiple final responses from the backend." + ) + logger.error(err) + raise err + + result.end( + output=response.output, + prompt_token_count=response.prompt_token_count, + output_token_count=response.output_token_count, + ) + received_final = True + else: + err = ValueError( + f"Invalid response received from the backend of type: " + f"{response.type_} for {response}" + ) + logger.error(err) + raise err + + if not received_final: + err = ValueError("No final response received from the backend.") + logger.error(err) + raise err + + logger.info("Request completed with output: {}", result.output) + + return result + + @abstractmethod + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Abstract method to make a request to the backend. + + Subclasses must implement this method to define how requests are handled + by the backend. + + :param request: The request object containing the prompt and + other configurations. + :type request: TextGenerationRequest + :yield: A generator yielding responses from the backend. + :rtype: AsyncGenerator[GenerativeResponse, None] + """ + yield None # type: ignore # noqa: PGH003 + + @abstractmethod + def available_models(self) -> List[str]: + """ + Abstract method to get the available models for the backend. + + Subclasses must implement this method to provide the list of models + supported by the backend. + + :return: A list of available models. + :rtype: List[str] + :raises NotImplementedError: If the method is not implemented by a subclass. + """ + raise NotImplementedError + + +@functools.lru_cache(maxsize=1) +def _cachable_default_model(backend: Backend) -> str: + """ + Get the default model for a backend using LRU caching. + + This function caches the default model to optimize repeated lookups. + + :param backend: The backend instance for which to get the default model. 
+ :type backend: Backend + :return: The default model. + :rtype: str + :raises ValueError: If no models are available. + """ + logger.debug("Getting default model for backend: {}", backend) + models = backend.available_models() + if models: + logger.debug("Default model: {}", models[0]) + return models[0] + + err = ValueError("No models available.") + logger.error(err) + raise err diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py new file mode 100644 index 0000000000000000000000000000000000000000..9843fc1a06ac7fbecd2198b7317af8545a07c81a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/backend/openai.py @@ -0,0 +1,192 @@ +import base64 +import io +from typing import AsyncGenerator, Dict, List, Optional + +from loguru import logger +from openai import AsyncOpenAI, OpenAI + +from guidellm.backend.base import Backend, GenerativeResponse +from guidellm.config import settings +from guidellm.core import TextGenerationRequest + +__all__ = ["OpenAIBackend"] + + +@Backend.register("openai_server") +class OpenAIBackend(Backend): + """ + An OpenAI backend implementation for generative AI results. + + This class provides an interface to communicate with the + OpenAI server for generating responses based on given prompts. + + :param openai_api_key: The API key for OpenAI. + If not provided, it will default to the key from settings. + :type openai_api_key: Optional[str] + :param target: The target URL string for the OpenAI server. + :type target: Optional[str] + :param model: The OpenAI model to use, defaults to the first available model. + :type model: Optional[str] + :param request_args: Additional arguments for the OpenAI request. + :type request_args: Dict[str, Any] + """ + + def __init__( + self, + openai_api_key: Optional[str] = None, + target: Optional[str] = None, + model: Optional[str] = None, + **request_args, + ): + self._request_args: Dict = request_args + api_key: str = openai_api_key or settings.openai.api_key + + if not api_key: + err = ValueError( + "`GUIDELLM__OPENAI__API_KEY` environment variable or " + "--openai-api-key CLI parameter must be specified for the " + "OpenAI backend." + ) + logger.error("{}", err) + raise err + + base_url = target or settings.openai.base_url + + if not base_url: + err = ValueError( + "`GUIDELLM__OPENAI__BASE_URL` environment variable or " + "target parameter must be specified for the OpenAI backend." + ) + logger.error("{}", err) + raise err + + self._async_client = AsyncOpenAI(api_key=api_key, base_url=base_url) + self._client = OpenAI(api_key=api_key, base_url=base_url) + self._model = model or self.default_model + + super().__init__(type_="openai_server", target=base_url, model=self._model) + logger.info("OpenAI {} Backend listening on {}", self._model, base_url) + + async def make_request( + self, + request: TextGenerationRequest, + ) -> AsyncGenerator[GenerativeResponse, None]: + """ + Make a request to the OpenAI backend. + + This method sends a prompt to the OpenAI backend and streams + the response tokens back. + + :param request: The text generation request to submit. + :type request: TextGenerationRequest + :yield: A stream of GenerativeResponse objects. 
+ :rtype: AsyncGenerator[GenerativeResponse, None] + """ + + logger.debug("Making request to OpenAI backend with prompt: {}", request.prompt) + + request_args: Dict = { + "n": 1, # Number of completions for each prompt + } + + if request.output_token_count is not None: + request_args.update( + { + "max_tokens": request.output_token_count, + "stop": None, + "extra_body": { + "ignore_eos": True, + } + } + ) + elif settings.openai.max_gen_tokens and settings.openai.max_gen_tokens > 0: + request_args.update( + { + "max_tokens": settings.openai.max_gen_tokens, + } + ) + + request_args.update(self._request_args) + + messages = self._build_messages(request) + + stream = await self._async_client.chat.completions.create( + model=self.model, + messages=messages, + stream=True, + **request_args, + ) + + token_count = 0 + async for chunk in stream: + choice = chunk.choices[0] + token = choice.delta.content or "" + + if choice.finish_reason is not None: + yield GenerativeResponse( + type_="final", + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + break + + token_count += 1 + yield GenerativeResponse( + type_="token_iter", + add_token=token, + prompt=request.prompt, + prompt_token_count=request.prompt_token_count, + output_token_count=token_count, + ) + + def available_models(self) -> List[str]: + """ + Get the available models for the backend. + + This method queries the OpenAI API to retrieve a list of available models. + + :return: A list of available models. + :rtype: List[str] + :raises openai.OpenAIError: If an error occurs while retrieving models. + """ + + try: + return [model.id for model in self._client.models.list().data] + except Exception as error: + logger.error("Failed to retrieve available models: {}", error) + raise error + + def validate_connection(self): + """ + Validate the connection to the OpenAI backend. + + This method checks that the OpenAI backend is reachable and + the API key is valid. + + :raises openai.OpenAIError: If the connection is invalid. 
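+
+        A minimal usage sketch (illustrative; assumes an OpenAI-compatible
+        server is reachable at the configured base URL and that the key is
+        accepted by it)::
+
+            backend = OpenAIBackend(
+                openai_api_key="dummy-key",
+                target="http://localhost:8000/v1",
+            )
+            backend.validate_connection()  # raises if the server is unreachable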
+        """
+
+        try:
+            self._client.models.list()
+        except Exception as error:
+            logger.error("Failed to validate OpenAI connection: {}", error)
+            raise error
+
+    def _build_messages(self, request: TextGenerationRequest) -> List[Dict]:
+        if request.number_images == 0:
+            messages = [{"role": "user", "content": request.prompt}]
+        else:
+            content = []
+            for image in request.images:
+                stream = io.BytesIO()
+                im_format = image.image.format or "PNG"
+                image.image.save(stream, format=im_format)
+                im_b64 = base64.b64encode(stream.getvalue()).decode("utf-8")
+                image_url = {"url": f"data:image/{im_format.lower()};base64,{im_b64}"}
+                content.append({"type": "image_url", "image_url": image_url})
+
+            content.append({"type": "text", "text": request.prompt})
+            messages = [{"role": "user", "content": content}]
+
+        return messages
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..c81f67a6666490dd48a1eddba4dad6c90e2bf08a
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/config.py
@@ -0,0 +1,239 @@
+import json
+from enum import Enum
+from typing import Dict, List, Optional, Sequence
+
+from pydantic import BaseModel, Field, model_validator
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+__all__ = [
+    "DatasetSettings",
+    "EmulatedDataSettings",
+    "Environment",
+    "LoggingSettings",
+    "OpenAISettings",
+    "print_config",
+    "ReportGenerationSettings",
+    "Settings",
+    "reload_settings",
+    "settings",
+]
+
+
+class Environment(str, Enum):
+    """
+    Enum for the supported environments
+    """
+
+    LOCAL = "local"
+    DEV = "dev"
+    STAGING = "staging"
+    PROD = "prod"
+
+
+ENV_REPORT_MAPPING = {
+    Environment.PROD: "https://guidellm.neuralmagic.com/local-report/index.html",
+    Environment.STAGING: "https://staging.guidellm.neuralmagic.com/local-report/index.html",
+    Environment.DEV: "https://dev.guidellm.neuralmagic.com/local-report/index.html",
+    Environment.LOCAL: "tests/dummy/report.html",
+}
+
+
+class LoggingSettings(BaseModel):
+    """
+    Logging settings for the application
+    """
+
+    disabled: bool = False
+    clear_loggers: bool = True
+    console_log_level: str = "WARNING"
+    log_file: Optional[str] = None
+    log_file_level: Optional[str] = None
+
+
+class DatasetSettings(BaseModel):
+    """
+    Dataset settings for the application
+    """
+
+    preferred_data_columns: List[str] = Field(
+        default_factory=lambda: [
+            "prompt",
+            "instruction",
+            "input",
+            "inputs",
+            "question",
+            "context",
+            "text",
+            "content",
+            "body",
+            "data",
+        ]
+    )
+    preferred_data_splits: List[str] = Field(
+        default_factory=lambda: ["test", "tst", "validation", "val", "train"]
+    )
+
+
+class EmulatedDataSettings(BaseModel):
+    """
+    Emulated data settings for the application
+    """
+
+    source: str = "http://localhost:666/aimages/1342-0.txt"
+    filter_start: str = "It is a truth universally acknowledged, that a"
+    filter_end: str = "CHISWICK PRESS:--CHARLES WHITTINGHAM AND CO."
+    clean_text_args: Dict[str, bool] = Field(
+        default_factory=lambda: {
+            "fix_encoding": True,
+            "clean_whitespace": True,
+            "remove_empty_lines": True,
+            "force_new_line_punctuation": True,
+        }
+    )
+    image_source: List[str] = Field(
+        default_factory=lambda: ["http://localhost:666/aimages/pg1-images.html"]
+    )
+
+
+class OpenAISettings(BaseModel):
+    """
+    OpenAI settings for the application to connect to the API
+    for OpenAI server-based pathways
+    """
+
+    # OpenAI API key.
+    api_key: str = "invalid_token"
+
+    # OpenAI-compatible server URL
+    # NOTE: The default value is the default address of the llama.cpp web server
+    base_url: str = "http://localhost:8000/v1"
+
+    max_gen_tokens: int = 4096
+
+
+class AiohttpSettings(OpenAISettings):
+    pass
+
+
+class ReportGenerationSettings(BaseModel):
+    """
+    Report generation settings for the application
+    """
+
+    source: str = ""
+    report_html_match: str = "window.report_data = {};"
+    report_html_placeholder: str = "{}"
+
+
+class Settings(BaseSettings):
+    """
+    All the settings are powered by pydantic_settings and can be
+    populated from the .env file.
+
+    The format to populate the settings is as follows:
+
+    ```sh
+    export GUIDELLM__LOGGING__DISABLED=true
+    export GUIDELLM__OPENAI__API_KEY=******
+    ```
+    """
+
+    model_config = SettingsConfigDict(
+        env_prefix="GUIDELLM__",
+        env_nested_delimiter="__",
+        extra="ignore",
+        validate_default=True,
+        env_file=".env",
+    )
+
+    # general settings
+    env: Environment = Environment.PROD
+    request_timeout: int = 30
+    max_concurrency: int = 512
+    num_sweep_profiles: int = 9
+    logging: LoggingSettings = LoggingSettings()
+
+    # Data settings
+    dataset: DatasetSettings = DatasetSettings()
+    emulated_data: EmulatedDataSettings = EmulatedDataSettings()
+
+    # Request settings
+    openai: OpenAISettings = OpenAISettings()
+    aiohttp: AiohttpSettings = AiohttpSettings()
+
+    # Report settings
+    report_generation: ReportGenerationSettings = ReportGenerationSettings()
+
+    @model_validator(mode="after")
+    @classmethod
+    def set_default_source(cls, values):
+        if not values.report_generation.source:
+            values.report_generation.source = ENV_REPORT_MAPPING.get(values.env)
+
+        return values
+
+    def generate_env_file(self) -> str:
+        """
+        Generate the .env file from the current settings
+        """
+        return Settings._recursive_generate_env(
+            self,
+            self.model_config["env_prefix"],  # type: ignore  # noqa: PGH003
+            self.model_config["env_nested_delimiter"],  # type: ignore  # noqa: PGH003
+        )
+
+    @staticmethod
+    def _recursive_generate_env(model: BaseModel, prefix: str, delimiter: str) -> str:
+        env_file = ""
+        add_models = []
+        for key, value in model.model_dump().items():
+            if isinstance(value, BaseModel):
+                # add nested properties to be processed after the current level
+                add_models.append((key, value))
+                continue
+
+            dict_values = (
+                {
+                    f"{prefix}{key.upper()}{delimiter}{sub_key.upper()}": sub_value
+                    for sub_key, sub_value in value.items()
+                }
+                if isinstance(value, dict)
+                else {f"{prefix}{key.upper()}": value}
+            )
+
+            for tag, sub_value in dict_values.items():
+                if isinstance(sub_value, Sequence) and not isinstance(sub_value, str):
+                    value_str = ",".join(f'"{item}"' for item in sub_value)
+                    env_file += f"{tag}=[{value_str}]\n"
+                elif isinstance(sub_value, Dict):
+                    value_str = json.dumps(sub_value)
+                    env_file += f"{tag}={value_str}\n"
+                elif not sub_value:
+                    env_file += f"{tag}=\n"
+                else:
+                    env_file += f'{tag}="{sub_value}"\n'
+
+        for key, value in add_models:
+            env_file += Settings._recursive_generate_env(
+                value, f"{prefix}{key.upper()}{delimiter}", delimiter
+            )
+        return 
env_file + + +settings = Settings() + + +def reload_settings(): + """ + Reload the settings from the environment variables + """ + new_settings = Settings() + settings.__dict__.update(new_settings.__dict__) + + +def print_config(): + """ + Print the current configuration settings + """ + print(f"Settings: \n{settings.generate_env_file()}") # noqa: T201 + + +if __name__ == "__main__": + print_config() diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e738aa769737a158dccb697a36d61697488d6b55 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/__init__.py @@ -0,0 +1,24 @@ +from .distribution import Distribution +from .report import GuidanceReport +from .request import TextGenerationRequest +from .result import ( + RequestConcurrencyMeasurement, + TextGenerationBenchmark, + TextGenerationBenchmarkReport, + TextGenerationError, + TextGenerationResult, +) +from .serializable import Serializable, SerializableFileType + +__all__ = [ + "Distribution", + "GuidanceReport", + "RequestConcurrencyMeasurement", + "Serializable", + "SerializableFileType", + "TextGenerationBenchmark", + "TextGenerationBenchmarkReport", + "TextGenerationError", + "TextGenerationRequest", + "TextGenerationResult", +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py new file mode 100644 index 0000000000000000000000000000000000000000..3f770528c3bcc1a1ba0049797ee59067f816d9a0 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/distribution.py @@ -0,0 +1,190 @@ +from typing import List, Sequence + +import numpy as np +from loguru import logger +from pydantic import Field + +from guidellm.core.serializable import Serializable + +__all__ = ["Distribution"] + + +class Distribution(Serializable): + """ + A class to represent a statistical distribution and perform various + statistical analyses. + """ + + data: Sequence[float] = Field( + default_factory=list, + description="The data points of the distribution.", + ) + + def __str__(self): + return f"Distribution({self.describe()})" + + def __len__(self): + return len(self.data) + + @property + def mean(self) -> float: + """ + Calculate and return the mean of the distribution. + :return: The mean of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate mean.") + return 0.0 + + mean_value = np.mean(self.data).item() + logger.debug(f"Calculated mean: {mean_value}") + return mean_value + + @property + def median(self) -> float: + """ + Calculate and return the median of the distribution. + :return: The median of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate median.") + return 0.0 + + median_value = np.median(self.data).item() + logger.debug(f"Calculated median: {median_value}") + return median_value + + @property + def variance(self) -> float: + """ + Calculate and return the variance of the distribution. + :return: The variance of the distribution. 
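+
+        Example (illustrative; np.var computes the population variance)::
+
+            Distribution(data=[2.0, 4.0, 6.0]).variance  # -> 8/3 ~= 2.67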
+ """ + if not self.data: + logger.warning("No data points available to calculate variance.") + return 0.0 + + variance_value = np.var(self.data).item() + logger.debug(f"Calculated variance: {variance_value}") + return variance_value + + @property + def std_deviation(self) -> float: + """ + Calculate and return the standard deviation of the distribution. + :return: The standard deviation of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate standard deviation.") + return 0.0 + + std_deviation_value = np.std(self.data).item() + logger.debug(f"Calculated standard deviation: {std_deviation_value}") + return std_deviation_value + + def percentile(self, percentile: float) -> float: + """ + Calculate and return the specified percentile of the distribution. + :param percentile: The desired percentile to calculate (0-100). + :return: The specified percentile of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate percentile.") + return 0.0 + + percentile_value = np.percentile(self.data, percentile).item() + logger.debug(f"Calculated {percentile}th percentile: {percentile_value}") + return percentile_value + + def percentiles(self, percentiles: List[float]) -> List[float]: + """ + Calculate and return the specified percentiles of the distribution. + :param percentiles: A list of desired percentiles to calculate (0-100). + :return: A list of the specified percentiles of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate percentiles.") + return [0.0] * len(percentiles) + + percentiles_values: List[float] = np.percentile(self.data, percentiles).tolist() # type: ignore # noqa: PGH003 + logger.debug(f"Calculated percentiles {percentiles}: {percentiles_values}") + return percentiles_values + + @property + def min(self) -> float: + """ + Return the minimum value of the distribution. + :return: The minimum value of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate minimum.") + return 0.0 + + min_value: float = np.min(self.data).item() # type: ignore # noqa: PGH003 + logger.debug(f"Calculated min: {min_value}") + return min_value + + @property + def max(self) -> float: + """ + Return the maximum value of the distribution. + :return: The maximum value of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate maximum.") + return 0.0 + + max_value: float = np.max(self.data).item() # type: ignore # noqa: PGH003 + logger.debug(f"Calculated max: {max_value}") + return max_value + + @property + def range(self) -> float: + """ + Calculate and return the range of the distribution (max - min). + :return: The range of the distribution. + """ + if not self.data: + logger.warning("No data points available to calculate range.") + return 0.0 + + range_value = self.max - self.min + logger.debug(f"Calculated range: {range_value}") + return range_value + + def describe(self) -> dict: + """ + Return a dictionary describing various statistics of the distribution. + :return: A dictionary with statistical summaries of the distribution. 
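+
+        Example (illustrative)::
+
+            Distribution(data=[1.0, 2.0, 3.0]).describe()["mean"]  # -> 2.0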
+ """ + description = { + "mean": self.mean, + "median": self.median, + "variance": self.variance, + "std_deviation": self.std_deviation, + "percentile_indices": [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99], + "percentile_values": self.percentiles( + [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99], + ), + "min": self.min, + "max": self.max, + "range": self.range, + } + logger.debug(f"Generated description: {description}") + return description + + def add_data(self, new_data: Sequence[float]): + """ + Add new data points to the distribution. + :param new_data: A list of new numerical data points to add. + """ + self.data = list(self.data) + list(new_data) + logger.debug(f"Added new data: {new_data}") + + def remove_data(self, remove_data: Sequence[float]): + """ + Remove specified data points from the distribution. + :param remove_data: A list of numerical data points to remove. + """ + self.data = [item for item in self.data if item not in remove_data] + logger.debug(f"Removed data: {remove_data}") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py new file mode 100644 index 0000000000000000000000000000000000000000..c48eed561d4eaad4a84dc934264ed4b68d17830a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/report.py @@ -0,0 +1,311 @@ +import time +from datetime import datetime +from typing import List, Optional + +from loguru import logger +from pydantic import Field +from rich.console import Console, Group +from rich.live import Live +from rich.panel import Panel +from rich.table import Table + +from guidellm.core.result import TextGenerationBenchmark, TextGenerationBenchmarkReport +from guidellm.core.serializable import Serializable + +__all__ = ["GuidanceReport"] + + +def _create_benchmark_report_details(report: TextGenerationBenchmarkReport) -> str: + """ + Create a detailed string representation of a benchmark report. + + :param report: The benchmark report to generate details for. + :type report: TextGenerationBenchmarkReport + :return: A string containing the backend, data, rate, and limits of + the benchmark report. + :rtype: str + """ + backend = ( + f"Backend(type={report.args.get('backend_type', 'N/A')}, " + f"target={report.args.get('target', 'N/A')}, " + f"model={report.args.get('model', 'N/A')})" + ) + data = ( + f"Data(type={report.args.get('data_type', 'N/A')}, " + f"source={report.args.get('data', 'N/A')}, " + f"tokenizer={report.args.get('tokenizer', 'N/A')})" + ) + rate = ( + f"Rate(type={report.args.get('mode', 'N/A')}, " + f"rate={report.args.get('rate', 'N/A')})" + ) + limits = ( + f"Limits(max_number={report.args.get('max_number', 'N/A')} requests, " + f"max_duration={report.args.get('max_duration', 'N/A')} sec)" + ) + + logger.debug( + "Created benchmark report details for backend={}, data={}, rate={}, limits={}", + backend, + data, + rate, + limits, + ) + + return backend + "\n" + data + "\n" + rate + "\n" + limits + "\n" + + +def _benchmark_rate_id(benchmark: TextGenerationBenchmark) -> str: + """ + Generate a string identifier for a benchmark rate. + + :param benchmark: The benchmark for which to generate the rate ID. + :type benchmark: TextGenerationBenchmark + :return: A string representing the benchmark rate ID. 
+ :rtype: str + """ + rate_id = ( + f"{benchmark.mode}@{benchmark.rate:.2f} req/sec" + if benchmark.rate + else f"{benchmark.mode}" + ) + logger.debug("Generated benchmark rate ID: {}", rate_id) + return rate_id + + +def _create_benchmark_report_requests_summary( + report: TextGenerationBenchmarkReport, +) -> Table: + """ + Create a table summarizing the requests of a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing the requests. + :rtype: Table + """ + table = Table( + "Benchmark", + "Requests Completed", + "Request Failed", + "Duration", + "Start Time", + "End Time", + title="[magenta]Requests Data by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + start_time_str = ( + datetime.fromtimestamp(benchmark.start_time).strftime("%H:%M:%S") + if benchmark.start_time + else "N/A" + ) + end_time_str = ( + datetime.fromtimestamp(benchmark.end_time).strftime("%H:%M:%S") + if benchmark.end_time + else "N/A" + ) + + table.add_row( + _benchmark_rate_id(benchmark), + f"{benchmark.request_count}/{benchmark.total_count}", + f"{benchmark.error_count}/{benchmark.total_count}", + f"{benchmark.duration:.2f} sec", + f"{start_time_str}", + f"{end_time_str}", + ) + logger.debug("Created requests summary table for the report.") + return table + + +def _create_benchmark_report_data_tokens_summary( + report: TextGenerationBenchmarkReport, +) -> Table: + """ + Create a table summarizing data tokens of a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing the data tokens. + :rtype: Table + """ + table = Table( + "Benchmark", + "Prompt", + "Prompt (1%, 5%, 50%, 95%, 99%)", + "Output", + "Output (1%, 5%, 50%, 95%, 99%)", + title="[magenta]Tokens Data by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + table.add_row( + _benchmark_rate_id(benchmark), + f"{benchmark.prompt_token:.2f}", + ", ".join( + f"{percentile:.1f}" + for percentile in benchmark.prompt_token_percentiles + ), + f"{benchmark.output_token:.2f}", + ", ".join( + f"{percentile:.1f}" + for percentile in benchmark.output_token_percentiles + ), + ) + logger.debug("Created data tokens summary table for the report.") + return table + + +def _create_benchmark_report_dist_perf_summary( + report: TextGenerationBenchmarkReport, +) -> Table: + """ + Create a table summarizing distribution performance of a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing the performance statistics. 
+ :rtype: Table + """ + table = Table( + "Benchmark", + "Request Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (sec)", + "Time to First Token [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", + "Inter Token Latency [1%, 5%, 10%, 50%, 90%, 95%, 99%] (ms)", + title="[magenta]Performance Stats by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + table.add_row( + _benchmark_rate_id(benchmark), + ", ".join( + f"{percentile:.2f}" + for percentile in benchmark.request_latency_percentiles + ), + ", ".join( + f"{percentile * 1000:.1f}" + for percentile in benchmark.time_to_first_token_percentiles + ), + ", ".join( + f"{percentile * 1000:.1f}" + for percentile in benchmark.inter_token_latency_percentiles + ), + ) + logger.debug("Created distribution performance summary table for the report.") + return table + + +def _create_benchmark_report_summary(report: TextGenerationBenchmarkReport) -> Table: + """ + Create a summary table for a benchmark report. + + :param report: The benchmark report to summarize. + :type report: TextGenerationBenchmarkReport + :return: A rich Table object summarizing overall performance. + :rtype: Table + """ + table = Table( + "Benchmark", + "Requests per Second", + "Request Latency", + "Time to First Token", + "Inter Token Latency", + "Output Token Throughput", + title="[magenta]Performance Summary by Benchmark[/magenta]", + title_style="bold", + title_justify="left", + show_header=True, + ) + + for benchmark in report.benchmarks_sorted: + table.add_row( + _benchmark_rate_id(benchmark), + f"{benchmark.completed_request_rate:.2f} req/sec", + f"{benchmark.request_latency:.2f} sec", + f"{benchmark.time_to_first_token:.2f} ms", + f"{benchmark.inter_token_latency:.2f} ms", + f"{benchmark.output_token_throughput:.2f} tokens/sec", + ) + logger.debug("Created overall performance summary table for the report.") + return table + + +class GuidanceReport(Serializable): + """ + A class to manage the guidance reports that include the benchmarking details, + potentially across multiple runs, for saving and loading from disk. + + :param benchmarks: The list of benchmarking reports. + :type benchmarks: List[TextGenerationBenchmarkReport] + """ + + benchmarks: List[TextGenerationBenchmarkReport] = Field( + default_factory=list, description="The list of benchmark reports." + ) + + def print( + self, save_path: Optional[str] = None, continual_refresh: bool = False + ) -> None: + """ + Print the guidance report to the console. + + :param save_path: Optional path to save the report to disk. + :type save_path: Optional[str] + :param continual_refresh: Whether to continually refresh the report. 
+ :type continual_refresh: bool + :return: None + """ + logger.info("Printing guidance report to console with save_path={}", save_path) + report_viz = Panel( + Group( + *[ + Panel( + Group( + _create_benchmark_report_details(benchmark), + "", + _create_benchmark_report_requests_summary(benchmark), + "", + _create_benchmark_report_data_tokens_summary(benchmark), + "", + _create_benchmark_report_dist_perf_summary(benchmark), + "", + _create_benchmark_report_summary(benchmark), + ), + title=( + f"[bold magenta]Benchmark Report " + f"{index + 1}[/bold magenta]" + ), + expand=True, + title_align="left", + ) + for index, benchmark in enumerate(self.benchmarks) + ], + ), + title=( + "[bold cyan]GuideLLM Benchmarks Report[/bold cyan] [italic]" + f"({save_path})[/italic]" + ), + expand=True, + title_align="left", + ) + console = Console() + + if continual_refresh: + logger.info("Starting live report with continual refresh.") + with Live(report_viz, refresh_per_second=1, console=console) as live: + while True: + live.update(report_viz) + time.sleep(1) + else: + console.print(report_viz) + + logger.info("Guidance report printing completed.") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py new file mode 100644 index 0000000000000000000000000000000000000000..06d0f37c8640e637591e199722567dacbf04102b --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/request.py @@ -0,0 +1,65 @@ +import uuid +from typing import Any, Dict, List, Optional, Tuple + +from pydantic import Field + +from guidellm.core.serializable import Serializable +from guidellm.utils import ImageDescriptor + + +class TextGenerationRequest(Serializable): + """ + A class to represent a text generation request for generative AI workloads. + """ + + id: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="The unique identifier for the request.", + ) + prompt: str = Field(description="The input prompt for the text generation.") + images: Optional[List[ImageDescriptor]] = Field( + default=None, + description="Input images.", + ) + prompt_token_count: Optional[int] = Field( + default=None, + description="The number of tokens in the input prompt.", + ) + output_token_count: Optional[int] = Field( + default=None, + description="The number of tokens to generate.", + ) + params: Dict[str, Any] = Field( + default_factory=dict, + description="The parameters for the text generation request.", + ) + + @property + def number_images(self) -> int: + if self.images is None: + return 0 + else: + return len(self.images) + + @property + def image_resolution(self) -> List[Tuple[int, int]]: + if self.images is None: + return None + else: + return [im.size for im in self.images] + + + def __str__(self) -> str: + prompt_short = ( + self.prompt[:32] + "..." 
+            if self.prompt and len(self.prompt) > 32  # noqa: PLR2004
+            else self.prompt
+        )
+
+        return (
+            f"TextGenerationRequest(id={self.id}, "
+            f"prompt={prompt_short}, prompt_token_count={self.prompt_token_count}, "
+            f"output_token_count={self.output_token_count}, "
+            f"params={self.params}, "
+            f"image_resolution={self.image_resolution})"
+        )
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..aebd1763728192228e7115c5842c4a8cec7fc0fe
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/result.py
@@ -0,0 +1,637 @@
+from time import time
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from loguru import logger
+from pydantic import Field, computed_field
+
+from guidellm.core.distribution import Distribution
+from guidellm.core.request import TextGenerationRequest
+from guidellm.core.serializable import Serializable
+
+__all__ = [
+    "RequestConcurrencyMeasurement",
+    "TextGenerationBenchmark",
+    "TextGenerationBenchmarkReport",
+    "TextGenerationError",
+    "TextGenerationResult",
+]
+
+
+class TextGenerationResult(Serializable):
+    """
+    A class to represent the result of a text generation request
+    for generative AI workloads.
+    """
+
+    request: TextGenerationRequest = Field(
+        description="The text generation request used to generate the result.",
+    )
+    prompt: str = Field(
+        default_factory=str,
+        description="The input prompt for the text generation.",
+    )
+    prompt_word_count: int = Field(
+        default=0,
+        description="The number of words in the input prompt.",
+    )
+    prompt_token_count: int = Field(
+        default=0,
+        description="The number of tokens in the input prompt.",
+    )
+    output: str = Field(
+        default_factory=str,
+        description="The generated output for the text generation.",
+    )
+    output_word_count: int = Field(
+        default=0,
+        description="The number of words in the output.",
+    )
+    output_token_count: int = Field(
+        default=0,
+        description="The number of tokens in the output.",
+    )
+    last_time: Optional[float] = Field(
+        default=None,
+        description="The last time recorded.",
+    )
+    first_token_set: bool = Field(
+        default=False,
+        description="Whether the first token time is set.",
+    )
+    start_time: Optional[float] = Field(
+        default=None,
+        description="The start time of the text generation.",
+    )
+    end_time: Optional[float] = Field(
+        default=None,
+        description="The end time of the text generation.",
+    )
+    first_token_time: Optional[float] = Field(
+        default=None,
+        description="The time taken to decode the first token.",
+    )
+    decode_times: Distribution = Field(
+        default_factory=Distribution,
+        description="The distribution of decode times.",
+    )
+
+    def start(self, prompt: str):
+        """
+        Start the text generation by recording the prompt and start time.
+
+        :param prompt: The input prompt for the text generation.
+        :type prompt: str
+        """
+        self.prompt = prompt
+        self.prompt_word_count = len(prompt.split())
+        self.prompt_token_count = len(prompt)  # character count as a token-count placeholder
+        self.start_time = time()
+        self.last_time = time()
+        self.first_token_set = False
+
+        logger.info("Text generation started with prompt: '{}'", prompt)
+
+    def output_token(self, token: str):
+        """
+        Add a token to the output and record the decode time.
+
+        :param token: The decoded token.
+        :type token: str
+        """
+        self._check_recording_started()
+
+        if self.last_time is None:
+            raise ValueError(
+                "last time is not specified. "
+                "Did you call `text_generation_benchmark.start()`?"
+            )
+
+        current_counter = time()
+
+        if not self.first_token_set:
+            self.first_token_time = current_counter - self.last_time
+            self.first_token_set = True
+            logger.debug(f"First token decode time: {self.first_token_time}")
+        else:
+            decode_time = current_counter - self.last_time
+            self.decode_times.add_data([decode_time])
+            logger.debug(f"Token '{token}' decoded in {decode_time} seconds")
+
+        self.last_time = current_counter
+        self.output += token
+        logger.debug("Added token {} to output", token)
+
+    def end(
+        self,
+        output: Optional[str] = None,
+        prompt_token_count: Optional[int] = None,
+        output_token_count: Optional[int] = None,
+    ):
+        """
+        End the text generation by recording the output and end time.
+
+        :param output: The generated output for the text generation.
+        :type output: Optional[str]
+        :param prompt_token_count: Optional token count for the prompt,
+            defaults to word count.
+        :type prompt_token_count: Optional[int]
+        :param output_token_count: Optional token count for the output,
+            defaults to word count.
+        :type output_token_count: Optional[int]
+        """
+        self._check_recording_started()
+        self.end_time = time()
+
+        if output:
+            self.output = output
+
+        self.output_word_count = len(self.output.split())
+        self.output_token_count = output_token_count or self.output_word_count
+        self.prompt_token_count = prompt_token_count or self.prompt_word_count
+
+        logger.info(f"Text generation ended with output: '{self.output}'")
+
+    def _check_recording_started(self):
+        if self.start_time is None:
+            raise ValueError(
+                "start time is not specified. "
+                "Did you call `text_generation_benchmark.start()`?"
+            )
+
+
+class TextGenerationError(Serializable):
+    """
+    A class to represent an error that occurred during a text generation request
+    for generative AI workloads.
+    """
+
+    request: TextGenerationRequest = Field(
+        description="The text generation request that resulted in an error.",
+    )
+    message: str = Field(
+        description="The error message that occurred during text generation.",
+    )
+
+
+class RequestConcurrencyMeasurement(Serializable):
+    """
+    A dataclass to represent the concurrency measurement of a request.
+    """
+
+    time: float = Field(description="The time of the measurement.")
+    completed: int = Field(description="The number of completed requests.")
+    errored: int = Field(description="The number of errored requests.")
+    processing: int = Field(description="The number of processing requests.")
+
+
+class TextGenerationBenchmark(Serializable):
+    """
+    A class to represent a report of text generation requests
+    (results and errors) for generative AI workloads.
+    This is a set of results and errors for a specific mode and rate.
+    """
+
+    mode: Literal["asynchronous", "synchronous", "throughput"] = Field(
+        description=(
+            "The generation mode, one of 'asynchronous', 'synchronous', "
+            "or 'throughput'."
+        )
+ ) + rate: Optional[float] = Field( + default=None, + description="The requested rate of requests per second.", + ) + results: List[TextGenerationResult] = Field( + default_factory=list, + description="The results of the text generation requests.", + ) + errors: List[TextGenerationError] = Field( + default_factory=list, + description="The errors of the text generation requests.", + ) + concurrencies: List[RequestConcurrencyMeasurement] = Field( + default_factory=list, + description="The concurrency measurements of the requests.", + ) + + def __iter__(self): + """ + Provide an iterator interface to iterate over the results. + + :return: An iterator over the results. + """ + return iter(self.results) + + @computed_field # type: ignore[misc] + @property + def request_count(self) -> int: + """ + Get the number of requests in the result. + + :return: The number of requests. + :rtype: int + """ + return len(self.results) + + @computed_field # type: ignore[misc] + @property + def error_count(self) -> int: + """ + Get the number of errors in the result. + + :return: The number of errors. + :rtype: int + """ + return len(self.errors) + + @computed_field # type: ignore[misc] + @property + def total_count(self) -> int: + """ + Get the total number of requests in the result. + + :return: The total number of requests. + :rtype: int + """ + return self.request_count + self.error_count + + @computed_field # type: ignore[misc] + @property + def start_time(self) -> Optional[float]: + """ + Get the start time of the first request in the result. + + :return: The start time of the first request. + :rtype: Optional[float] + """ + if not self.results: + return None + + return self.results[0].start_time + + @computed_field # type: ignore[misc] + @property + def end_time(self) -> Optional[float]: + """ + Get the end time of the last request in the result. + + :return: The end time of the last request. + :rtype: Optional[float] + """ + if not self.results: + return None + + return self.results[-1].end_time + + @computed_field # type: ignore[misc] + @property + def duration(self) -> float: + """ + Get the duration of the result in seconds. + + :return: The duration of the result. + :rtype: float + """ + if not self.results or not self.start_time or not self.end_time: + return 0.0 + + return self.end_time - self.start_time + + @computed_field # type: ignore[misc] + @property + def completed_request_rate(self) -> float: + """ + Get the rate of requests per second in the result. + + :return: The rate of requests per second. + :rtype: float + """ + if not self.results or not self.duration: + return 0.0 + + return len(self.results) / self.duration + + @computed_field # type: ignore[misc] + @property + def request_latency(self) -> float: + """ + Get the average request latency in seconds. + + :return: The average request latency in seconds. + :rtype: float + """ + if not self.results: + return 0.0 + + return self.request_latency_distribution.mean + + @property + def request_latency_distribution(self) -> Distribution: + """ + Get the distribution of request latencies. + + :return: The distribution of request latencies. + :rtype: Distribution + """ + return Distribution( + data=[ + result.end_time - result.start_time + for result in self.results + if result.end_time is not None and result.start_time is not None + ] + ) + + @computed_field # type: ignore[misc] + @property + def request_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles of request latency in seconds. 
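+        The percentiles computed are 1, 5, 10, 50, 90, 95, and 99.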
+ + :return: List of percentile request latency in seconds + :rtype: List[float] + """ + return self.request_latency_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + + @computed_field # type: ignore[misc] + @property + def time_to_first_token(self) -> float: + """ + Get the time taken to decode the first token in milliseconds. + + :return: The time taken to decode the first token in milliseconds. + :rtype: float + """ + if not self.results: + return 0.0 + + return 1000 * self.ttft_distribution.mean + + @property + def ttft_distribution(self) -> Distribution: + """ + Get the distribution of time taken to decode the first token. + + :return: The distribution of time taken to decode the first token. + :rtype: Distribution + """ + return Distribution( + data=[ + result.first_token_time + for result in self.results + if result.first_token_time is not None + ] + ) + + @computed_field # type: ignore[misc] + @property + def time_to_first_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for time taken to decode the first token + in milliseconds. + + :return: List of percentile time taken to decode the first token + in milliseconds. + :rtype: List[float] + """ + return self.ttft_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def inter_token_latency(self) -> float: + """ + Get the average time between tokens in milliseconds. + + :return: The average time between tokens. + :rtype: float + """ + if not self.results: + return 0.0 + + return 1000 * self.itl_distribution.mean + + @property + def itl_distribution(self) -> Distribution: + """ + Get the distribution of time between tokens. + + :return: The distribution of time between tokens. + :rtype: Distribution + """ + return Distribution( + data=[ + decode for result in self.results for decode in result.decode_times.data + ] + ) + + @computed_field # type: ignore[misc] + @property + def inter_token_latency_percentiles(self) -> List[float]: + """ + Get standard percentiles for the time between tokens in milliseconds. + + :return: List of percentiles for the average time between tokens. + :rtype: List[float] + """ + return self.itl_distribution.percentiles([1, 5, 10, 50, 90, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def output_token_throughput(self) -> float: + """ + Get the average token throughput in tokens per second. + + :return: The average token throughput. + :rtype: float + """ + if not self.results or not self.duration: + return 0.0 + + total_tokens = sum(result.output_token_count for result in self.results) + + return total_tokens / self.duration + + @computed_field # type: ignore[misc] + @property + def prompt_token(self) -> float: + """ + Get the average number of prompt tokens. + + :return: The average number of prompt tokens. + :rtype: float + """ + return self.prompt_token_distribution.mean + + @property + def prompt_token_distribution(self) -> Distribution: + """ + Get the distribution of prompt token counts. + + :return: The distribution of prompt token counts. + :rtype: Distribution + """ + return Distribution(data=[result.prompt_token_count for result in self.results]) + + @computed_field # type: ignore[misc] + @property + def prompt_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of prompt tokens. + + :return: List of percentiles of number of prompt tokens. 
+ :rtype: List[float] + """ + return self.prompt_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def output_token(self) -> float: + """ + Get the average number of output tokens. + + :return: The average number of output tokens. + :rtype: float + """ + return self.output_token_distribution.mean + + @property + def output_token_distribution(self) -> Distribution: + """ + Get the distribution of output token counts. + + :return: The distribution of output token counts. + :rtype: Distribution + """ + return Distribution(data=[result.output_token_count for result in self.results]) + + @computed_field # type: ignore[misc] + @property + def output_token_percentiles(self) -> List[float]: + """ + Get standard percentiles for number of output tokens. + + :return: List of percentiles of number of output tokens. + :rtype: List[float] + """ + return self.output_token_distribution.percentiles([1, 5, 50, 95, 99]) + + @computed_field # type: ignore[misc] + @property + def overloaded(self) -> bool: + if ( + self.rate is None + or not self.results + or not self.concurrencies + or len(self.concurrencies) < 2 # noqa: PLR2004 + ): + # if rate was not set, sync mode is assumed, + # or we have less than 2 data points, + # then we cannot be overloaded by definition + return False + + # if the calculated rate is less than 75% of the requested rate, + # safe to assume the system is overloaded + return self.completed_request_rate < 0.75 * self.rate + + def request_started(self): + """ + Record the start of a generation request. + """ + if not self.concurrencies: + self.concurrencies = [ + RequestConcurrencyMeasurement( + time=time(), + completed=0, + errored=0, + processing=1, + ), + ] + else: + last = self.concurrencies[-1] + self.concurrencies.append( + RequestConcurrencyMeasurement( + time=time(), + completed=last.completed, + errored=last.errored, + processing=last.processing + 1, + ), + ) + + logger.info("Text generation request started") + + def request_completed( + self, + result: Union[TextGenerationResult, TextGenerationError], + ): + """ + Record the completion of a text generation request. + + :param result: The completed result or error. + :type result: Union[TextGenerationResult, TextGenerationError] + """ + if not self.concurrencies: + raise ValueError("Request completed without starting") + + if isinstance(result, TextGenerationError): + is_error = True + self.errors.append(result) + logger.info( + "Text generation request resulted in error: {}", + result.message, + ) + else: + if not result.start_time or not result.end_time: + raise ValueError("Start time and End time are not defined") + + is_error = False + self.results.append(result) + logger.info("Text generation request completed successfully: {}", result) + + last = self.concurrencies[-1] + self.concurrencies.append( + RequestConcurrencyMeasurement( + time=time(), + completed=last.completed + (not is_error), + errored=last.errored + is_error, + processing=last.processing - 1, + ) + ) + + +class TextGenerationBenchmarkReport(Serializable): + """ + A class to represent a report of text generation benchmarks + for generative AI workloads. + This is a collection of benchmarks for different modes and rates. 
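+
+    A minimal assembly sketch (illustrative)::
+
+        report = TextGenerationBenchmarkReport()
+        report.add_benchmark(TextGenerationBenchmark(mode="synchronous"))
+        for benchmark in report.benchmarks_sorted:
+            print(benchmark.request_count)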
+ """ + + benchmarks: List[TextGenerationBenchmark] = Field( + default_factory=list, + description="The benchmarks of text generation requests.", + ) + args: Dict[str, Any] = Field( + default_factory=dict, + description="The arguments used for the benchmarks.", + ) + + def __iter__(self): + return iter(self.benchmarks) + + @property + def benchmarks_sorted(self) -> List[TextGenerationBenchmark]: + """ + Get the list of benchmarks sorted by request rate. + + :return: The sorted list of benchmarks. + :rtype: List[TextGenerationBenchmark] + """ + return sorted(self.benchmarks, key=lambda x: x.completed_request_rate) + + def add_benchmark(self, benchmark: TextGenerationBenchmark): + """ + Add a result to the report. + + :param benchmark: The result to add. + :type benchmark: TextGenerationBenchmark + """ + self.benchmarks.append(benchmark) + logger.debug("Added result: {}", benchmark) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py new file mode 100644 index 0000000000000000000000000000000000000000..1e6b2944ebe0877ed813f3bc5a41147b91b60092 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/core/serializable.py @@ -0,0 +1,169 @@ +from pathlib import Path +from typing import Any, Literal, Union, get_args + +import yaml +from loguru import logger +from pydantic import BaseModel, ConfigDict + +__all__ = ["Serializable", "SerializableFileType"] + + +SerializableFileType = Literal["yaml", "json"] + + +class Serializable(BaseModel): + """ + A base class for models that require YAML and JSON serialization and + deserialization. + """ + + model_config = ConfigDict( + extra="forbid", + use_enum_values=True, + validate_assignment=True, + from_attributes=True, + ) + + def __init__(self, /, **data: Any) -> None: + super().__init__(**data) + logger.debug( + "Initialized new instance of {} with data: {}", + self.__class__.__name__, + data, + ) + + def to_yaml(self) -> str: + """ + Serialize the model to a YAML string. + + :return: YAML string representation of the model. + """ + logger.debug("Serializing to YAML... {}", self) + + return yaml.dump(self.model_dump()) + + @classmethod + def from_yaml(cls, data: str): + """ + Deserialize a YAML string to a model instance. + + :param data: YAML string to deserialize. + :return: An instance of the model. + """ + logger.debug("Deserializing from YAML... {}", data) + + return cls.model_validate(yaml.safe_load(data)) + + def to_json(self) -> str: + """ + Serialize the model to a JSON string. + + :return: JSON string representation of the model. + """ + logger.debug("Serializing to JSON... {}", self) + + return self.model_dump_json() + + @classmethod + def from_json(cls, data: str): + """ + Deserialize a JSON string to a model instance. + + :param data: JSON string to deserialize. + :return: An instance of the model. + """ + logger.debug("Deserializing from JSON... {}", data) + + return cls.model_validate_json(data) + + def save_file( + self, + path: Union[str, Path], + type_: SerializableFileType = "yaml", + ) -> str: + """ + Save the model to a file in either YAML or JSON format. + + :param path: Path to the exact file or the containing directory. + If it is a directory, the file name will be inferred from the class name. + :param type_: Optional type to save ('yaml' or 'json'). 
+            If not provided and the path has an extension,
+            it will be inferred to save in that format.
+            If not provided and the path does not have an extension,
+            it will save in YAML format.
+        :return: The path to the saved file.
+        """
+        logger.debug("Saving to file... {} with format: {}", path, type_)
+
+        if isinstance(path, str):
+            path = Path(path)
+
+        if path.suffix:
+            # is a file
+            ext = path.suffix[1:].lower()
+            if ext not in get_args(SerializableFileType):
+                raise ValueError(
+                    f"Unsupported file extension: {ext}. "
+                    f"Expected one of {SerializableFileType} "
+                    f"for {path}"
+                )
+            type_ = ext  # type: ignore  # noqa: PGH003
+        else:
+            # is a directory
+            file_name = f"{self.__class__.__name__.lower()}.{type_}"
+            path = path / file_name
+
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        with path.open("w") as file:
+            if type_ == "yaml":
+                file.write(self.to_yaml())
+            elif type_ == "json":
+                file.write(self.to_json())
+            else:
+                raise ValueError(
+                    f"Unsupported file extension: {type_}. "
+                    f"Expected one of {SerializableFileType} "
+                    f"for {path}"
+                )
+
+        logger.info("Successfully saved {} to {}", self.__class__.__name__, path)
+
+        return str(path)
+
+    @classmethod
+    def load_file(cls, path: Union[str, Path]):
+        """
+        Load a model from a file in either YAML or JSON format.
+
+        :param path: Path to the file.
+        :return: An instance of the model.
+        """
+        logger.debug("Loading from file... {}", path)
+
+        if isinstance(path, str):
+            path = Path(path)
+
+        if not path.exists():
+            raise FileNotFoundError(f"File not found: {path}")
+
+        if not path.is_file():
+            raise ValueError(f"Path is not a file: {path}")
+
+        extension = path.suffix[1:].lower()
+
+        with path.open() as file:
+            data = file.read()
+
+        if extension == "yaml":
+            obj = cls.from_yaml(data)
+        elif extension == "json":
+            obj = cls.from_json(data)
+        else:
+            raise ValueError(
+                f"Unsupported file extension: {extension}. "
+                f"Expected one of {SerializableFileType} "
+                f"for {path}"
+            )
+
+        return obj
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5858d072bfd8ae7e1092259ccba537f69e65743
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/__init__.py
@@ -0,0 +1,10 @@
+from .base import Executor, ExecutorResult
+from .profile_generator import Profile, ProfileGenerationMode, ProfileGenerator
+
+__all__ = [
+    "Executor",
+    "ExecutorResult",
+    "Profile",
+    "ProfileGenerationMode",
+    "ProfileGenerator",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..865ab30de412797485b49d2175ac7f94ac3900ba
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/base.py
@@ -0,0 +1,213 @@
+from dataclasses import dataclass
+from typing import AsyncGenerator, Optional, Sequence, Union
+
+from loguru import logger
+
+from guidellm.backend import Backend
+from guidellm.core import TextGenerationBenchmarkReport
+from guidellm.executor.profile_generator import (
+    Profile,
+    ProfileGenerationMode,
+    ProfileGenerator,
+)
+from guidellm.request import RequestGenerator
+from guidellm.scheduler import Scheduler, 
SchedulerResult + +__all__ = ["Executor", "ExecutorResult"] + + +@dataclass +class ExecutorResult: + """ + Data class representing the result of executing tasks in the Executor. + + :param completed: Indicates whether all tasks have completed. + :type completed: bool + :param count_total: Total number of profiles. + :type count_total: int + :param count_completed: Number of completed profiles. + :type count_completed: int + :param report: A report report for text generation. + :type report: TextGenerationBenchmarkReport + :param scheduler_result: Optional scheduler result for the last task. + :type scheduler_result: Optional[SchedulerResult] + """ + + completed: bool + count_total: int + count_completed: int + generation_modes: Sequence[ProfileGenerationMode] + report: TextGenerationBenchmarkReport + scheduler_result: Optional[SchedulerResult] = None + current_index: Optional[int] = None + current_profile: Optional[Profile] = None + + +class Executor: + """ + The Executor class manages the execution of tasks based on a given profile + generation mode and rate. It orchestrates the interaction between the backend, + request generator, and profile generator, and runs benchmarks accordingly. + + :param backend: The backend to run tasks against. + :type backend: Backend + :param request_generator: The generator that creates requests for execution. + :type request_generator: RequestGenerator + :param mode: The mode for profile generation (e.g., sweep, synchronous). + :type mode: ProfileGenerationMode + :param rate: The list of rates for load generation, or None. + :type rate: Optional[List[float]] + :param max_number: Maximum number of requests to generate for the scheduler + (a single report run), or None. + :type max_number: Optional[int] + :param max_duration: Maximum duration for generating requests for the scheduler, + (a single report run), or None. + :type max_duration: Optional[float] + """ + + def __init__( + self, + backend: Backend, + request_generator: RequestGenerator, + mode: ProfileGenerationMode = "sweep", + rate: Optional[Union[float, Sequence[float]]] = None, + max_number: Optional[int] = None, + max_duration: Optional[float] = None, + ): + self._backend = backend + self._generator = request_generator + self._max_number = max_number + self._max_duration = max_duration + self._profile_generator = ProfileGenerator(mode=mode, rate=rate) + logger.info("Executor initialized with mode: {}, rate: {}", mode, rate) + + @property + def backend(self) -> Backend: + """ + Returns the backend being used by the Executor. + + :return: Backend + :rtype: Backend + """ + return self._backend + + @property + def request_generator(self) -> RequestGenerator: + """ + Returns the request generator used by the Executor. + + :return: RequestGenerator + :rtype: RequestGenerator + """ + return self._generator + + @property + def profile_generator(self) -> ProfileGenerator: + """ + Returns the profile generator for generating profiles during execution. + + :return: ProfileGenerator + :rtype: ProfileGenerator + """ + return self._profile_generator + + @property + def max_number(self) -> Optional[int]: + """ + Returns the maximum number of requests to generate. + + :return: Maximum number of requests or None. + :rtype: Optional[int] + """ + return self._max_number + + @property + def max_duration(self) -> Optional[float]: + """ + Returns the maximum duration for generating requests. + + :return: Maximum duration in seconds or None. 
+ :rtype: Optional[float] + """ + return self._max_duration + + async def run(self) -> AsyncGenerator[ExecutorResult, None]: + """ + Runs the Executor, generating and scheduling tasks based on the profile + generation mode. Yields results incrementally. + + :rtype: AsyncGenerator[ExecutorResult, None] + """ + report = TextGenerationBenchmarkReport() + report.args = { + # backend args + "backend_type": self.backend.type_, + "target": self.backend.target, + "model": self.backend.model, + # data args + "data_type": self.request_generator.type_, + "data": self.request_generator.source, + "tokenizer": self.request_generator.tokenizer.name_or_path, + # rate args + "mode": self.profile_generator.mode, + "rate": self.profile_generator.rates, + # limits args + "max_number": self.max_number, + "max_duration": self.max_duration, + } + profile_index = -1 + logger.info("Starting Executor run") + + yield ExecutorResult( + completed=False, + count_total=len(self.profile_generator), + count_completed=0, + generation_modes=self.profile_generator.profile_generation_modes, + report=report, + ) + + while profile := self.profile_generator.next(report): + logger.debug("Generated profile: {}", profile) + scheduler = Scheduler( + generator=self.request_generator, + worker=self.backend, + mode=profile.load_gen_mode, + rate=profile.load_gen_rate, + max_number=self.max_number or profile.args.get("max_number", None), + max_duration=self.max_duration, + ) + profile_index += 1 + + logger.info( + "Scheduling tasks with mode: {}, rate: {}", + profile.load_gen_mode, + profile.load_gen_rate, + ) + + async for scheduler_result in scheduler.run(): + if scheduler_result.completed: + report.add_benchmark(scheduler_result.benchmark) + logger.debug( + "Benchmark added for scheduler result: {}", + scheduler_result.benchmark, + ) + + yield ExecutorResult( + completed=False, + count_total=len(self.profile_generator), + count_completed=len(report.benchmarks), + generation_modes=self.profile_generator.profile_generation_modes, + report=report, + scheduler_result=scheduler_result, + current_index=profile_index, + current_profile=profile, + ) + + logger.info("Executor run completed") + yield ExecutorResult( + completed=True, + count_total=len(self.profile_generator), + count_completed=len(report.benchmarks), + generation_modes=self.profile_generator.profile_generation_modes, + report=report, + ) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..757646cf668ed1be6983a955ead5d81460c6d71e --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/executor/profile_generator.py @@ -0,0 +1,350 @@ +from typing import Any, Dict, List, Literal, Optional, Sequence, Union, get_args + +import numpy as np +from loguru import logger +from numpy._typing import NDArray +from pydantic import Field + +from guidellm.config import settings +from guidellm.core import TextGenerationBenchmark, TextGenerationBenchmarkReport +from guidellm.core.serializable import Serializable +from guidellm.scheduler import LoadGenerationMode + +__all__ = [ + "Profile", + "ProfileGenerationMode", + "ProfileGenerator", +] + +ProfileGenerationMode = Literal[ + "sweep", "synchronous", "throughput", "constant", "poisson" +] + + +class Profile(Serializable): + """ + A data class representing a 
profile for load generation. + + :param load_gen_mode: The mode of load generation (e.g., constant, poisson). + :type load_gen_mode: LoadGenerationMode + :param load_gen_rate: The rate of load generation, if applicable. + :type load_gen_rate: Optional[float] + :param args: Additional arguments for the profile. + :type args: Optional[Dict[str, Any]] + """ + + load_gen_mode: LoadGenerationMode + load_gen_rate: Optional[float] = None + args: Dict[str, Any] = Field(default_factory=dict) + + +class ProfileGenerator: + """ + Generates profiles based on different load generation modes. + + :param mode: The mode for profile generation (e.g., sweep, synchronous). + :type mode: ProfileGenerationMode + :param rate: The rate(s) for load generation; could be a float or list of floats. + :type rate: Optional[Union[float, Sequence[float]]] + """ + + def __init__( + self, + mode: ProfileGenerationMode, + rate: Optional[Union[float, Sequence[float]]] = None, + ): + if mode not in get_args(ProfileGenerationMode): + err = ValueError( + f"{mode} is not a valid Profile Generation Mode. " + f"Valid options are {get_args(ProfileGenerationMode)}" + ) + logger.error(err) + raise err + + self._mode = mode + + if self._mode in ("sweep", "throughput", "synchronous"): + if rate is not None: + err = ValueError(f"Rates are not applicable for {self._mode} mode") + logger.error(err) + raise err + self._rates = None + else: + if not rate: + err = ValueError(f"Rates are required for {self._mode} mode") + logger.error(err) + raise err + self._rates = rate if isinstance(rate, Sequence) else [rate] + + for rt in self._rates: + if rt <= 0: + err = ValueError( + f"Rate must be > 0 for mode: {self._mode}. Given: {rt}" + ) + logger.error(err) + raise err + + self._generated_count = 0 + + def __len__(self) -> int: + """ + Returns the number of profiles to generate based on the mode and rates. + + :return: The number of profiles. + :rtype: int + """ + if self._mode == "sweep": + return settings.num_sweep_profiles + 2 + + if self._mode in ("throughput", "synchronous"): + return 1 + + if not self._rates: + raise ValueError(f"Rates are required for {self._mode} mode") + + return len(self._rates) + + @property + def mode(self) -> ProfileGenerationMode: + """ + Returns the current mode of profile generation. + + :return: The profile generation mode. + :rtype: ProfileGenerationMode + """ + return self._mode + + @property + def rates(self) -> Optional[Sequence[float]]: + """ + Returns the list of rates for load generation, if any. + + :return: Sequence of rates or None if not applicable. + :rtype: Optional[Sequence[float]] + """ + return self._rates + + @property + def generated_count(self) -> int: + """ + Returns the current count of generated profiles. + + :return: The current count of generated profiles. + :rtype: int + """ + return self._generated_count + + @property + def profile_generation_modes(self) -> Sequence[ProfileGenerationMode]: + """ + Return the list of profile modes to be run in the report. + + :return: Sequence of profile modes to be run in the report. 
+        :rtype: Sequence[ProfileGenerationMode]
+        """
+        if self._mode == "sweep":
+            return ["synchronous", "throughput"] + ["constant"] * (  # type: ignore # noqa: PGH003
+                settings.num_sweep_profiles
+            )
+
+        if self._mode in ["throughput", "synchronous"]:
+            return [self._mode]
+
+        if self._rates is None:
+            raise ValueError(f"Rates are required for {self._mode} mode")
+
+        if self._mode in ["constant", "poisson"]:
+            return [self._mode] * len(self._rates)
+
+        raise ValueError(f"Invalid mode: {self._mode}")
+
+    def next(self, current_report: TextGenerationBenchmarkReport) -> Optional[Profile]:
+        """
+        Generates the next profile based on the current mode and report.
+
+        :param current_report: The current benchmark report.
+        :type current_report: TextGenerationBenchmarkReport
+        :return: The generated profile or None if no more profiles.
+        :rtype: Optional[Profile]
+        """
+        logger.debug(
+            "Generating the next profile with mode: {}, current report: {}",
+            self.mode,
+            current_report,
+        )
+
+        if self.mode in ["constant", "poisson"]:
+            if not self.rates:
+                err = ValueError(f"Rates are required for {self.mode} mode")
+                logger.error(err)
+                raise err
+
+            profile = self.create_fixed_rate_profile(
+                self.generated_count,
+                self.mode,
+                self.rates,
+            )
+        elif self.mode == "synchronous":
+            profile = self.create_synchronous_profile(self.generated_count)
+        elif self.mode == "throughput":
+            profile = self.create_throughput_profile(self.generated_count)
+        elif self.mode == "sweep":
+            profile = self.create_sweep_profile(
+                self.generated_count,
+                sync_benchmark=(
+                    current_report.benchmarks[0] if current_report.benchmarks else None
+                ),
+                throughput_benchmark=(
+                    current_report.benchmarks[1]
+                    if len(current_report.benchmarks) > 1
+                    else None
+                ),
+            )
+        else:
+            err = ValueError(f"Invalid mode: {self.mode}")
+            logger.error(err)
+            raise err
+
+        self._generated_count += 1
+        logger.info(
+            "Generated profile: {}, total generated count: {}",
+            profile,
+            self._generated_count,
+        )
+        return profile
+
+    @staticmethod
+    def create_fixed_rate_profile(
+        index: int, mode: ProfileGenerationMode, rates: Sequence[float]
+    ) -> Optional[Profile]:
+        """
+        Creates a profile with a fixed rate.
+
+        :param index: The index of the rate in the list.
+        :type index: int
+        :param mode: The mode for profile generation (e.g., constant, poisson).
+        :type mode: ProfileGenerationMode
+        :param rates: The list of rates for load generation.
+        :type rates: Sequence[float]
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        modes_map: Dict[str, LoadGenerationMode] = {
+            "constant": "constant",
+            "poisson": "poisson",
+        }
+
+        if mode not in modes_map:
+            err = ValueError(f"Invalid mode: {mode}")
+            logger.error(err)
+            raise err
+
+        profile = (
+            Profile(
+                load_gen_mode=modes_map[mode],
+                load_gen_rate=rates[index],
+            )
+            if index < len(rates)
+            else None
+        )
+        logger.debug("Created fixed rate profile: {}", profile)
+        return profile
+
+    @staticmethod
+    def create_synchronous_profile(index: int) -> Optional[Profile]:
+        """
+        Creates a profile with synchronous mode.
+
+        :param index: The index of the profile to create.
+        :type index: int
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        profile = (
+            Profile(
+                load_gen_mode="synchronous",
+                load_gen_rate=None,
+            )
+            if index < 1
+            else None
+        )
+        logger.debug("Created synchronous profile: {}", profile)
+        return profile
+
+    @staticmethod
+    def create_throughput_profile(index: int) -> Optional[Profile]:
+        """
+        Creates a profile with throughput mode.
+
+        :param index: The index of the profile to create.
+        :type index: int
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        profile = (
+            Profile(
+                load_gen_mode="throughput",
+                load_gen_rate=None,
+            )
+            if index < 1
+            else None
+        )
+        logger.debug("Created throughput profile: {}", profile)
+        return profile
+
+    @staticmethod
+    def create_sweep_profile(
+        index: int,
+        sync_benchmark: Optional[TextGenerationBenchmark],
+        throughput_benchmark: Optional[TextGenerationBenchmark],
+    ) -> Optional[Profile]:
+        """
+        Creates a profile with sweep mode, generating profiles between
+        synchronous and throughput benchmarks.
+
+        :param index: The index of the profile to create.
+        :type index: int
+        :param sync_benchmark: The synchronous benchmark data.
+        :type sync_benchmark: Optional[TextGenerationBenchmark]
+        :param throughput_benchmark: The throughput benchmark data.
+        :type throughput_benchmark: Optional[TextGenerationBenchmark]
+        :return: The generated profile or None if index is out of range.
+        :rtype: Optional[Profile]
+        """
+        if index < 0 or index >= settings.num_sweep_profiles + 2:
+            return None
+
+        if index == 0:
+            return ProfileGenerator.create_synchronous_profile(0)
+
+        if not sync_benchmark:
+            err = ValueError("Synchronous benchmark is required for sweep mode")
+            logger.error(err)
+            raise err
+
+        if index == 1:
+            throughput_profile: Profile = ProfileGenerator.create_throughput_profile(0)  # type: ignore # noqa: PGH003
+            # cap the throughput run at 5x the synchronous request count in case
+            # max_number is not set, so the sweep's request volume stays bounded
+            throughput_profile.args = {"max_number": sync_benchmark.request_count * 5}
+            return throughput_profile
+
+        if not throughput_benchmark:
+            err = ValueError("Throughput benchmark is required for sweep mode")
+            logger.error(err)
+            raise err
+
+        min_rate = sync_benchmark.completed_request_rate
+        max_rate = throughput_benchmark.completed_request_rate
+        intermediate_rates: List[NDArray] = list(
+            np.linspace(min_rate, max_rate, settings.num_sweep_profiles + 1)
+        )[1:]
+
+        return Profile(
+            load_gen_mode="constant",
+            load_gen_rate=(
+                float(load_gen_rate)
+                if (load_gen_rate := intermediate_rates[index - 2])
+                else 1.0  # fallback if the interpolated rate is zero
+            ),
+        )
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..f26966c029ac8e173031822233e971ec7512144b
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/logger.py
@@ -0,0 +1,83 @@
+"""
+Logger configuration for GuideLLM.
+
+This module provides a flexible logging configuration using the loguru library.
+It supports console and file logging with options to configure via environment
+variables or direct function calls.
+
+Environment Variables:
+    - GUIDELLM__LOGGING__DISABLED: Disable logging (default: false).
+    - GUIDELLM__LOGGING__CLEAR_LOGGERS: Clear existing loggers
+      from loguru (default: true).
+    - GUIDELLM__LOGGING__LOG_LEVEL: Log level for console logging
+      (default: none, options: DEBUG, INFO, WARNING, ERROR, CRITICAL).
+    - GUIDELLM__LOGGING__FILE: Path to the log file for file logging
+      (default: guidellm.log if log file level set else none)
+    - GUIDELLM__LOGGING__FILE_LEVEL: Log level for file logging
+      (default: INFO if log file set else none).
+
+Usage:
+    from guidellm import configure_logger, logger
+    from guidellm.config import LoggingSettings
+
+    # configure the logger explicitly
+    configure_logger(
+        config=LoggingSettings(
+            disabled=False,
+            clear_loggers=True,
+            console_log_level="DEBUG",
+            log_file=None,
+            log_file_level=None,
+        )
+    )
+
+    logger.debug("This is a debug message")
+    logger.info("This is an info message")
+"""
+
+import sys
+
+from loguru import logger
+
+from guidellm.config import LoggingSettings, settings
+
+__all__ = ["configure_logger", "logger"]
+
+
+def configure_logger(config: LoggingSettings = settings.logging):
+    """
+    Configure the logger for GuideLLM.
+    This function sets up the console and file logging
+    as per the specified or default parameters.
+
+    Note: Environment variables take precedence over the function parameters.
+
+    :param config: The configuration for the logger to use.
+    :type config: LoggingSettings
+    """
+
+    if config.disabled:
+        logger.disable("guidellm")
+        return
+
+    logger.enable("guidellm")
+
+    if config.clear_loggers:
+        logger.remove()
+
+    # log as a human readable string with the time, function, level, and message
+    logger.add(
+        sys.stdout,
+        level=config.console_log_level.upper(),
+        format="{time} | {function} | {level} - {message}",
+    )
+
+    if config.log_file or config.log_file_level:
+        log_file = config.log_file or "guidellm.log"
+        log_file_level = config.log_file_level or "INFO"
+        # log as json to the file for easier parsing
+        logger.add(log_file, level=log_file_level.upper(), serialize=True)
+
+
+# invoke logger setup on import with default values
+# enabling console logging with INFO and disabling file logging
+configure_logger()
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..4748b12d92126698ad18e55388db5e6491293cb6
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/main.py
@@ -0,0 +1,341 @@
+import asyncio
+from typing import Literal, Optional, Union, get_args
+
+import click
+from loguru import logger
+
+from guidellm.backend import Backend, BackendEnginePublic
+from guidellm.core import GuidanceReport, TextGenerationBenchmarkReport
+from guidellm.executor import Executor, ProfileGenerationMode
+from guidellm.request import (
+    EmulatedRequestGenerator,
+    FileRequestGenerator,
+    TransformersDatasetRequestGenerator,
+)
+from guidellm.request.base import RequestGenerator
+from guidellm.utils import BenchmarkReportProgress, cli_params
+
+__all__ = ["generate_benchmark_report"]
+
+
+@click.command()
+@click.option(
+    "--target",
+    type=str,
+    required=True,
+    help=(
+        "The target path or url for the backend to evaluate. "
+        "Ex: 'http://localhost:8000/v1'"
+    ),
+)
+@click.option(
+    "--backend",
+    type=click.Choice(get_args(BackendEnginePublic)),
+    default="openai_server",
+    help=(
+        "The backend to use for benchmarking. "
+        "The default is OpenAI Server, enabling compatibility with any server that "
+        "follows the OpenAI spec, including vLLM."
+    ),
+)
+@click.option(
+    "--model",
+    type=str,
+    default=None,
+    help=(
+        "The model to use for benchmarking. If not provided, it will use "
+        "the first available model, provided the backend supports listing models."
+    ),
+)
+@click.option(
+    "--data",
+    type=str,
+    required=True,
+    help=(
+        "The data source to use for benchmarking. "
+        "Depending on the data-type, it should be a "
+        "path to a data file containing prompts to run (ex: data.txt), "
+        "a HuggingFace dataset name (ex: 'neuralmagic/LLM_compression_calibration'), "
+        "or a configuration for emulated data "
+        "(ex: 'prompt_tokens=128,generated_tokens=128')."
+    ),
+)
+@click.option(
+    "--data-type",
+    type=click.Choice(["emulated", "file", "transformers"]),
+    required=True,
+    help=(
+        "The type of data to use for benchmarking. "
+        "Use 'emulated' for synthetic data, 'file' for a file, or 'transformers' "
+        "for a HuggingFace dataset. Specify the data source with the --data flag."
+    ),
+)
+@click.option(
+    "--tokenizer",
+    type=str,
+    default=None,
+    help=(
+        "The tokenizer to use for calculating the number of prompt tokens. "
+        "This should match the tokenizer used by the model. "
+        "By default, it will use the --model flag to determine the tokenizer. "
+        "If not provided and the model is not available, will raise an error. "
+        "Ex: 'neuralmagic/Meta-Llama-3.1-8B-quantized.w8a8'"
+    ),
+)
+@click.option(
+    "--rate-type",
+    type=click.Choice(get_args(ProfileGenerationMode)),
+    default="sweep",
+    help=(
+        "The type of request rate to use for benchmarking. "
+        "Use sweep to run a full range from synchronous to throughput (default), "
+        "synchronous for sending requests one after the other, "
+        "throughput to send requests as fast as possible, "
+        "constant for a fixed request rate, "
+        "or poisson for a real-world variable request rate."
+    ),
+)
+@click.option(
+    "--rate",
+    type=float,
+    default=None,
+    help=(
+        "The request rate to use for constant and poisson rate types. "
+        "To run multiple rates, provide the flag multiple times."
+    ),
+    multiple=True,
+)
+@click.option(
+    "--max-seconds",
+    type=int,
+    default=120,
+    help=(
+        "The maximum number of seconds for each benchmark run. "
+        "Either max-seconds, max-requests, or both must be set. "
+        "The default is 120 seconds. "
+        "Note, this is the maximum time for each rate supplied, not the total time. "
+        "This value should be large enough to allow for "
+        "the server's performance to stabilize."
+    ),
+)
+@click.option(
+    "--max-requests",
+    type=cli_params.MAX_REQUESTS,
+    default=None,
+    help=(
+        "The maximum number of requests for each benchmark run. "
+        "Either max-seconds, max-requests, or both must be set. "
+        "Note, this is the maximum number of requests for each rate supplied, "
+        "not the total number of requests. "
+        "This value should be large enough to allow for "
+        "the server's performance to stabilize."
+    ),
+)
+@click.option(
+    "--output-path",
+    type=str,
+    default=None,
+    help=(
+        "The output path to save the output report to for loading later. "
+        "Ex: guidance_report.json. "
+        "The default is None, meaning no output is saved and results are only "
+        "printed to the console."
+    ),
+)
+@click.option(
+    "--enable-continuous-refresh",
+    is_flag=True,
+    default=False,
+    help=(
+        "Enable continual refreshing of the output table in the CLI "
+        "until the user exits. "
" + ), +) +def generate_benchmark_report_cli( + target: str, + backend: BackendEnginePublic, + model: Optional[str], + data: Optional[str], + data_type: Literal["emulated", "file", "transformers"], + tokenizer: Optional[str], + rate_type: ProfileGenerationMode, + rate: Optional[float], + max_seconds: Optional[int], + max_requests: Union[Literal["dataset"], int, None], + output_path: str, + enable_continuous_refresh: bool, +): + """ + Generate a benchmark report for a specified backend and dataset. + """ + generate_benchmark_report( + target=target, + backend=backend, + model=model, + data=data, + data_type=data_type, + tokenizer=tokenizer, + rate_type=rate_type, + rate=rate, + max_seconds=max_seconds, + max_requests=max_requests, + output_path=output_path, + cont_refresh_table=enable_continuous_refresh, + ) + + +def generate_benchmark_report( + target: str, + data: Optional[str], + data_type: Literal["emulated", "file", "transformers"], + backend: BackendEnginePublic="openai_server", + model: Optional[str]=None, + tokenizer: Optional[str]=None, + rate_type: ProfileGenerationMode="sweep", + rate: Optional[float]=None, + max_seconds: Optional[int]=120, + max_requests: Union[Literal["dataset"], int, None]=None, + output_path: str=None, + cont_refresh_table: bool=False, +) -> GuidanceReport: + """ + Generate a benchmark report for a specified backend and dataset. + + :param target: The target URL or path for the backend to evaluate. + :param backend: The backend type to use for benchmarking. + :param model: The model to benchmark; + defaults to the first available if not specified. + :param data: The data source for benchmarking, + which may be a path, dataset name, or config. + :param data_type: The type of data to use, + such as 'emulated', 'file', or 'transformers'. + :param tokenizer: The tokenizer to use for token counting, + defaulting to Llama 3.1 if not provided. + :param rate_type: The rate type for requests during benchmarking. + :param rate: The specific request rate for constant and poisson rate types. + :param max_seconds: Maximum duration for each benchmark run in seconds. + :param max_requests: Maximum number of requests per benchmark run. + :param output_path: Path to save the output report file. + :param cont_refresh_table: Continually refresh the table in the CLI + until the user exits. 
+ """ + logger.info( + "Generating benchmark report with target: {}, backend: {}", target, backend + ) + + # Create backend + backend_inst = Backend.create( + backend_type=backend, + target=target, + model=model, + ) + + request_generator: RequestGenerator + + # Create tokenizer and request generator + tokenizer_inst = tokenizer + if not tokenizer_inst: + try: + tokenizer_inst = backend_inst.model_tokenizer() + except Exception as err: + raise ValueError( + "Could not load model's tokenizer, " + "--tokenizer must be provided for request generation" + ) from err + + if data_type == "emulated": + request_generator = EmulatedRequestGenerator( + config=data, tokenizer=tokenizer_inst + ) + elif data_type == "file": + request_generator = FileRequestGenerator(path=data, tokenizer=tokenizer_inst) + elif data_type == "transformers": + request_generator = TransformersDatasetRequestGenerator( + dataset=data, tokenizer=tokenizer_inst + ) + else: + raise ValueError(f"Unknown data type: {data_type}") + + if data_type == "emulated" and max_requests == "dataset": + raise ValueError("Cannot use 'dataset' for emulated data") + + # Create executor + executor = Executor( + backend=backend_inst, + request_generator=request_generator, + mode=rate_type, + rate=rate if rate_type in ("constant", "poisson") else None, + max_number=( + len(request_generator) if max_requests == "dataset" else max_requests + ), + max_duration=max_seconds, + ) + + # Run executor + logger.debug( + "Running executor with args: {}", + { + "backend": backend, + "request_generator": request_generator, + "mode": rate_type, + "rate": rate, + "max_number": max_requests, + "max_duration": max_seconds, + }, + ) + report = asyncio.run(_run_executor_for_result(executor)) + + # Save and print report + guidance_report = GuidanceReport() + guidance_report.benchmarks.append(report) + + if output_path: + guidance_report.save_file(output_path) + + guidance_report.print( + save_path=output_path if output_path is not None else "stdout", + continual_refresh=cont_refresh_table, + ) + + return guidance_report + + +async def _run_executor_for_result(executor: Executor) -> TextGenerationBenchmarkReport: + report = None + progress = BenchmarkReportProgress() + started = False + + async for result in executor.run(): + if not started: + progress.start(result.generation_modes) # type: ignore # noqa: PGH003 + started = True + + if result.current_index is not None: + description = f"{result.current_profile.load_gen_mode}" # type: ignore # noqa: PGH003 + if result.current_profile.load_gen_mode in ("constant", "poisson"): # type: ignore # noqa: PGH003 + description += f"@{result.current_profile.load_gen_rate:.2f} req/s" # type: ignore # noqa: PGH003 + + progress.update_benchmark( + index=result.current_index, + description=description, + completed=result.scheduler_result.completed, # type: ignore # noqa: PGH003 + completed_count=result.scheduler_result.count_completed, # type: ignore # noqa: PGH003 + completed_total=result.scheduler_result.count_total, # type: ignore # noqa: PGH003 + start_time=result.scheduler_result.benchmark.start_time, # type: ignore # noqa: PGH003 + req_per_sec=result.scheduler_result.benchmark.completed_request_rate, # type: ignore # noqa: PGH003 + ) + + if result.completed: + report = result.report + break + + progress.finish() + + if not report: + raise ValueError("No report generated by executor") + + return report + + +if __name__ == "__main__": + generate_benchmark_report_cli() diff --git 
a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4feca91cdbbe9a137bd8ad404394116a50868360 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/__init__.py @@ -0,0 +1,13 @@ +from .base import GenerationMode, RequestGenerator +from .emulated import EmulatedConfig, EmulatedRequestGenerator +from .file import FileRequestGenerator +from .transformers import TransformersDatasetRequestGenerator + +__all__ = [ + "EmulatedConfig", + "EmulatedRequestGenerator", + "FileRequestGenerator", + "GenerationMode", + "RequestGenerator", + "TransformersDatasetRequestGenerator", +] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd303e605f7043408c7751733c75e7429caa726 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/base.py @@ -0,0 +1,200 @@ +import contextlib +import threading +import time +from abc import ABC, abstractmethod +from queue import Empty, Full, Queue +from typing import Iterator, Literal, Union + +from loguru import logger +from transformers import ( # type: ignore # noqa: PGH003 + AutoTokenizer, + PreTrainedTokenizer, +) + +from guidellm.core.request import TextGenerationRequest + +__all__ = ["GenerationMode", "RequestGenerator"] + + +GenerationMode = Literal["async", "sync"] + + +class RequestGenerator(ABC): + """ + A base class for request generators that generate result requests. + + :param type_: The type of the request generator. + :type type_: str + :param source: The data source for the request generator. + :type source: str + :param tokenizer: The tokenizer instance or the name/config to use + for tokenizing prompts. + :type tokenizer: Union[str, PreTrainedTokenizer] + :param mode: The generation mode, either 'async' or 'sync'. + :type mode: GenerationMode + :param async_queue_size: The size of the request queue. + :type async_queue_size: int + """ + + def __init__( + self, + type_: str, + source: str, + tokenizer: Union[str, PreTrainedTokenizer], + mode: GenerationMode = "async", + async_queue_size: int = 50, + ): + self._type = type_ + self._source = source + self._async_queue_size: int = async_queue_size + self._mode: str = mode + self._queue: Queue = Queue(maxsize=async_queue_size) + self._stop_event: threading.Event = threading.Event() + + if not tokenizer: + err = "Tokenizer must be provided for request generation" + logger.error(err) + raise ValueError(err) + + self._tokenizer = ( + AutoTokenizer.from_pretrained(tokenizer) + if isinstance(tokenizer, str) + else tokenizer + ) + logger.info("Tokenizer initialized for request generation: {}", self._tokenizer) + + if self._mode == "async": + self._thread = threading.Thread(target=self._populate_queue, daemon=True) + self._thread.start() + logger.info( + "RequestGenerator started in async mode with queue size: {}", + self._async_queue_size, + ) + + def __repr__(self) -> str: + """ + Return a string representation of the RequestGenerator. + + :return: String representation of the RequestGenerator. 
+ :rtype: str + """ + return ( + f"RequestGenerator(" + f"mode={self._mode}, " + f"async_queue_size={self._async_queue_size}, " + f"tokenizer={self._tokenizer})" + ) + + def __iter__(self) -> Iterator[TextGenerationRequest]: + """ + Provide an iterator interface to generate new requests. + + :return: An iterator over result requests. + :rtype: Iterator[TextGenerationRequest] + """ + if self.mode == "async": + while not self._stop_event.is_set(): + try: + item = self._queue.get_nowait() + self._queue.task_done() + yield item + except Empty: + time.sleep(0.01) + continue + else: + while not self._stop_event.is_set(): + yield self.create_item() + + @abstractmethod + def __len__(self) -> int: + """ + Abstract method to get the length of the collection to be generated. + """ + + @abstractmethod + def create_item(self) -> TextGenerationRequest: + """ + Abstract method to create a new result request item. + + :return: A new result request. + :rtype: TextGenerationRequest + """ + + @property + def type_(self) -> str: + """ + Get the type of the request generator. + + :return: The type of the request generator. + :rtype: str + """ + return self._type + + @property + def source(self) -> str: + """ + Get the data source for the request generator. + + :return: The data source. + :rtype: str + """ + return self._source + + @property + def tokenizer(self) -> PreTrainedTokenizer: + """ + Get the tokenizer instance. + + :return: The tokenizer instance. + :rtype: PreTrainedTokenizer + """ + return self._tokenizer + + @property + def mode(self) -> str: + """ + Get the generation mode. + + :return: The generation mode. + :rtype: str + """ + return self._mode + + @property + def async_queue_size(self) -> int: + """ + Get the size of the request queue. + + :return: The size of the request queue. + :rtype: int + """ + return self._async_queue_size + + def stop(self): + """ + Stop the background task that populates the queue. + """ + logger.info("Stopping RequestGenerator...") + self._stop_event.set() + if self._mode == "async": + self._thread.join() + logger.info("RequestGenerator stopped") + + def _populate_queue(self): + """ + Populate the request queue in the background. + """ + + while not self._stop_event.is_set(): + with contextlib.suppress(Full): + if self._queue.qsize() < self._async_queue_size: + item = self.create_item() + self._queue.put(item, timeout=0.1) + logger.debug( + "Item added to queue. 
Current queue size: {}", + self._queue.qsize(), + ) + else: + time.sleep(0.1) + + logger.info("RequestGenerator stopped populating queue") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py new file mode 100644 index 0000000000000000000000000000000000000000..02f564a1ceecd9e977ce0b8d5c37a0394adeb69a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/emulated.py @@ -0,0 +1,416 @@ +import json +import math +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import numpy as np +from loguru import logger +from transformers import PreTrainedTokenizer # type: ignore # noqa: PGH003 + +from guidellm.config import settings +from guidellm.core.request import TextGenerationRequest +from guidellm.request.base import GenerationMode, RequestGenerator +from guidellm.utils import clean_text, filter_text, load_images, load_text, split_text + +__all__ = ["EmulatedConfig", "EmulatedRequestGenerator", "EndlessTokens"] + + +@dataclass +class EmulatedConfig: + """ + Configuration for emulated text generation requests. + + Args: + prompt_tokens (int): Number of prompt tokens. + prompt_tokens_variance (Optional[int]): Variance for prompt tokens. + prompt_tokens_min (Optional[int]): Minimum number of prompt tokens. + prompt_tokens_max (Optional[int]): Maximum number of prompt tokens. + generated_tokens (Optional[int]): Number of generated tokens. + generated_tokens_variance (Optional[int]): Variance for generated tokens. + generated_tokens_min (Optional[int]): Minimum number of generated tokens. + generated_tokens_max (Optional[int]): Maximum number of generated tokens. + images (Optional[int]): Number of images. + width (Optional[int]): Width of images. + height (Optional[int]): Height of images. + """ + + @staticmethod + def create_config(config: Optional[Union[str, Path, Dict]]) -> "EmulatedConfig": + """ + Create an EmulatedConfig instance from a configuration source. + + :param config: Configuration source, can be a dictionary, JSON string, + key=value string, or file path. + :type config: Union[str, Path, Dict] + :return: An instance of EmulatedConfig. + :rtype: EmulatedConfig + :raises FileNotFoundError: If the configuration file is not found. + :raises ValueError: If the configuration format is invalid. 
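+
+        Example inputs (illustrative of the accepted formats parsed below):
+
+        - dict: ``{"prompt_tokens": 128, "generated_tokens": 128}``
+        - JSON string, or path to a ``.json`` file, with the same keys
+        - key=value string: ``"prompt_tokens=128,generated_tokens=128"``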
+ """ + if not config: + logger.debug("Creating default configuration") + return EmulatedConfig(prompt_tokens=1024, generated_tokens=256, images=0) + + if isinstance(config, dict): + logger.debug("Loading configuration from dict: {}", config) + return EmulatedConfig(**config) + + if isinstance(config, Path) or ( + isinstance(config, str) and (config.endswith(".json") or "{" in config) + ): + logger.debug("Loading configuration from json: {}", config) + + if isinstance(config, str) and "{" in config: + json_text = config.strip() + else: + if isinstance(config, str): + config = Path(config) + + if not config.exists(): + raise FileNotFoundError(f"Configuration file not found: {config}") + + json_text = config.read_text(encoding="utf-8") + + json_dict = json.loads(json_text) + + return EmulatedConfig(**json_dict) + + if isinstance(config, str) and "=" in config: + logger.debug("Loading configuration from csv string: {}", config) + items = config.split(",") + config_dict = {} + for item in items: + key_value = item.strip().split("=") + if len(key_value) != 2: # noqa: PLR2004 + raise ValueError(f"Unexpected format for item: {item}") + key = key_value[0].strip() + value = ( + int(key_value[1].strip()) + if key_value[1].isnumeric() + else key_value[1] + ) + config_dict[key] = value + + return EmulatedConfig(**config_dict) # type: ignore # noqa: PGH003 + + raise ValueError( + f"Invalid configuration given for creation of EmulatedConfig: {config}" + ) + + prompt_tokens: int + prompt_tokens_variance: Optional[int] = None + prompt_tokens_min: Optional[int] = None + prompt_tokens_max: Optional[int] = None + + generated_tokens: Optional[int] = None + generated_tokens_variance: Optional[int] = None + generated_tokens_min: Optional[int] = None + generated_tokens_max: Optional[int] = None + + images: int = 0 + width: int = None + height: int = None + + @property + def prompt_tokens_range(self) -> Tuple[int, int]: + """ + Get the range (min, max) of prompt tokens to generate. + + :return: The range of prompt tokens. + :rtype: Tuple[int, int] + """ + return self._token_range( + self.prompt_tokens, + self.prompt_tokens_variance, + self.prompt_tokens_min, + self.prompt_tokens_max, + ) + + @property + def output_tokens_range(self) -> Tuple[int, int]: + """ + Get the range (min, max) of output tokens to generate. + + :return: The range of generated tokens. + :rtype: Tuple[int, int] + """ + if not self.generated_tokens: + return 0, 0 + + return self._token_range( + self.generated_tokens, + self.generated_tokens_variance, + self.generated_tokens_min, + self.generated_tokens_max, + ) + + def sample_prompt_tokens(self, rng: np.random.Generator) -> int: + """ + Sample the number of prompt tokens to generate. + + :param rng: The random number generator to use. + :type rng: np.random.Generator + :return: The number of prompt tokens to create. + :rtype: int + """ + return self._sample_tokens( + self.prompt_tokens, + self.prompt_tokens_variance, + self.prompt_tokens_min, + self.prompt_tokens_max, + rng, + ) + + def sample_output_tokens(self, rng: np.random.Generator) -> Optional[int]: + """ + Sample the number of output tokens to generate. + + :param rng: The random number generator to use. + :type rng: np.random.Generator + :return: The number of output tokens to generate. 
+ :rtype: Optional[int] + """ + if not self.generated_tokens: + return None + + return self._sample_tokens( + self.generated_tokens, + self.generated_tokens_variance, + self.generated_tokens_min, + self.generated_tokens_max, + rng, + ) + + @staticmethod + def _sample_tokens( + base: int, + variance: Optional[int], + min_tokens: Optional[int], + max_tokens: Optional[int], + rng: np.random.Generator, + ) -> int: + min_tokens, max_tokens = EmulatedConfig._token_range( + base, variance, min_tokens, max_tokens + ) + + if min_tokens == max_tokens: + return min_tokens + + if not variance: + return rng.integers(min_tokens, max_tokens + 1) + + rand = rng.normal(base, math.sqrt(variance)) + + return int(min(max(rand, min_tokens), max_tokens)) + + @staticmethod + def _token_range( + base: int, + variance: Optional[int], + min_tokens: Optional[int], + max_tokens: Optional[int], + ) -> Tuple[int, int]: + if not variance: + return ( + min_tokens or base, + max_tokens or base, + ) + + min_tokens = min_tokens if min_tokens and min_tokens > 0 else 1 + max_tokens = ( + max_tokens if max_tokens and max_tokens > base else base + 5 * variance + ) + + return min_tokens, max_tokens + + +class EndlessTokens(List[str]): + """ + A list subclass that allows for endless data generation. + """ + + def __init__( + self, + data: Union[str, Path], + filter_start: Optional[Union[str, int]] = None, + filter_end: Optional[Union[str, int]] = None, + clean_text_args: Optional[Dict[str, bool]] = None, + ): + """ + Initialize EndlessDataWords with data. + + :param data: Source text data. + :type data: str + """ + logger.debug("Loading data from: {}", data) + data = load_text(data) + data = filter_text(data, filter_start, filter_end) + data = ( + clean_text(data) + if not clean_text_args + else clean_text(data, **clean_text_args) + ) + self._tokens, self._token_separators, self._line_indices = split_text(data) + + super().__init__(self._tokens) + + @property + def line_indices(self) -> List[int]: + """ + Get the list of start indices for lines. + + :return: List of start indices. + :rtype: List[int] + """ + return self._line_indices + + def create_text(self, start: int, length: int) -> str: + """ + Create a text snippet from the specified range. + + :param start: Start index. + :type start: int + :param length: Length of the snippet. + :type length: int + :return: Text snippet. + :rtype: str + """ + start = start % len(self) + text = "" + buff_token_sep = "" + + for counter in range(length): + index = (start + counter) % len(self) + text += buff_token_sep + self[index] + buff_token_sep = self._token_separators[index] + + return text + + +class EmulatedRequestGenerator(RequestGenerator): + """ + A request generator that generates emulated requests based on a configuration. + + :param config: The configuration string, file path, or dictionary. + :type config: Union[str, Dict, Path] + :param random_seed: The random seed to use for generating requests. + :type random_seed: Optional[int] + :param tokenizer: The tokenizer instance or the name/config to use + for tokenizing prompts. + :type tokenizer: Optional[Union[str, PreTrainedTokenizer]] + :param mode: The generation mode, either 'async' or 'sync'. + :type mode: GenerationMode + :param async_queue_size: The size of the request queue. 
+ :type async_queue_size: int + """ + + def __init__( + self, + config: Optional[Union[str, Path, Dict]], + random_seed: Optional[int] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + mode: GenerationMode = "async", + async_queue_size: int = 50, + ): + """ + Initialize EmulatedRequestGenerator with configuration and tokenizer. + + :param config: Configuration source, can be a dictionary, + JSON string, or file path. + :type config: Optional[Union[str, Path, Dict]] + :param random_seed: Optional seed for random number generator. + :type random_seed: Optional[int] + :param tokenizer: Tokenizer instance or configuration for tokenizing prompts. + :type tokenizer: Optional[Union[str, PreTrainedTokenizer]] + :param mode: Mode of request generation, either 'async' or 'sync'. + :type mode: str + :param async_queue_size: Size of the asynchronous queue. + :type async_queue_size: int + """ + self._config = EmulatedConfig.create_config(config) + self._tokens = EndlessTokens( + settings.emulated_data.source, + settings.emulated_data.filter_start, + settings.emulated_data.filter_end, + ) + if self._config.images > 0: + self._images = load_images(settings.emulated_data.image_source, [self._config.width, self._config.height]) + self._rng = np.random.default_rng(random_seed) + + # NOTE: Must be after all the parameters since the queue population + # function requires attributes above + super().__init__( + type_="emulated", + source=str(config), + tokenizer=tokenizer, + mode=mode, + async_queue_size=async_queue_size, + ) + + def __len__(self) -> int: + raise NotImplementedError( + "Can't get the length of the emulated dataset. " + "Check the `--data-type` CLI parameter." + ) + + def create_item(self) -> TextGenerationRequest: + """ + Create a new text generation request item from the data. + + :return: A new text generation request. + :rtype: TextGenerationRequest + """ + logger.debug("Creating new text generation request") + target_prompt_token_count = self._config.sample_prompt_tokens(self._rng) + prompt = self.sample_prompt(target_prompt_token_count) + images = self.sample_images() + prompt_token_count = len(self.tokenizer.tokenize(prompt)) + output_token_count = self._config.sample_output_tokens(self._rng) + logger.debug("Generated prompt: {}", prompt) + + return TextGenerationRequest( + prompt=prompt, + prompt_token_count=prompt_token_count, + output_token_count=output_token_count, + images=images, + ) + + def sample_prompt(self, tokens: int) -> str: + """ + Sample a prompt with the specified number of tokens. + + :param tokens: Number of tokens for the prompt. + :type tokens: int + :return: Sampled prompt text. 
+        :rtype: str
+        """
+        start_line_index = self._rng.integers(0, len(self._tokens.line_indices))
+
+        # binary search to find the proper number of tokens for the prompt
+        # this is because tokenizers differ in tokenization behavior
+        left = 0
+        right = left + 5 * tokens
+
+        while left < right:
+            mid = (left + right) // 2
+            prompt = self._tokens.create_text(start_line_index, mid)
+            token_count = len(self.tokenizer.tokenize(prompt))
+
+            if token_count == tokens:
+                return prompt
+
+            if token_count < tokens:
+                left = mid + 1
+            else:
+                right = mid
+
+        return self._tokens.create_text(start_line_index, left)
+
+    def sample_images(self):
+        """
+        Sample the configured number of images for a request.
+
+        :return: A list of sampled images (empty if no images are configured).
+        """
+        if not self._config.images:
+            # guard: self._images is only loaded in __init__ when images > 0
+            return []
+
+        image_indices = self._rng.choice(
+            len(self._images),
+            size=self._config.images,
+            replace=False,
+        )
+
+        return [self._images[i] for i in image_indices]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py
new file mode 100644
index 0000000000000000000000000000000000000000..b187f7b46b343311daa32fe465d60fff163beff2
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/file.py
@@ -0,0 +1,83 @@
+from pathlib import Path
+from typing import Optional, Union
+
+from loguru import logger
+from transformers import PreTrainedTokenizer  # type: ignore # noqa: PGH003
+
+from guidellm.config import settings
+from guidellm.core.request import TextGenerationRequest
+from guidellm.request.base import GenerationMode, RequestGenerator
+from guidellm.utils import load_text_lines
+
+__all__ = ["FileRequestGenerator"]
+
+
+class FileRequestGenerator(RequestGenerator):
+    """
+    A request generator implementation for files.
+
+    :param path: The path to the file containing the data.
+    :type path: Optional[Union[str, Path]]
+    :param tokenizer: The tokenizer instance or the name/config to use
+        for tokenizing prompts.
+    :type tokenizer: Union[str, PreTrainedTokenizer]
+    :param mode: The generation mode, either 'async' or 'sync'.
+    :type mode: str
+    :param async_queue_size: The size of the request queue.
+    :type async_queue_size: int
+    """
+
+    def __init__(
+        self,
+        path: Optional[Union[str, Path]],
+        tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
+        mode: GenerationMode = "async",
+        async_queue_size: int = 50,
+    ):
+        if not path:
+            raise ValueError("File path must be provided for FileRequestGenerator")
+
+        self._path = path
+        self._data = load_text_lines(
+            path,
+            filters=settings.dataset.preferred_data_columns,
+        )
+        self._iterator = iter(self._data)
+
+        # NOTE: Must be after all the parameters since the queue population
+        # function requires attributes above
+        super().__init__(
+            type_="file",
+            source=str(path),
+            tokenizer=tokenizer,
+            mode=mode,
+            async_queue_size=async_queue_size,
+        )
+
+    def __len__(self) -> int:
+        """
+        Return the number of text lines.
+        """
+
+        return len(self._data)
+
+    def create_item(self) -> TextGenerationRequest:
+        """
+        Create a new result request item from the data.
+
+        :return: A new result request.
+ :rtype: TextGenerationRequest + """ + logger.debug("Creating new request item from file data") + + try: + data = next(self._iterator) + except StopIteration: + self._iterator = iter(self._data) + data = next(self._iterator) + + token_count = len(self.tokenizer.tokenize(data)) + request = TextGenerationRequest(prompt=data, prompt_token_count=token_count) + logger.debug("Created new TextGenerationRequest: {}", request) + + return request diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..3fd24040d3e59a95a69a2b829552bbca83bc5338 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/request/transformers.py @@ -0,0 +1,103 @@ +from pathlib import Path +from typing import Optional, Union + +from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict +from loguru import logger +from transformers import PreTrainedTokenizer # type: ignore # noqa: PGH003 + +from guidellm.core.request import TextGenerationRequest +from guidellm.request.base import GenerationMode, RequestGenerator +from guidellm.utils import ( + load_transformers_dataset, + resolve_transformers_dataset_column, +) + +__all__ = ["TransformersDatasetRequestGenerator"] + + +class TransformersDatasetRequestGenerator(RequestGenerator): + """ + A request generator implementation for Hugging Face datasets. + + :param dataset: The name of the Hugging Face dataset to use or the path + to a local dataset. + :type dataset_name: str + :param split: The split of the dataset to use (e.g., 'train', 'test'). + :type split: str + :param column: The column/field to use for generating requests. + :type column: str + :param tokenizer: The tokenizer instance or the name/config to use + for tokenizing prompts. + :type tokenizer: Union[str, PreTrainedTokenizer] + :param mode: The generation mode, either 'async' or 'sync'. + :type mode: str + :param async_queue_size: The size of the request queue. + :type async_queue_size: int + """ + + def __init__( + self, + dataset: Union[ + str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset + ], + split: Optional[str] = None, + column: Optional[str] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + mode: GenerationMode = "async", + async_queue_size: int = 50, + **kwargs, + ): + self._dataset = dataset + self._split = split + self._column = column + self._kwargs = kwargs + + self._hf_dataset: Union[Dataset, IterableDataset] = load_transformers_dataset( + dataset, split=split, **kwargs + ) + self._hf_column = resolve_transformers_dataset_column( + self._hf_dataset, column=column + ) + self._hf_dataset_iterator = iter(self._hf_dataset) + + # NOTE: Must be after all the parameters since the queue population + # function requires attributes above + super().__init__( + type_="transformers_dataset", + source=str(dataset), + tokenizer=tokenizer, + mode=mode, + async_queue_size=async_queue_size, + ) + + def __len__(self) -> int: + if not isinstance(self._hf_dataset, Dataset): + raise ValueError("Can't get dataset size for IterableDataset object") + else: + return len(self._hf_dataset) + + def create_item(self) -> TextGenerationRequest: + """ + Create a new result request item from the dataset. + + :return: A new result request. 
+        :rtype: TextGenerationRequest
+        """
+
+        logger.debug("Creating new request item from dataset")
+
+        try:
+            data = next(self._hf_dataset_iterator)
+        except StopIteration:
+            self._hf_dataset_iterator = iter(self._hf_dataset)
+            data = next(self._hf_dataset_iterator)
+
+        prompt = data[self._hf_column]
+        token_count = len(self.tokenizer.tokenize(prompt))
+        request = TextGenerationRequest(
+            prompt=prompt,
+            prompt_token_count=token_count,
+        )
+        logger.debug("Created new TextGenerationRequest: {}", request)
+
+        return request
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3b4ac50c647ad1adea9ad4368ac58727c11bc19
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/__init__.py
@@ -0,0 +1,4 @@
+from .base import Scheduler, SchedulerResult
+from .load_generator import LoadGenerationMode, LoadGenerator
+
+__all__ = ["LoadGenerationMode", "LoadGenerator", "Scheduler", "SchedulerResult"]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..602166b01a88d5e9525f3277b2b39602e3a82fd6
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/base.py
@@ -0,0 +1,374 @@
+import asyncio
+import math
+import time
+from dataclasses import dataclass
+from typing import AsyncGenerator, Literal, Optional, Union, get_args
+
+from loguru import logger
+
+from guidellm.backend import Backend
+from guidellm.config import settings
+from guidellm.core import (
+    TextGenerationBenchmark,
+    TextGenerationError,
+    TextGenerationRequest,
+    TextGenerationResult,
+)
+from guidellm.request import RequestGenerator
+from guidellm.scheduler.load_generator import LoadGenerationMode, LoadGenerator
+
+__all__ = ["Scheduler", "SchedulerResult"]
+
+
+@dataclass
+class SchedulerResult:
+    """
+    Represents the result of a single task execution within the Scheduler.
+
+    :param completed: Indicates if the task is completed.
+    :type completed: bool
+    :param count_total: Total number of tasks to be executed.
+    :type count_total: int
+    :param count_completed: Number of tasks that have been completed so far.
+    :type count_completed: int
+    :param benchmark: Benchmark data for the task execution.
+    :type benchmark: TextGenerationBenchmark
+    :param current_result: The result of the current request, if any.
+    :type current_result: Optional[Union[TextGenerationResult, TextGenerationError]]
+    """
+
+    completed: bool
+    count_total: int
+    count_completed: int
+    benchmark: TextGenerationBenchmark
+    current_result: Optional[Union[TextGenerationResult, TextGenerationError]] = None
+
+
+class Scheduler:
+    """
+    Schedules and manages the execution of tasks for text generation requests.
+
+    :param generator: The request generator that produces text generation requests.
+    :type generator: RequestGenerator
+    :param worker: The backend worker that processes the requests.
+    :type worker: Backend
+    :param mode: The mode of load generation (e.g., synchronous, asynchronous).
+    :type mode: LoadGenerationMode
+    :param rate: The rate at which requests are generated, if applicable.
+ :type rate: Optional[float] + :param max_number: Maximum number of requests to be processed. + :type max_number: Optional[int] + :param max_duration: Maximum duration in seconds for which requests + should be processed. + :type max_duration: Optional[float] + + :raises ValueError: If neither max_number nor max_duration is specified or + if they are not positive. + """ + + def __init__( + self, + generator: RequestGenerator, + worker: Backend, + mode: LoadGenerationMode = "synchronous", + rate: Optional[float] = None, + max_number: Optional[int] = None, + max_duration: Optional[float] = None, + ): + logger.info( + "Scheduler initialized with params: generator={}, worker={}, mode={}, " + "rate={}, max_number={}, max_duration={}", + generator, + worker, + mode, + rate, + max_number, + max_duration, + ) + + if mode not in get_args(LoadGenerationMode): + err = ValueError( + f"{mode} is not a valid Load Generation Mode. " + f"Valid options are {get_args(LoadGenerationMode)}" + ) + logger.error(err) + raise err + + if not max_number and not max_duration: + err = ValueError("Either max_number or max_duration must be specified") + logger.error(err) + raise err + + if max_number and max_number <= 0: + err = ValueError(f"max_number must be > 0, given: {max_number}") + logger.error(err) + raise err + + if max_duration and max_duration <= 0: + err = ValueError(f"max_duration must be > 0, given: {max_duration}") + logger.error(err) + raise err + + if mode in ["constant", "poisson"] and not rate: + err = ValueError(f"Rate must be > 0 for mode: {mode}. Given: {rate}") + logger.error(err) + raise err + + self._generator = generator + self._worker = worker + self._mode = mode + self._rate = rate + self._max_number = max_number + self._max_duration = max_duration + + self._load_generator = LoadGenerator(mode, rate) + + @property + def generator(self) -> RequestGenerator: + """ + The request generator that produces text generation requests. + + :return: The request generator instance. + :rtype: RequestGenerator + """ + return self._generator + + @property + def worker(self) -> Backend: + """ + The backend worker that processes the requests. + + :return: The backend worker instance. + :rtype: Backend + """ + return self._worker + + @property + def mode(self) -> LoadGenerationMode: + """ + The mode of load generation (e.g., synchronous, asynchronous). + + :return: The load generation mode. + :rtype: LoadGenerationMode + """ + return self._mode + + @property + def rate(self) -> Optional[float]: + """ + The rate at which requests are generated, if applicable. + + :return: The rate of request generation. + :rtype: Optional[float] + """ + return self._rate + + @property + def max_number(self) -> Optional[int]: + """ + Maximum number of requests to be processed. + + :return: The maximum number of requests. + :rtype: Optional[int] + """ + return self._max_number + + @property + def max_duration(self) -> Optional[float]: + """ + Maximum duration in seconds for which requests should be processed. + + :return: The maximum duration in seconds. + :rtype: Optional[float] + """ + return self._max_duration + + @property + def load_generator(self) -> LoadGenerator: + """ + The load generator responsible for generating load based on mode and rate. + + :return: The load generator instance. + :rtype: LoadGenerator + """ + return self._load_generator + + @property + def benchmark_mode(self) -> Literal["asynchronous", "synchronous", "throughput"]: + """ + The report mode for the scheduler. + + :return: The report mode. 
+ :rtype: Literal["asynchronous", "synchronous", "throughput"] + """ + if self._mode == "synchronous": + return "synchronous" + + if self._mode == "throughput": + return "throughput" + + return "asynchronous" + + async def run(self) -> AsyncGenerator[SchedulerResult, None]: + """ + Run the scheduler to process requests based on the configured mode, rate, + maximum number, and maximum duration. + + :yield: The result of each task executed by the scheduler. + :rtype: Generator[SchedulerResult, None, None] + """ + logger.info("Starting Scheduler run") + + benchmark = TextGenerationBenchmark(mode=self.benchmark_mode, rate=self.rate) + start_time = time.time() + end_time = start_time + self.max_duration if self.max_duration else math.inf + max_number = float(self.max_number) if self.max_number else math.inf + runner = self._run_sync if self._mode == "synchronous" else self._run_async + count_total = ( + self.max_number + if self.max_number + else round(self.max_duration) + if self.max_duration + else 0 + ) + + # yield initial result for progress tracking + yield SchedulerResult( + completed=False, + count_total=count_total, + count_completed=0, + benchmark=benchmark, + ) + + run_count = 0 + async for res in runner(benchmark, end_time, max_number): + run_count += 1 + count_completed = ( + min(run_count, self.max_number) + if self.max_number + else round(time.time() - start_time) + if self.max_duration + else 0 + ) + + yield SchedulerResult( + completed=False, + count_total=count_total, + count_completed=count_completed, + benchmark=benchmark, + current_result=res, + ) + + logger.info("Scheduler run completed") + + yield SchedulerResult( + completed=True, + count_total=count_total, + count_completed=( + benchmark.request_count + benchmark.error_count + if self.max_number + else round(time.time() - start_time) + if self.max_duration + else 0 + ), + benchmark=benchmark, + ) + + async def _run_sync( + self, benchmark: TextGenerationBenchmark, end_time: float, max_number: float + ) -> AsyncGenerator[Union[TextGenerationResult, TextGenerationError], None]: + for index, (request, submit_at) in enumerate( + zip(self.generator, self.load_generator.times()) + ): + if index >= max_number or time.time() >= end_time: + break + + logger.debug( + "Running synchronous request={} at submit_at={}", + request, + submit_at, + ) + benchmark.request_started() + result = await self._submit_task_coroutine(request, submit_at, end_time) + if result is not None: + benchmark.request_completed(result) + logger.debug("Request completed with output: {}", result) + yield result + + async def _run_async( + self, benchmark: TextGenerationBenchmark, end_time: float, max_number: float + ) -> AsyncGenerator[Union[TextGenerationResult, TextGenerationError], None]: + tasks = [] + completed = 0 + + for index, (request, submit_at) in enumerate( + zip(self.generator, self.load_generator.times()) + ): + while (index + 1 - completed) >= settings.max_concurrency: + await asyncio.sleep(0.1) + + if index >= max_number or time.time() >= end_time or submit_at >= end_time: + break + + logger.debug( + "Running asynchronous request={} at submit_at={}", + request, + submit_at, + ) + + def _completed(_task: asyncio.Task) -> None: + nonlocal completed + completed += 1 + _res = _task.result() + + if _res: + benchmark.request_completed(_res) + logger.debug("Request completed: {}", _res) + + benchmark.request_started() + task = asyncio.create_task( + self._submit_task_coroutine(request, submit_at, end_time) + ) + task.add_done_callback(_completed) + 
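+            # NOTE: the done callback above increments `completed`, freeing a slot
+            # in the concurrency window checked at the top of this loop
+            # (index + 1 - completed >= settings.max_concurrency)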
tasks.append(task) + + # release control to the event loop for other tasks + await asyncio.sleep(0.001) + + for compl_task in asyncio.as_completed(tasks): + task_res = await compl_task + if task_res is not None: + yield task_res + + async def _submit_task_coroutine( + self, request: TextGenerationRequest, submit_at: float, end_time: float + ) -> Optional[Union[TextGenerationResult, TextGenerationError]]: + try: + if submit_at > end_time: + logger.info( + "Request {} submission time {} is greater than end time {}", + request, + submit_at, + end_time, + ) + raise asyncio.TimeoutError( + f"Request submission time {submit_at} " + f"is greater than end time {end_time}" + ) + + if submit_at > time.time(): + await asyncio.sleep(submit_at - time.time()) + + timeout = ( + end_time - time.time() if end_time and end_time < math.inf else None + ) + + return await asyncio.wait_for(self._worker.submit(request), timeout=timeout) + except asyncio.TimeoutError as exc: + logger.info("Request {} timed out: {}", request, exc) + + return None + except Exception as exc: # noqa: BLE001 + logger.warning("Request {} failed: {}", request, exc) + + return TextGenerationError(request=request, message=str(exc)) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..f629752ab2e1b961615ece6ba1e90f48274e89a2 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/scheduler/load_generator.py @@ -0,0 +1,196 @@ +import time +from typing import Generator, Literal, Optional, get_args + +import numpy as np +from loguru import logger + +__all__ = ["LoadGenerationMode", "LoadGenerator"] + +LoadGenerationMode = Literal["synchronous", "constant", "poisson", "throughput"] + + +class LoadGenerator: + """ + Load Generator class that generates timestamps for load generation. + + This class supports multiple load generation modes: "constant", "poisson", + "throughput", and "synchronous". Each mode has its own method for generating + timestamps based on the rate provided during initialization. + + :param mode: The mode of load generation. Valid options are "constant", + "poisson", "throughput", and "synchronous". + :type mode: LoadGenerationMode + :param rate: The rate at which to generate timestamps. This value is + interpreted differently depending on the mode. + :type rate: float + + :raises ValueError: If an invalid mode is provided. + """ + + def __init__(self, mode: LoadGenerationMode, rate: Optional[float] = None): + """ + Initialize the Load Generator with the mode and rate. + + :param mode: The mode of load generation ("constant", "poisson", "throughput", + or "synchronous"). + :type mode: LoadGenerationMode + :param rate: The rate at which to generate timestamps. In the "constant" + mode, this represents the frequency of events. In the "poisson" mode, + it represents the average frequency. + :type rate: Optional[float] + """ + if mode not in get_args(LoadGenerationMode): + error = ValueError( + f"{mode} is not a valid Load Generation Mode. " + f"Valid options are {get_args(LoadGenerationMode)}" + ) + logger.error(error) + raise error + + if mode not in ["synchronous", "throughput"] and (rate is None or rate <= 0): + error = ValueError(f"Rate must be > 0 for mode: {mode}. 
Given: {rate}")
+            logger.error(error)
+            raise error
+
+        self._mode = mode
+        self._rate = rate
+        logger.debug(
+            "Initialized LoadGenerator with mode: {mode}, rate: {rate}",
+            mode=mode,
+            rate=rate,
+        )
+
+    @property
+    def mode(self) -> LoadGenerationMode:
+        """
+        Get the mode of load generation.
+
+        :return: The mode of load generation.
+        :rtype: LoadGenerationMode
+        """
+        return self._mode
+
+    @property
+    def rate(self) -> Optional[float]:
+        """
+        Get the rate of load generation.
+
+        :return: The rate of load generation.
+        :rtype: Optional[float]
+        """
+        return self._rate
+
+    def times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps for load generation based on the selected mode.
+
+        :return: A generator that yields timestamps at which each load
+            should be initiated.
+        :rtype: Generator[float, None, None]
+
+        :raises ValueError: If the mode is invalid.
+        """
+        logger.debug(f"Generating timestamps using mode: {self._mode}")
+
+        if self._mode == "throughput":
+            yield from self.throughput_times()
+        elif self._mode == "constant":
+            yield from self.constant_times()
+        elif self._mode == "poisson":
+            yield from self.poisson_times()
+        elif self._mode == "synchronous":
+            yield from self.synchronous_times()
+        else:
+            logger.error(f"Invalid mode encountered: {self._mode}")
+            raise ValueError(f"Invalid mode: {self._mode}")
+
+    def synchronous_times(self) -> Generator[float, None, None]:
+        """
+        Generate invalid timestamps for the "synchronous" mode.
+
+        :return: A generator that yields a constant invalid timestamp (-1.0).
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating invalid timestamps for synchronous mode")
+        while True:
+            yield -1.0
+
+    def throughput_times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps at the maximum rate possible, returning the current time.
+
+        :return: A generator that yields the current time in seconds.
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating timestamps at throughput rate")
+        while True:
+            yield time.time()
+
+    def constant_times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps at a constant rate based on the specified rate.
+
+        :return: A generator that yields timestamps incremented by 1/rate seconds.
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating constant rate timestamps with rate: {}", self._rate)
+
+        if self._rate is None or self._rate <= 0:
+            raise ValueError(
+                f"Rate must be > 0 for constant mode, given: {self._rate}"
+            )
+
+        start_time = time.time()
+        time_increment = 1.0 / self._rate
+        counter = 0
+
+        while True:
+            yield_time = start_time + time_increment * counter
+            logger.debug(f"Yielding timestamp: {yield_time}")
+            yield yield_time
+            counter += 1
+
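+    # NOTE: the Poisson mode below works in one-second windows: it draws the
+    # number of requests for the window from Poisson(rate), spaces them with
+    # exponentially distributed gaps, and drops any arrivals that spill past
+    # the window boundary before advancing to the next window (a window that
+    # draws zero still yields a single timestamp at its end).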
+    def poisson_times(self) -> Generator[float, None, None]:
+        """
+        Generate timestamps based on a Poisson process, where the number
+        of requests to be sent per second is drawn from a Poisson distribution.
+        The inter-arrival time between requests is exponentially distributed.
+
+        :return: A generator that yields timestamps based on a Poisson distribution.
+        :rtype: Generator[float, None, None]
+        """
+        logger.debug("Generating Poisson rate timestamps with rate: {}", self._rate)
+
+        if self._rate is None or self._rate <= 0:
+            raise ValueError(f"Rate must be > 0 for poisson mode, given: {self._rate}")
+
+        time_tracker = time.time()
+        rng = np.random.default_rng()
+        time_increment = 1.0
+
+        while True:
+            num_requests = rng.poisson(self._rate)
+
+            if num_requests == 0:
+                yield time_tracker + time_increment
+            else:
+                inter_arrival_times = rng.exponential(1.0 / self._rate, num_requests)
+                logger.debug(
+                    "Calculated new inter-arrival times for poisson process: {}",
+                    inter_arrival_times,
+                )
+                arrival_time_tracker = time_tracker
+
+                for arrival_time in inter_arrival_times:
+                    arrival_time_tracker += arrival_time
+
+                    if arrival_time_tracker > time_tracker + time_increment:
+                        logger.debug(
+                            "Arrival time tracker: {} is greater than current time",
+                            arrival_time_tracker,
+                        )
+                        break
+
+                    yield arrival_time_tracker
+
+            time_tracker += time_increment  # Move on to the next time period
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb4931bdabcddce94d443a809876950424c803f5
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/__init__.py
@@ -0,0 +1,43 @@
+from .images import ImageDescriptor, load_images
+from .injector import create_report, inject_data
+from .progress import BenchmarkReportProgress
+from .text import (
+    clean_text,
+    filter_text,
+    is_path,
+    is_path_like,
+    is_url,
+    load_text,
+    load_text_lines,
+    parse_text_objects,
+    split_lines_by_punctuation,
+    split_text,
+)
+from .transformers import (
+    load_transformers_dataset,
+    resolve_transformers_dataset,
+    resolve_transformers_dataset_column,
+    resolve_transformers_dataset_split,
+)
+
+__all__ = [
+    "BenchmarkReportProgress",
+    "clean_text",
+    "create_report",
+    "filter_text",
+    "inject_data",
+    "is_path",
+    "is_path_like",
+    "is_url",
+    "load_text",
+    "load_text_lines",
+    "load_transformers_dataset",
+    "parse_text_objects",
+    "resolve_transformers_dataset",
+    "resolve_transformers_dataset_column",
+    "resolve_transformers_dataset_split",
+    "split_lines_by_punctuation",
+    "split_text",
+    "ImageDescriptor",
+    "load_images",
+]
diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e8800d2abf8df387de691bda21073c643f9129b
--- /dev/null
+++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/cli_params.py
@@ -0,0 +1,34 @@
+"""
+This module includes custom CLI parameters for the `click` package.
+"""
+
+from typing import Any, Optional
+
+from click import Context, Parameter, ParamType
+
+__all__ = ["MAX_REQUESTS"]
+
+
+class MaxRequestsType(ParamType):
+    """
+    Catch the `dataset` string parameter to determine the behavior of the Scheduler.
+ """ + + name = "max_requests" + + def convert( + self, value: Any, param: Optional[Parameter], ctx: Optional[Context] + ) -> Any: + if isinstance(value, int): + return value + + try: + return int(value) + except ValueError: + if value == "dataset": + return value + else: + self.fail(f"{value} is not a valid integer or 'dataset'", param, ctx) + + +MAX_REQUESTS = MaxRequestsType() diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py new file mode 100644 index 0000000000000000000000000000000000000000..fb66d4321309c39f92b6e1cf3ce737f7bf5c2f4c --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/images.py @@ -0,0 +1,80 @@ +from io import BytesIO +from typing import List, Optional, Tuple +from urllib.parse import urljoin + +import requests +from bs4 import BeautifulSoup +from loguru import logger +from PIL import Image +from pydantic import ConfigDict, Field, computed_field + +from guidellm.config import settings +from guidellm.core.serializable import Serializable + +__all__ = ["load_images", "ImageDescriptor"] + +class ImageDescriptor(Serializable): + """ + A class to represent image data in serializable format. + """ + model_config = ConfigDict(arbitrary_types_allowed=True) + + url: Optional[str] = Field(description="url address for image.") + image: Image.Image = Field(description="PIL image", exclude=True) + filename: Optional[int] = Field( + default=None, + description="Image filename.", + ) + + @computed_field # type: ignore[misc] + @property + def image_resolution(self) -> Tuple[int, int]: + if self.image is None: + return None + else: + return self.image.size + + +def load_images(data: str, image_resolution: Optional[List[int]]) -> List[ImageDescriptor]: + """ + Load an HTML file from a path or URL + + :param data: the path or URL to load the HTML file from + :type data: Union[str, Path] + :return: Descriptor containing image url and the data in PIL.Image.Image format + :rtype: ImageDescriptor + """ + + images = [] + if not data: + return None + if isinstance(data, str) and data.startswith("http"): + response = requests.get(data, timeout=settings.request_timeout) + response.raise_for_status() + + soup = BeautifulSoup(response.text, "html.parser") + for img_tag in soup.find_all("img"): + img_url = img_tag.get("src") + + if img_url: + # Handle relative URLs + img_url = urljoin(data, img_url) + + # Download the image + logger.debug("Loading image: {}", img_url) + img_response = requests.get(img_url) + img_response.raise_for_status() + image = Image.open(BytesIO(img_response.content)) + + if image_resolution is not None: + image = image.resize(image_resolution) + + # Load image into Pillow + images.append( + ImageDescriptor( + url=img_url, + image=image, + ) + ) + + return images diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5216aa65fe83328015af1517e049fadd344677 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/injector.py @@ -0,0 +1,70 @@ +from pathlib import Path +from typing import Union + +from pydantic import BaseModel + +from guidellm.config import settings +from guidellm.utils.text import load_text + 
+__all__ = ["create_report", "inject_data"] + + +def create_report(model: BaseModel, output_path: Union[str, Path]) -> Path: + """ + Creates a report from the model and saves it to the output path. + + :param model: the model to serialize and inject + :type model: BaseModel + :param output_path: the path, either a file or a directory, + to save the report to. If a directory, the report will be saved + as "report.html" inside of the directory. + :type output_path: str + :return: the path to the saved report + :rtype: str + """ + if not isinstance(output_path, Path): + output_path = Path(output_path) + + html_content = load_text(settings.report_generation.source) + report_content = inject_data( + model, + html_content, + settings.report_generation.report_html_match, + settings.report_generation.report_html_placeholder, + ) + + if not output_path.suffix: + # assume directory, save as report.html + output_path = output_path / "report.html" + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(report_content) + + return output_path + + +def inject_data( + model: BaseModel, + html: str, + match: str, + placeholder: str, +) -> str: + """ + Injects the data from the model into the HTML while replacing the placeholder. + + :param model: the model to serialize and inject + :type model: BaseModel + :param html: the html to inject the data into + :type html: str + :param match: the string to match in the html to find the placeholder + :type match: str + :param placeholder: the placeholder to replace with the model data + inside of the placeholder + :type placeholder: str + :return: the html with the model data injected + :rtype: str + """ + model_str = model.json() + inject_str = match.replace(placeholder, model_str) + + return html.replace(match, inject_str) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py new file mode 100644 index 0000000000000000000000000000000000000000..a1e1e7987e2aaa226e5c38dfbf9a9445aac60b43 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/progress.py @@ -0,0 +1,199 @@ +from datetime import datetime +from typing import List + +from loguru import logger +from rich.console import Group +from rich.live import Live +from rich.panel import Panel +from rich.progress import ( + BarColumn, + Progress, + SpinnerColumn, + TaskID, + TaskProgressColumn, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) + +__all__ = ["BenchmarkReportProgress"] + + +class BenchmarkReportProgress: + """ + Manages the progress display for benchmarks and report generation using Rich. + + This class provides a visual representation of the benchmarking process + and report generation using Rich's progress bars and panels. + """ + + def __init__(self): + """ + Initialize the BenchmarkReportProgress with default settings. + + This method sets up the progress displays for both individual benchmarks + and the overall report, as well as initializing internal task management + structures. 
+ """ + logger.info("Initializing BenchmarkReportProgress instance") + + self.benchmarks_progress = Progress( + TextColumn("[{task.fields[start_time_str]}]"), + SpinnerColumn(), + TaskProgressColumn(), + TextColumn("{task.description}"), + TextColumn(" "), + TextColumn( + "[bold cyan]({task.fields[req_per_sec]} req/sec avg)[/bold cyan]" + ), + ) + self.benchmarks_panel = Panel( + self.benchmarks_progress, + title="Benchmarks", + title_align="left", + expand=True, + ) + self.report_progress = Progress( + SpinnerColumn(), + TextColumn("Generating report..."), + BarColumn(bar_width=None), + TextColumn( + "({task.fields[completed_benchmarks]}/{task.fields[total_benchmarks]})" + ), + TextColumn("["), + TimeElapsedColumn(), + TextColumn("<"), + TimeRemainingColumn(), + TextColumn("]"), + ) + self.render_group = Group(self.benchmarks_panel, self.report_progress) + self.live = Live(self.render_group, redirect_stdout=True, redirect_stderr=True) + + self.report_task: TaskID = None # type: ignore # noqa: PGH003 + self.benchmark_tasks: List[TaskID] = [] + self.benchmark_tasks_started: List[bool] = [] + self.benchmark_tasks_completed: List[bool] = [] + self.benchmark_tasks_progress: List[float] = [] + + def start(self, task_descriptions: List[str]) -> None: + """ + Starts the live progress display and initializes benchmark tasks. + + :param task_descriptions: List of descriptions for each benchmark task. + :type task_descriptions: List[str] + """ + logger.info( + "Starting BenchmarkReportProgress with task descriptions: {}", + task_descriptions, + ) + self.live.start() + + for task_description in task_descriptions: + logger.debug("Adding task with description: {}", task_description) + task_id = self.benchmarks_progress.add_task( + task_description, + start=False, + total=None, + start_time_str="--:--:--", + req_per_sec="#.##", + ) + self.benchmark_tasks.append(task_id) + self.benchmark_tasks_started.append(False) + self.benchmark_tasks_completed.append(False) + self.benchmark_tasks_progress.append(0) + + self.report_task = self.report_progress.add_task( + "", + total=len(self.benchmark_tasks) * 100, # 100 points per report + completed_benchmarks=0, + total_benchmarks=len(task_descriptions), + ) + logger.info("Initialized {} benchmark tasks", len(task_descriptions)) + + def update_benchmark( + self, + index: int, + description: str, + completed: bool, + completed_count: int, + completed_total: int, + start_time: float, + req_per_sec: float, + ) -> None: + """ + Updates the progress of a specific benchmark task. + + :param index: Index of the benchmark task to update. + :type index: int + :param description: Description of the current benchmark task. + :type description: str + :param completed: Flag indicating if the benchmark is completed. + :type completed: bool + :param completed_count: Number of completed operations for the task. + :type completed_count: int + :param completed_total: Total number of operations for the task. + :type completed_total: int + :param start_time: Start time of the benchmark in timestamp format. + :type start_time: float + :param req_per_sec: Average requests per second. + :type req_per_sec: float + :raises ValueError: If trying to update a completed benchmark. 
+ """ + + if self.benchmark_tasks_completed[index]: + err = ValueError(f"Benchmark {index} already completed") + logger.error("Error updating benchmark: {}", err) + raise err + + if not self.benchmark_tasks_started[index]: + self.benchmark_tasks_started[index] = True + self.benchmarks_progress.start_task(self.benchmark_tasks[index]) + logger.info("Starting benchmark task at index {}", index) + + if completed: + self.benchmark_tasks_completed[index] = True + self.benchmark_tasks_progress[index] = 100 + self.benchmarks_progress.stop_task(self.benchmark_tasks[index]) + logger.info("Completed benchmark task at index {}", index) + + self.benchmark_tasks_progress[index] = completed_count / completed_total * 100 + self.benchmarks_progress.update( + self.benchmark_tasks[index], + description=description, + total=completed_total, + completed=completed_count if not completed else completed_total, + req_per_sec=(f"{req_per_sec:.2f}" if req_per_sec else "#.##"), + start_time_str=( + datetime.fromtimestamp(start_time).strftime("%H:%M:%S") + if start_time + else "--:--:--" + ), + ) + logger.debug( + "Updated benchmark task at index {}: {}% complete", + index, + self.benchmark_tasks_progress[index], + ) + self.report_progress.update( + self.report_task, + total=len(self.benchmark_tasks) * 100, + completed=sum(self.benchmark_tasks_progress), + completed_benchmarks=sum(self.benchmark_tasks_completed), + total_benchmarks=len(self.benchmark_tasks), + ) + + def finish(self) -> None: + """ + Marks the overall report task as finished and stops the live display. + """ + logger.info("Finishing BenchmarkReportProgress") + self.report_progress.update( + self.report_task, + total=len(self.benchmark_tasks) * 100, + completed=len(self.benchmark_tasks) * 100, + completed_benchmarks=len(self.benchmark_tasks), + total_benchmarks=len(self.benchmark_tasks), + ) + self.report_progress.stop_task(self.report_task) + self.live.stop() + logger.info("BenchmarkReportProgress finished and live display stopped") diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c5038c2e8235f02acca0503d773dbce9814e76 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/text.py @@ -0,0 +1,455 @@ +import csv +import json +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse + +import ftfy +import requests +import yaml +from loguru import logger + +from guidellm.config import settings + +__all__ = [ + "clean_text", + "filter_text", + "is_path", + "is_path_like", + "is_url", + "load_text", + "load_text_lines", + "parse_text_objects", + "split_lines_by_punctuation", + "split_text", +] + + +NAME_TITLES = [ + "Mr.", + "Mrs.", + "Ms.", + "Dr.", + "Prof.", + "Jr.", + "Sr.", + "St.", + "Lt.", + "Col.", + "Gen.", + "Rep.", + "Sen.", + "Gov.", + "Pres.", +] +SENTENCE_REGEX = r'[^.!?]*[.!?]["\']?\s*(?=[A-Z])' +MAX_EXTENSION_LENGTH = 8 +MAX_PATH_LENGTH = 4096 +EXTENSION_TYPES = { + "csv": "csv", + "jsonl": "jsonl", + "json": "json", + "yaml": "yaml", + "yml": "yaml", + "txt": "txt", + "text": "txt", +} + + +def filter_text( + text: str, + filter_start: Optional[Union[str, int]] = None, + filter_end: Optional[Union[str, int]] = None, +) -> str: + """ + Filter text by start and end strings or indices + + :param text: the 
text to filter + :param filter_start: the start string or index to filter from + :param filter_end: the end string or index to filter to + :return: the filtered text + """ + filter_start_index = -1 + filter_end_index = -1 + + if filter_start and isinstance(filter_start, str): + filter_start_index = text.index(filter_start) + elif filter_start: + if not isinstance(filter_start, int): + raise ValueError(f"Invalid filter start index: {filter_start}") + filter_start_index = filter_start + + if filter_end and isinstance(filter_end, str): + filter_end_index = text.index(filter_end) + elif filter_end: + if not isinstance(filter_end, int): + raise ValueError(f"Invalid filter end index: {filter_end}") + filter_end_index = filter_end + + if filter_start_index > -1: + text = text[filter_start_index:] + if filter_end_index > -1: + text = text[:filter_end_index] + + return text + + +def clean_text( + text: str, + fix_encoding: bool = True, + clean_whitespace: bool = False, + remove_empty_lines: bool = False, + force_new_line_punctuation: bool = False, +) -> str: + """ + Clean text by fixing encoding, cleaning whitespace, removing empty lines, + and forcing new line punctuation + + :param text: the text to clean + :param fix_encoding: True to fix the encoding of the text, False to leave as is + :param clean_whitespace: True to clean the whitespace in the text + (remove extra spaces, tabs, etc), False to leave as is + :param remove_empty_lines: True to remove empty lines from the text + (lines with only whitespace), False to leave as is + :param force_new_line_punctuation: True to force new lines at punctuation + (line ends in a period, exclamation point, or question mark), + False to leave as is + :return: The cleaned text + """ + + if fix_encoding: + text = ftfy.fix_text(text) + + if clean_whitespace: + text = "\n".join( + [re.sub(r"\s+", " ", line).strip() for line in text.splitlines()] + ) + + if remove_empty_lines: + text = "\n".join([line for line in text.splitlines() if line.strip()]) + + if force_new_line_punctuation: + # first remove any existing new lines + text = " ".join(line for line in text.splitlines() if line.strip()) + lines = split_lines_by_punctuation(text) + text = "\n".join(lines) + + return text + + +def split_lines_by_punctuation(text: str) -> List[str]: + """ + Split text into lines based on punctuation + + :param text: the text to split + :return: the list of lines + """ + + lines = [] + current_line = "" + skip_next = False + + for index, char in enumerate(text): + if skip_next: + skip_next = False + continue + + current_line += char + + if char not in [".", "!", "?"]: + # must match end of sentence punctuation + continue + + # if this is the character for a title, don't split + if any(current_line.endswith(title) for title in NAME_TITLES): + continue + + char_next_1 = text[index + 1] if index + 1 < len(text) else None + char_next_2 = text[index + 2] if index + 2 < len(text) else None + char_next_3 = text[index + 3] if index + 3 < len(text) else None + + next_is_space = char_next_1 and char_next_1.isspace() + next_is_quote_and_space = char_next_1 in ["'", '"'] and char_next_2 == " " + + # next character must be a space or a quote, otherwise skip + if not next_is_space and not next_is_quote_and_space: + continue + + # after this, next character must be an upper case letter + upper_char = char_next_3 if next_is_quote_and_space else char_next_2 + next_is_upper = upper_char and ( + upper_char.isupper() or upper_char in ["'", '"'] + ) + + if not next_is_upper: + continue + + # if 
next char is a quote, add it and skip next + if next_is_quote_and_space: + current_line += text[index + 1] + skip_next = True + + lines.append(current_line.strip()) + current_line = "" + + if current_line: + lines.append(current_line.strip()) + + return lines + + +def is_url(url: str) -> bool: + """ + Check if a string is a URL + + :param url: the string to check + :return: True if the string is a URL, False if not + """ + try: + result = urlparse(url) + return all([result.scheme, result.netloc]) + except Exception: # noqa: BLE001 + return False + + +def is_path(path: Any) -> bool: + """ + Check if a string is a path + + :param path: the string to check + :return: True if the string is a path, False if not + """ + if not isinstance(path, (str, Path)): + return False + + if isinstance(path, str): + path = Path(path) + + return path.exists() + + +def is_path_like(path: Any, enforce_file: bool = False) -> bool: + """ + Check if a string has a path like structure where it doesn't need to exist + + :param path: the string to check + :param enforce_file: True if the path should be a file, False if not + :return: True if the string is path like, False if not + """ + # if path isn't a str or Path, it's not a path + if not isinstance(path, (str, Path)): + return False + + if isinstance(path, Path): + path = str(path) + + # if text is too long, it's not a path (4096 for most linux setups) + if len(path) > MAX_PATH_LENGTH: + return False + + # if it starts with a URL scheme, it's not a path + if path.startswith(("http", "ftp")): + return False + + test_path = Path(path) + + # if it's supposed to be a file and there's no extension or + # the extension is too long, it's not a path + return not enforce_file or ( + bool(test_path.suffix) and len(test_path.suffix) <= MAX_EXTENSION_LENGTH + ) + + +def split_text(text: str) -> Tuple[List[str], List[str], List[int]]: + """ + Split text into words / tokens, the white space separators between words, + and the indices for each new line + + :param text: the text to split + :return: the words, the white space separators, and the new line indices + """ + if not text or not text.strip(): + return [], [], [] + + text = text.strip() + tokens = [] # type: List[str] + separators = [] # type: List[str] + new_lines = [0] + buffer = text[0] + is_token = not text[0].isspace() + + for char in text[1:]: + char_whitespace = char.isspace() + + if char == "\n": + new_lines.append(len(tokens) + 1) + + if char_whitespace and is_token: + tokens.append(buffer) + buffer = char + is_token = False + elif char_whitespace: + buffer += char + elif not char_whitespace and not is_token: + separators.append(buffer) + buffer = char + is_token = True + else: + buffer += char + + if buffer and is_token: + tokens.append(buffer) + separators.append(" ") + elif buffer: + separators.append(buffer) + + return tokens, separators, new_lines + + +def load_text(data: Union[str, Path], encoding: Optional[str] = None) -> str: + """ + Load an HTML file from a path or URL + + :param data: the path or URL to load the HTML file from + :type data: Union[str, Path] + :param encoding: the encoding to use when reading the file + :type encoding: str + :return: the HTML content + :rtype: str + """ + logger.debug("Loading text: {}", data) + + if not data: + return "" + + # check URLs + if isinstance(data, str) and data.startswith("http"): + response = requests.get(data, timeout=settings.request_timeout) + response.raise_for_status() + return response.text + + # check raw text + if isinstance(data, str) and not 
is_path_like(data, enforce_file=True): + return data + + # assume local file + if not isinstance(data, Path): + data = Path(data) + + if not data.exists(): + raise FileNotFoundError(f"File not found: {data}") + + if not data.is_file(): + raise IsADirectoryError(f"Path is a directory: {data}") + + return data.read_text(encoding=encoding) + + +def parse_text_objects(data: str, format_: str = "txt") -> List[Dict]: + """ + Parse text data into a list of dictionaries based on the format given + (csv, jsonl, json, yaml, txt). + + :param data: the text data to parse + :param format_: the format of the data to parse: + 'csv', 'jsonl', 'json', 'yaml', 'txt' + :return: the list of dictionaries parsed from the data, if text + then each line is a dictionary with a single key 'text' + """ + if not isinstance(data, str): + raise ValueError(f"Unsupported data given of type: {type(data)}") + + if format_ == "csv": + reader = csv.DictReader(data.splitlines()) + columns = reader.fieldnames + return [{col: row[col] for col in columns} for row in reader] # type: ignore # noqa: PGH003 + + if format_ == "jsonl": + return [json.loads(line) for line in data.splitlines() if line] + + if format_ in ("json", "yaml"): + data = json.loads(data) if format_ == "json" else yaml.safe_load(data) + + if not data: + return [] + + if isinstance(data, dict) and len(data) == 1: + logger.debug("Getting first value from JSON/YAML object: {}", data) + data = list(data.values())[0] + elif isinstance(data, dict): + logger.debug("Converting JSON/YAML object to list: {}", data) + data = list(data.values()) + + if not isinstance(data, list) or not isinstance(data[0], dict): + raise ValueError(f"Unsupported data structure given: {data}") + + return data + + if format_ == "txt": + return [{"text": line} for line in data.splitlines() if line] + + raise ValueError(f"Unsupported format given: {format_}") + + +def load_text_lines( + data: Union[str, Path, List[Dict]], + format_: Optional[str] = None, + filters: Optional[List[str]] = None, + encoding: Optional[str] = None, +) -> List[str]: + """ + Load text lines from a file or data object with optional filtering and formatting. + + + :param data: the data to load the text lines from + :param format_: the format of the data to load, if not provided will be inferred. + Supported formats: 'csv', 'jsonl', 'json', 'yaml', 'txt' + :param filters: the keys to filter the data by when loading in order of preference. + If not provided, will use the first key in the data object. + :param encoding: the encoding to use when reading the file + :return: the list of text lines + """ + logger.debug( + "Loading text lines with format {}, filters {}, encoding {} for data: {}", + format_, + filters, + encoding, + data, + ) + + if not data: + return [] + + if not format_ and isinstance(data, (str, Path)) and "." 
in str(data): + extension = str(data).split(".")[-1] + format_ = EXTENSION_TYPES.get(extension, "txt") + elif not format_: + format_ = "txt" + + # load the data if it's a path or URL + if isinstance(data, (Path, str)): + data = load_text(data, encoding=encoding) + data = clean_text(data) + + # parse the data into a list of dictionaries based on the format + if isinstance(data, str): + data = parse_text_objects(data, format_) + + if not isinstance(data, list): + raise ValueError(f"Unsupported data given of type: {type(data)}") + + if not isinstance(data[0], dict): + raise ValueError(f"Unsupported data item type given: {type(data[0])}") + + # grab the first available filter key to use if preference order as provided + filter_ = list(data[0].keys())[0] + for filt in filters or []: + if filt not in data[0]: + continue + + filter_ = filt + break + + # extract the lines from the data + return [row[filter_] for row in data] if filter_ else [str(row) for row in data] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..540572994eb692ddcaeced0055feb6a1c932f7f2 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/src/guidellm/utils/transformers.py @@ -0,0 +1,151 @@ +from pathlib import Path +from typing import List, Optional, Union + +from datasets import ( # type: ignore # noqa: PGH003 + Dataset, + DatasetDict, + IterableDataset, + IterableDatasetDict, + load_dataset, +) +from loguru import logger + +from guidellm.config import settings + +__all__ = [ + "load_transformers_dataset", + "resolve_transformers_dataset", + "resolve_transformers_dataset_column", + "resolve_transformers_dataset_split", +] + + +def load_transformers_dataset( + dataset: Union[ + str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset + ], + split: Optional[str] = None, + preferred_splits: Optional[List[str]] = settings.dataset.preferred_data_splits, + **kwargs, +) -> Union[Dataset, IterableDataset]: + """ + Load a dataset from a file or a script and resolve the preferred split. + + :param dataset: the dataset file or script to load + :param split: the dataset split to use + (overrides preferred_splits, must be in dataset) + :param preferred_splits: the preferred dataset splits to use + :param kwargs: additional keyword arguments to pass to the dataset loader + :return: the loaded dataset + """ + dataset = resolve_transformers_dataset(dataset, **kwargs) + + return resolve_transformers_dataset_split(dataset, split, preferred_splits) + + +def resolve_transformers_dataset( + dataset: Union[ + str, Path, DatasetDict, Dataset, IterableDatasetDict, IterableDataset + ], + **kwargs, +) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]: + """ + Resolve the dataset from a file (csv, json, script) or a dataset name. 
+ + :param dataset: the dataset file or script to load + :param kwargs: additional keyword arguments to pass to the dataset loader + :return: the loaded dataset + """ + if isinstance( + dataset, (DatasetDict, Dataset, IterableDatasetDict, IterableDataset) + ): + return dataset + + if not isinstance(dataset, (str, Path)): + raise ValueError(f"Invalid dataset type: {type(dataset)}") + + dataset = str(dataset) + + if dataset.endswith((".csv", ".json")): + logger.debug("Loading dataset from local path: {}", dataset) + extension = dataset.split(".")[-1] + + return load_dataset(extension, data_files=dataset, **kwargs) + + if dataset.endswith(".py"): + logger.debug("Loading dataset from local script: {}", dataset) + + return load_dataset(dataset, **kwargs) + + logger.debug("Loading dataset: {}", dataset) + + return load_dataset(dataset, **kwargs) + + +def resolve_transformers_dataset_split( + dataset: Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset], + split: Optional[str] = None, + preferred_splits: Optional[List[str]] = settings.dataset.preferred_data_splits, +) -> Union[Dataset, IterableDataset]: + """ + Resolve the preferred split from a dataset dictionary. + + :param dataset: the dataset to resolve the split from + :param split: the dataset split to use + (overrides preferred_splits, must be in dataset) + :param preferred_splits: the preferred dataset splits to use + :return: the resolved dataset split + """ + if not isinstance(dataset, (DatasetDict, IterableDatasetDict)): + logger.debug("Dataset is not a dictionary, using default split") + return dataset + + if split: + if split not in dataset: + raise ValueError(f"Split '{split}' not found in dataset") + + return dataset[split] + + if preferred_splits: + for spl in preferred_splits: + if spl not in dataset: + continue + return dataset[spl] + + return list(dataset.values())[0] + + +def resolve_transformers_dataset_column( + dataset: Union[Dataset, IterableDataset], + column: Optional[str] = None, + preferred_columns: Optional[List[str]] = settings.dataset.preferred_data_columns, +) -> str: + """ + Resolve the preferred column from a dataset. 
+ + :param dataset: the dataset to resolve the column from + :param column: the dataset column to use + (overrides preferred_columns, must be in dataset) + :param preferred_columns: the preferred dataset columns to use + :return: the resolved dataset column + """ + column_names = dataset.column_names + + if not column_names: + # grab from the first item + first_item = next(iter(dataset)) + column_names = list(first_item.keys()) + + if column: + if column not in column_names: + raise ValueError(f"Column '{column}' not found in dataset") + + return column + + if preferred_columns: + for col in preferred_columns: + if col not in column_names: + continue + return col + + return list(column_names)[0] diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/__init__.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py new file mode 100644 index 0000000000000000000000000000000000000000..74000dd8d3acdde3a539c1efb01f1de9b640f9db --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/guidellm/utils/inject_build_props.py @@ -0,0 +1,79 @@ +import os +import re +from datetime import datetime +from pathlib import Path + +import toml +from loguru import logger + + +def get_build_type(): + return os.getenv("GUIDELLM_BUILD_TYPE", "dev") + + +def get_build_number(): + return os.getenv("GUIDELLM_BUILD_NUMBER", "0") + + +def construct_project_name_and_version(build_type, build_number, current_version): + if not re.match(r"^\d+\.\d+\.\d+$", current_version): + raise ValueError( + f"Version '{current_version}' does not match the " + f"semantic versioning pattern '#.#.#'", + ) + + if build_type == "dev": + project_name = "guidellm_dev" + version = f"{current_version}.dev{build_number}" + elif build_type == "nightly": + project_name = "guidellm_nightly" + date_str = datetime.now().strftime("%Y%m%d") + version = f"{current_version}.{date_str}" + elif build_type == "release": + project_name = "guidellm" + version = current_version + else: + raise ValueError(f"Unknown build type: {build_type}") + + return project_name, version + + +def update_pyproject_toml(project_name, version): + try: + with Path("pyproject.toml").open() as file: + data = toml.load(file) + + data["project"]["name"] = project_name + data["project"]["version"] = version + + with Path("pyproject.toml").open("w") as file: + toml.dump(data, file) + + logger.info( + f"Updated project name to: {project_name} and version to: {version}", + ) + except (FileNotFoundError, toml.TomlDecodeError) as e: + logger.error(f"Error reading or writing pyproject.toml: {e}") + raise + + +def main(): + build_type = get_build_type() + build_number = get_build_number() + + with Path("pyproject.toml").open() as file: + pyproject_data = toml.load(file) + + current_version = pyproject_data["project"]["version"] + project_name, version = construct_project_name_and_version( + build_type, + build_number, + current_version, + ) + + if build_type != "release": + update_pyproject_toml(project_name, version) + + +if __name__ == "__main__": + main() diff --git a/models/multimodal/vision_language_model/step3/vllm/README.md b/models/multimodal/vision_language_model/step3/vllm/README.md index 
ce1df9df1d573642f834fbb7e3a0c1732d34e627..11266ddc300f18172ed7b6027ac787e5fce02820 100644 --- a/models/multimodal/vision_language_model/step3/vllm/README.md +++ b/models/multimodal/vision_language_model/step3/vllm/README.md @@ -9,6 +9,7 @@ Step3 is cutting-edge multimodal reasoning model—built on a Mixture-of-Experts | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | | :----: | :----: | :----: | | MR-V100 | dev-only | 25.12 | +| MR-V100 | 4.4.0 | 26.03 | ## Model Preparation @@ -33,6 +34,51 @@ pip3 install -r requirements.txt ## Model Inference +### Inference with W4A8 + +#### Performance Test + +1. Set environment variables: +```bash +export VLLM_W8A8_MOE_USE_W4A8=1 +export VLLM_ENFORCE_CUDA_GRAPH=1 +``` + +2. Start server: +```bash +vllm serve /path/to/model --limit-mm-per-prompt '{"image":5}' --gpu-memory-utilization 0.92 --port 12347 --trust-remote-code --disable-cascade-attn --no-enable-prefix-caching --max-model-len 65536 --tensor-parallel-size 4 --pipeline-parallel-size 4 --max-num-seqs 1024 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' +``` + +3. Run client (Input1024, Output1024, BS10): +```bash +vllm bench serve --num-prompts 4*[max-concurrency] --model /path/to/model --dataset-name random --random-input-len 1024 --random-output-len 1024 --max-concurrency 10 --host 0.0.0.0 --port 12347 --disable-tqdm --ignore-eos +``` + +#### Accuracy Test + +4. The evaluation scripts are already included in this directory: +```bash +# eval_dataset.py and eval_dataset_w8a8.py are in the current directory +pip install fire +``` + +5. Set environment variables: +```bash +export VLLM_W8A8_MOE_USE_W4A8=1 +export VLLM_ENFORCE_CUDA_GRAPH=1 +``` + +6. Start server: +```bash +vllm serve /path/to/model --limit-mm-per-prompt '{"image":5}' --gpu-memory-utilization 0.92 --port 12347 --trust-remote-code --disable-cascade-attn --no-enable-prefix-caching --max-model-len 65536 --tensor-parallel-size 4 --pipeline-parallel-size 4 --max-num-seqs 1024 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' +``` + +7. Run client (MMMU dataset): +```bash +pip install fire +python3 eval_dataset.py --dataset_name MMMU_BETA --model /path/to/model --ip 127.0.0.1 --port 12347 --num_workers 8 +``` + ### Inference with w8a8 #### Starting w8a8 server ```bash diff --git a/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md b/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md index b59e7d8aa673153fe16a725047904f7c035453e6..8b8a596f303b60acd61cbcc50dce7e1ecfd2cd78 100644 --- a/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md +++ b/models/nlp/llm/deepseek-r1-distill-qwen-7b/vllm/README.md @@ -10,6 +10,7 @@ based on Qwen2.5 and Llama3 series to the community. | GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | | :----: | :----: | :----: | +| MR-V100 | 4.4.0 | 26.03 | | MR-V100 | 4.3.0 | 25.09 | | MR-V100 | 4.2.0 | 25.03 | @@ -49,6 +50,31 @@ python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-7B --max-to vllm serve data/DeepSeek-R1-Distill-Qwen-7B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code ``` +### Inference with W4A8 + +#### Performance Test + +1. Use the pre-copied ``llm-benchmark``: +```bash +cd ../../llm-benchmark +pip3 install -r requirements.txt +``` + +2. 
Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server (DeepSeek-R1-Distill-Qwen-7B BF16):
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=1 --max-model-len 20480 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input2048, Output1024, BS8):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 8 --random-input 2048 --max-concurrency 8 --tokenize-prompt --random-range-ratio 1 --random-output 1024
+```
+
 ## Model Results
 
 ### Benchmarking vLLM
diff --git a/models/nlp/llm/deepseek-v3.1/vllm/README.md b/models/nlp/llm/deepseek-v3.1/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..17d6d265bab98fefed187c0bbd99c19671058566
--- /dev/null
+++ b/models/nlp/llm/deepseek-v3.1/vllm/README.md
@@ -0,0 +1,79 @@
+# DeepSeek-V3.1 (vLLM)
+
+## Model Description
+
+DeepSeek-V3 is a powerful Mixture-of-Experts (MoE) language model with 671B total parameters and 37B activated parameters. It achieves excellent performance on math, code, and reasoning tasks, comparable to leading models like GPT-4 and Claude-3.5.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_W8A8_MOE_USE_W4A8=1
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_PP_LAYER_PARTITION="16,16,16,13"
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=4 --tensor-parallel-size=4 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable_chunked_prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS8):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 8 --random-input 128 --max-concurrency 8 --tokenize-prompt --random-range-ratio 1 --random-output 128
+```
+
+#### Accuracy Test
+
+5. Install evalscope:
+```bash
+pip3 install 'evalscope[app,perf]' -U
+```
+
+6. Set environment variables:
+```bash
+export VLLM_USE_MODELSCOPE=True
+```
+
+7. Start server:
+```bash
+vllm serve /path/to/model --max-num-seqs 4 --max-model-len 95600 --served-model-name DeepSeek-v3.1-int4-pack8 --trust-remote-code --disable-cascade-attn --tensor-parallel-size 8 --pipeline-parallel-size 2 --compilation-config '{"level":0,"cudagraph_mode":"FULL_DECODE_ONLY"}' --port 9989
+```
+
+8. Run client (MATH-500 dataset):
+```bash
+evalscope eval --model DeepSeek-v3.1-int4-pack8 --dataset-args '{"math_500": {"few_shot_num": 0}}' --generation-config '{"do_sample": true, "temperature": 0.6, "max_tokens": 32768, "n": 1, "top_p": 0.95}' --datasets math_500 --eval-type openai_api --eval-batch-size 4 --api-url http://127.0.0.1:9989/v1 --timeout 12000000 --api-key EMPTY
+```
+
+## References
+
+- [DeepSeek-V3](https://github.com/deepseek-ai/DeepSeek-V3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/llm-benchmark/README.md b/models/nlp/llm/llm-benchmark/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..33aa05bf03d0c21fc64da72b190b64c586ac98a8
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/README.md
@@ -0,0 +1,308 @@
+# Installation
+
+```bash
+# install dependencies with pip
+pip3 install -r requirements.txt
+```
+
+# Accuracy Evaluation
+
+## Quick evaluation
+
+Assuming the model is served with SGLang at IP 127.0.0.1 and port 30000, run the `eval` command from any directory to evaluate a DeepSeek model on the chosen datasets with the default configuration:
+```bash
+./iluvatar_bench eval \
+    --model /data/DeepSeek-R1-AWQ \
+    --datasets gsm8k \
+    --limit 4 \
+    --eval-batch-size 8
+```
+
+### Basic parameters
+
+- `--model`: the model_id on ModelScope (downloaded automatically), or a local path such as /path/to/model.
+- `--datasets`: dataset names; multiple datasets can be given, separated by spaces, and are downloaded from ModelScope automatically. See the [supported dataset list](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/index.html).
+- `--limit`: the maximum number of samples to evaluate per dataset subset; if omitted, the full dataset is evaluated. Useful for quick validation.
+- `--eval-batch-size`: evaluation batch size, i.e. the number of concurrent requests; default 1.
+
+
+## Evaluating a model API service
+
+To evaluate a deployed model API service, set the service address (api_url) and API key (api_key) and set the eval-type parameter to server. The defaults are:
+```bash
+--api-key='EMPTY' \
+--api-url='http://127.0.0.1:30000/v1' \
+--eval-type='server'
+```
+
+# Performance Test 1
+
+## Basic usage
+
+The example below stress-tests the DeepSeek-R1-AWQ model served with SGLang on Bi150, with the input fixed at 1024 tokens and the output at 1024 tokens. Adjust the parameters to your needs.
+
+```bash
+./iluvatar_bench perf \
+    --parallel 1 10 50 100 200 \
+    --number 10 20 100 200 400 \
+    --model /data/DeepSeek-R1-AWQ \
+    --url http://127.0.0.1:30000/v1/completions \
+    --api openai \
+    --dataset random \
+    --max-tokens 1024 \
+    --min-tokens 1024 \
+    --prefix-length 0 \
+    --min-prompt-length 1024 \
+    --max-prompt-length 1024 \
+    --tokenizer-path /data/DeepSeek-R1-AWQ \
+    --extra-args '{"ignore_eos": true}'
+```
+
+### Parameters
+
+- `parallel`: number of concurrent requests; multiple values can be given, separated by spaces.
+- `number`: total number of requests to send; multiple values can be given, separated by spaces (paired one-to-one with `parallel`).
+- `url`: the request URL.
+- `model`: the model name to use.
+- `api`: the API flavor to use; default `openai`.
+- `dataset`: dataset name; `random` here means a randomly generated dataset. More available (multimodal) datasets are described in the dataset configuration docs.
+- `tokenizer-path`: path to the model tokenizer, used to count tokens (required for the random dataset).
+- `extra-args`: extra request parameters as a JSON string, e.g. `{"ignore_eos": true}` to ignore the end-of-sequence token.
+
+**Default parameters**
+
+The following parameters have these defaults:
+```bash
+--max-tokens=1024,
+--min-tokens=1024,
+--min-prompt-length=1024,
+--max-prompt-length=1024,
+--api='openai',
+--url='http://127.0.0.1:30000/v1/completions'
+```
+
+`max-tokens` and `min-tokens` are the maximum and minimum generation lengths.
+`max-prompt-length` and `min-prompt-length` are the maximum and minimum prompt lengths.
+
+# Performance Test 2
+
+This mode uses SGLang's own bench_serving.py (originally from vLLM) to measure online serving throughput and latency.
+
+bench_serving.py file info:
+```bash
+git log --oneline -- ./python/sglang/bench_serving.py
+88a6f9dab bench_serving support PD Disaggregation (#11542)
+```
+
+## Basic usage
+
+Note: the `./iluvatar_bench sgl-perf` command is equivalent to `python3 bench_serving.py`; either can be run.
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --num-prompts 1000
+```
+If the model name or path is not set, the client asks /v1/models for the default model configuration.
+
+## Common parameters
+
+* `--backend backend`: the backend, e.g. sglang or vllm.
+* `--model`: model name or path.
+* Connection parameters: `--host` and `--port`, or `--base-url`.
+* `--dataset-name`: sharegpt, random, random-ids, generated-shared-prefix, etc.; each dataset has its own configuration parameters.
+* `--request-rate`: requests arriving per second (default inf, meaning all requests arrive at once). Arrival times are modeled with a **Poisson process**: the gaps between requests are random, but the average rate matches the configured value, which is a more realistic model of randomly arriving user traffic. For example, to send 6 requests every 3.5 seconds, `Request rate = 6 requests / 3.5 seconds ≈ 1.71 requests/second` (see the sketch after this list).
+* `--request-interval`: a fixed interval in seconds. If set, it overrides `--request-rate` and uses deterministic (fixed-time) interval scheduling.
+* `--max-concurrency`: maximum number of concurrent requests, i.e. the number of workers actually processing requests. While `--request-rate` controls how fast requests are issued, this parameter caps how many are allowed to execute at the same time.
+* `--warmup-requests`: number of warmup requests before the benchmark.
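+For intuition, the arrival schedule implied by `--request-rate` can be reproduced in a few lines. This is a minimal sketch of the idea (`poisson_arrival_times` is an illustrative helper, not the benchmark's actual implementation):
+
+```python
+import random
+
+def poisson_arrival_times(request_rate: float, num_requests: int, seed: int = 0):
+    """Yield absolute send times (in seconds) for a Poisson arrival process."""
+    rng = random.Random(seed)
+    t = 0.0
+    for _ in range(num_requests):
+        # exponential inter-arrival gaps give Poisson arrivals at `request_rate` req/s
+        t += rng.expovariate(request_rate)
+        yield t
+
+# the 6-requests-per-3.5-seconds example above: ~1.71 req/s on average
+print(list(poisson_arrival_times(6 / 3.5, 6)))
+```
+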
+## The sharegpt dataset
+
+`sharegpt` is a real conversation dataset (the default). Its parameters:
+* `--num-prompts`: total number of requests.
+* `--sharegpt-output-len`: output length; if not specified, it is determined by the sample lengths in the dataset.
+* `--sharegpt-context-len`: the overall context length; when set, a request is skipped if `input + output > maximum context length`.
+
+In short, the input length cannot be specified; the output length and the maximum context length can.
+
+Note that a request is also skipped when:
+* `prompt_len < 2` or `output_len < 2`
+
+## The random / random-ids datasets
+
+* `random`: **real text** from the ShareGPT dataset. A random target input length is chosen (say 500 tokens), then a real prompt is randomly selected from ShareGPT. If the prompt is too long (say 1000 tokens) it is truncated to 500 tokens; if it is too short (say 100 tokens) its tokens are repeated until the 500 tokens are filled. Use this to simulate a random-length workload whose prompt content is **real natural language**.
+* `random-ids`: **fully random token IDs**. A random target input length is chosen (say 500 tokens); no external dataset is loaded, and 500 token IDs are drawn directly at random from the tokenizer vocabulary. The resulting text has **no linguistic meaning** (i.e. "gibberish"). Use this to simulate a random-length workload of random, meaningless data; it is useful for stress-testing the tokenizer and the model's handling of unusual input.
+
+The parameters:
+* `--num-prompts`: total number of requests to process.
+* `--random-input-len` (default: 1024): maximum input token length per request. The script samples a length from `[random-input-len * random-range-ratio, random-input-len + 1)`.
+* `--random-output-len` (default: 1024): maximum output token length per request. The script samples a length from `[random-output-len * random-range-ratio, random-output-len + 1)`.
+* `--random-range-ratio` (default: 0.0): a float between 0.0 and 1.0 that sets the lower bound of the random lengths. Set it to 1.0 to pin the input/output length to exactly 1024.
+* `--tokenize-prompt`: mainly for the `random` and `random-ids` datasets, so that with the `sglang` and `vllm` backends the benchmark sends token-ID lists of the exact length. For example, the client builds a list of 1024 token IDs, skips the decoding step, and sends the integer ID list directly to the server; the server in turn skips tokenization and uses the list as-is. The benefit is that the input the server processes is **exactly** the 1024 tokens we asked for.
+
+The min/max input and output lengths are computed as follows; skip the "formula" and "example" below if you do not need them.
+
+Formula:
+* `actual input length = [random-input-len * random-range-ratio, random-input-len + 1)`
+* `actual output length = [random-output-len * random-range-ratio, random-output-len + 1)`
+
+Example:
+```bash
+--dataset-name random \
+--random-input-len 1024 \
+--random-output-len 1024 \
+--random-range-ratio 0.8
+```
+The input/output lengths are then drawn from the interval `[819, 1025) = [1024 * 0.8, 1024+1)`, so a request might end up with lengths `833/955`.
+To pin the input/output length to exactly 1024, set `--random-range-ratio` to 1.0.
+
+## The generated-shared-prefix dataset
+
+Unlike `sharegpt`, `generated-shared-prefix` is not a static dataset loaded from an external file; it is generated on the fly.
+
+Its core purpose is to simulate a very common and important LLM serving scenario: a large number of requests sharing one long prefix. This is typical of multi-tenant deployments, RAG (retrieval-augmented generation), and applications with a complex system prompt.
+
+Each generated request has two parts (see the workload sketch after the parameter list below):
+
+1. **A shared system prompt**: a long block of text shared within a group.
+2. **A unique question**: a shorter block of text unique to each request.
+
+The full prompt is constructed roughly as `"{system_prompt}\n\n{question}"`.
+
+The parameters:
+* `--gsp-num-groups` (default: 64): the number of **unique system prompts** to generate, i.e. how many "shared prefix" groups the benchmark has.
+* `--gsp-prompts-per-group` (default: 16): how many unique requests (questions) each group (each system prompt) contains. This determines how many times each shared prefix is reused. The total request count is `gsp-num-groups * gsp-prompts-per-group`.
+* `--gsp-system-prompt-len` (default: 2048): the target token length of each generated system prompt, simulating a long prefix (e.g. a complex instruction set or a large context document).
+* `--gsp-question-len` (default: 128): the target token length of each generated unique question, simulating the non-shared part of the prompt.
+* `--gsp-output-len` (default: 256): the target number of output tokens per request, i.e. how much the model should generate after receiving `system_prompt + question`.
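+As a rough illustration of the workload shape (character placeholders stand in for token-length targets; `build_gsp_prompts` is a hypothetical helper, not the generator's real code):
+
+```python
+def build_gsp_prompts(num_groups: int, prompts_per_group: int) -> list[str]:
+    prompts = []
+    for g in range(num_groups):
+        system_prompt = f"<shared system prompt for group {g}>"  # ~gsp-system-prompt-len tokens
+        for q in range(prompts_per_group):
+            question = f"<unique question {q}>"  # ~gsp-question-len tokens
+            # every request in the group repeats the same long prefix
+            prompts.append(f"{system_prompt}\n\n{question}")
+    return prompts
+
+assert len(build_gsp_prompts(64, 16)) == 1024  # gsp-num-groups * gsp-prompts-per-group
+```
+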
+## Examples
+
+1. `sharegpt` (simulating real conversations)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name sharegpt \
+    --num-prompts 1000
+```
+
+2. `random` (synthetic load with specific lengths)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --num-prompts 1000 \
+    --random-input 2048 \
+    --random-output 128 \
+    --random-range-ratio 0.5
+```
+This command runs 1000 requests. Each request's input length is random between `(2048 * 0.5)` and `2048` (i.e. 1024 to 2048 tokens), and its output length is random between `(128 * 0.5)` and `128` (i.e. 64 to 128 tokens). The prompt content is padded from ShareGPT text.
+
+3. `random-ids` (a pure stress test)
+
+The most extreme stress test. It does not care about the linguistic meaning of the prompt; it simply generates fully random token IDs to fill the requested input length.
+It can be combined with `--tokenize-prompt` to send ID lists like `[1024, 512, 300, ...]` instead of decoded gibberish strings. This gives **100% precise control over the input length** and is the best way to measure raw hardware and system throughput.
+
+```bash
+# stress test: 1000 requests, each with *exactly* 1024 input IDs
+# and 1024 requested output IDs
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random-ids \
+    --num-prompts 1000 \
+    --random-input-len 1024 \
+    --random-output-len 1024 \
+    --random-range-ratio 1.0 \
+    --tokenize-prompt
+```
+`--random-range-ratio 1.0` keeps the input/output lengths from being randomized: they are exactly 1024. `--tokenize-prompt` makes the client send an `input_ids` list instead of `text`. This command measures the server's raw performance on "1024-in, 1024-out" requests.
+
+4. Rate control + output file
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --random-input-len 1024 --random-output-len 1024 --random-range-ratio 1.0 \
+    --num-prompts 2000 \
+    --request-rate 100 \
+    --max-concurrency 512 \
+    --output-file sglang_random.jsonl --output-details
+```
+
+5. `generated-shared-prefix` (testing KV cache performance)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name generated-shared-prefix \
+    --gsp-num-groups 64 --gsp-prompts-per-group 16 \
+    --gsp-system-prompt-len 4096 --gsp-question-len 128 --gsp-output-len 256 \
+    --num-prompts 1024
+```
+This command generates `64 * 16 = 1024` total requests: 64 distinct 4096-token "system prompts" (shared prefixes), and, for each system prompt, 16 distinct 128-token "questions". While serving these 1024 (shuffled) requests, a server with an efficient KV cache should pay the cost of processing each 4096-token prefix only 64 times rather than 1024 times.
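+To put a number on that, here is a back-of-the-envelope prefill-token count for this exact configuration (illustrative arithmetic only; it assumes perfect prefix reuse and ignores scheduling effects):
+
+```python
+num_groups, prompts_per_group = 64, 16
+prefix_len, question_len = 4096, 128
+total = num_groups * prompts_per_group  # 1024 requests
+
+no_cache = total * (prefix_len + question_len)                # 4,325,376 prefill tokens
+ideal_cache = num_groups * prefix_len + total * question_len  # 393,216 prefill tokens
+print(no_cache / ideal_cache)                                 # 11.0x fewer with perfect reuse
+```
+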
+
+## random / random-ids Datasets
+
+* `random`: **real text** taken from the ShareGPT dataset. A random target input length is chosen (say 500 tokens), then a real prompt is picked at random from ShareGPT. If the prompt is too long (e.g. 1000 tokens) it is truncated to 500 tokens; if it is too short (e.g. 100 tokens) its tokens are repeated until the 500-token target is filled. This simulates a random-length workload whose prompt contents are **real natural language**.
+* `random-ids`: **completely random token IDs**. A random target input length is chosen first (say 500 tokens); no external dataset is loaded. Instead, 500 token IDs are drawn at random from the tokenizer's vocabulary. The resulting text has **no linguistic meaning** (i.e. gibberish). This simulates a random-length workload of random, meaningless data, which is useful for stress-testing the tokenizer and the model's handling of unusual input.
+
+Related options:
+
+* `--num-prompts`: total number of requests to process.
+* `--random-input-len` (default: 1024): maximum input token length per request. A length is sampled from `[random-input-len * random-range-ratio, random-input-len + 1)`.
+* `--random-output-len` (default: 1024): maximum output token length per request. A length is sampled from `[random-output-len * random-range-ratio, random-output-len + 1)`.
+* `--random-range-ratio` (default: 0.0): a float between 0.0 and 1.0 that sets the lower bound of the random lengths. To pin the input/output lengths to exactly 1024, set it to 1.0.
+* `--tokenize-prompt`: mainly for the `random` and `random-ids` datasets with the `sglang` and `vllm` backends; it benchmarks by sending a token-ID list of exact length. For example, the client builds a list of 1024 IDs, skips detokenization, and sends the integer-ID list to the server directly. The server, in turn, skips tokenization and uses the list as-is. The benefit: the input the server processes is **exactly** the 1024 tokens we asked for.
+
+The following shows how the minimum/maximum input/output lengths are computed; if you do not need this, skip the "formulas" and "worked example" below.
+
+Formulas:
+* `actual input length ∈ [random-input-len * random-range-ratio, random-input-len + 1)`
+* `actual output length ∈ [random-output-len * random-range-ratio, random-output-len + 1)`
+
+Worked example:
+```bash
+--dataset-name random \
+--random-input-len 1024 \
+--random-output-len 1024 \
+--random-range-ratio 0.8
+```
+The input/output lengths are then drawn from the interval `[819, 1025) = [1024 * 0.8, 1024 + 1)`, so a given request might end up with input/output lengths of, say, `833/955`.
+To fix the input/output lengths at exactly 1024, set `--random-range-ratio` to 1.0.
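+
+To make the two rules concrete, here is a minimal sketch (illustrative names and payload shape, not the benchmark's actual code) of how a `random-ids` request with `--tokenize-prompt` could be constructed:
+
+```python
+import random
+
+def build_random_ids_request(input_len: int, output_len: int,
+                             range_ratio: float, vocab_size: int) -> dict:
+    # Sample lengths from [len * range_ratio, len + 1), as documented above.
+    in_len = random.randint(int(input_len * range_ratio), input_len)
+    out_len = random.randint(int(output_len * range_ratio), output_len)
+    # Draw token IDs uniformly from the tokenizer vocabulary: gibberish by
+    # construction, but with an exactly controlled length.
+    input_ids = [random.randrange(vocab_size) for _ in range(in_len)]
+    # With --tokenize-prompt the payload carries IDs instead of text, so the
+    # server skips tokenization and sees exactly in_len input tokens.
+    return {"input_ids": input_ids, "max_tokens": out_len}
+
+req = build_random_ids_request(1024, 1024, 1.0, vocab_size=32000)
+print(len(req["input_ids"]), req["max_tokens"])  # 1024 1024
+```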
+
+## generated-shared-prefix Dataset
+
+Unlike `sharegpt`, which is loaded from an external file, `generated-shared-prefix` is not a static dataset: it is generated on the fly.
+
+Its core purpose is to simulate a very important and common LLM serving scenario: a large number of requests sharing one long prefix. This typically arises in multi-tenant services, RAG (retrieval-augmented generation), or applications with elaborate system prompts.
+
+Each generated request consists of two parts:
+
+1. **A shared system prompt**: a long block of text shared within a group.
+2. **A unique question**: a shorter block of text unique to each request.
+
+The full prompt is constructed roughly as `"{system_prompt}\n\n{question}"`.
+
+Related options (a construction sketch follows this list):
+
+* `--gsp-num-groups` (default: 64): the number of **unique system prompts** to generate, i.e. how many "shared-prefix" groups the benchmark contains.
+* `--gsp-prompts-per-group` (default: 16): how many unique requests (questions) each group (i.e. each system prompt) contains. This determines how often each shared prefix is reused. The total number of requests is `gsp-num-groups * gsp-prompts-per-group`.
+* `--gsp-system-prompt-len` (default: 2048): target token length of each generated system prompt. This models a very long prefix (e.g. a complex instruction set or a large context document).
+* `--gsp-question-len` (default: 128): target token length of each generated unique question. This models the non-shared, user-supplied part of the prompt.
+* `--gsp-output-len` (default: 256): target number of output tokens per request, i.e. how much the model should generate after receiving `system_prompt + question`.
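+
+A minimal sketch of how such a dataset could be assembled. The `random_text` helper is a hypothetical stand-in (the real generator targets token lengths, not word counts), and the structure, not the exact code, is the point:
+
+```python
+import random
+import string
+
+def random_text(num_words: int) -> str:
+    """Hypothetical filler-text helper standing in for length-targeted generation."""
+    return " ".join("".join(random.choices(string.ascii_lowercase, k=5))
+                    for _ in range(num_words))
+
+def build_gsp_dataset(num_groups: int = 64, prompts_per_group: int = 16):
+    prompts = []
+    for _ in range(num_groups):
+        system_prompt = random_text(2048)        # shared within the group
+        for _ in range(prompts_per_group):
+            question = random_text(128)          # unique per request
+            prompts.append(f"{system_prompt}\n\n{question}")
+    random.shuffle(prompts)                      # requests arrive interleaved
+    return prompts
+
+dataset = build_gsp_dataset()
+print(len(dataset))  # 64 * 16 = 1024 requests
+```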
+
+## Examples
+
+1. `sharegpt` (simulating real conversations)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name sharegpt \
+    --num-prompts 1000
+```
+
+2. `random` (simulating a synthetic workload with specific lengths)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --num-prompts 1000 \
+    --random-input-len 2048 \
+    --random-output-len 128 \
+    --random-range-ratio 0.5
+```
+This command tests 1000 requests. Each request's input length is drawn between `(2048 * 0.5)` and `2048` (i.e. 1024 to 2048 tokens), and the output length between `(128 * 0.5)` and `128` (i.e. 64 to 128 tokens). The prompt contents are padded from real ShareGPT text.
+
+3. `random-ids` (pure stress test)
+
+The most extreme stress test. It does not care about the linguistic meaning of the prompt; it simply generates completely random token IDs to fill the requested input length.
+It can be combined with `--tokenize-prompt` to send ID lists such as `[1024, 512, 300, ...]` instead of decoded gibberish strings. This gives **100% precise control over the input length** and is the best way to measure raw hardware and system throughput.
+
+```bash
+# Stress test: 1000 requests, each with *exactly* 1024 input IDs
+# and 1024 requested output IDs
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random-ids \
+    --num-prompts 1000 \
+    --random-input-len 1024 \
+    --random-output-len 1024 \
+    --random-range-ratio 1.0 \
+    --tokenize-prompt
+```
+`--random-range-ratio 1.0` ensures the input/output lengths are not randomized but exactly 1024. `--tokenize-prompt` ensures the client sends an `input_ids` list rather than `text`. This command measures the server's raw performance on "1024-in, 1024-out" requests.
+
+4. Rate control + output file
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name random \
+    --random-input-len 1024 --random-output-len 1024 --random-range-ratio 1.0 \
+    --num-prompts 2000 \
+    --request-rate 100 \
+    --max-concurrency 512 \
+    --output-file sglang_random.jsonl --output-details
+```
+
+5. `generated-shared-prefix` (testing KV cache performance)
+
+```bash
+./iluvatar_bench sgl-perf \
+    --backend sglang \
+    --host 127.0.0.1 --port 30000 \
+    --model /home/data/qwen3/Qwen3-32B \
+    --dataset-name generated-shared-prefix \
+    --gsp-num-groups 64 --gsp-prompts-per-group 16 \
+    --gsp-system-prompt-len 4096 --gsp-question-len 128 --gsp-output-len 256 \
+    --num-prompts 1024
+```
+This command generates `64 * 16 = 1024` total requests. It creates 64 distinct "system prompts" (shared prefixes), each 4096 tokens long.
+
+Then, for each system prompt, it generates 16 distinct 128-token "questions". When the server processes these 1024 (shuffled) requests, an efficient KV cache should pay the cost of the 4096-token prefix only 64 times, not 1024 times.
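+
+An idealized back-of-the-envelope check of that claim (it ignores scheduling effects and partial prefix hits):
+
+```python
+groups, per_group = 64, 16
+sys_len, q_len = 4096, 128
+
+total = groups * per_group                       # 1024 requests
+no_reuse = total * (sys_len + q_len)             # every request prefills the full prefix
+ideal_reuse = groups * sys_len + total * q_len   # each prefix prefilled once per group
+
+print(total, no_reuse, ideal_reuse)              # 1024 4325376 393216
+print(f"prefill reduction: {no_reuse / ideal_reuse:.1f}x")  # 11.0x
+```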
+
+## PD Disaggregation Mode Performance Profiling
+
+**Key options**
+
+* `--pd-separated`: enable PD (prefill/decode disaggregation) mode.
+* `--profile-prefill-url`: URL(s) of the prefill worker(s) to profile.
+* `--profile-decode-url`: URL(s) of the decode worker(s) to profile.
+
+`--profile-prefill-url` and `--profile-decode-url` are mutually exclusive: set only one of them.
+
+Start server
+```bash
+# set trace path
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+
+# start prefill and decode servers (see PD disaggregation docs for setup)
+python3 -m sglang.launch_server --model-path /home/data/qwen3/Qwen3-32B --disaggregation-mode prefill
+python3 -m sglang.launch_server --model-path /home/data/qwen3/Qwen3-32B --disaggregation-mode decode --port 30001 --base-gpu-id 1
+
+# start router
+python3 -m sglang_router.launch_router --pd-disaggregation --prefill http://127.0.0.1:30000 --decode http://127.0.0.1:30001 --host 0.0.0.0 --port 8000
+```
+
+Profile Prefill Workers
+```bash
+# send profiling request targeting prefill workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000
+```
+
+Profile Decode Workers
+```bash
+# send profiling request targeting decode workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --sharegpt-output-len 100 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001
+```
+
+Note:
+* Both options accept multiple worker URLs for multi-instance setups:
+```bash
+# Profile multiple prefill workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --profile --pd-separated --profile-prefill-url http://127.0.0.1:30000 http://127.0.0.1:30002
+
+# Profile multiple decode workers
+./iluvatar_bench sgl-perf --backend sglang --model /home/data/qwen3/Qwen3-32B --num-prompts 10 --profile --pd-separated --profile-decode-url http://127.0.0.1:30001 http://127.0.0.1:30003
+```
+
+## References
+
+- [evalscope](https://github.com/modelscope/evalscope)
\ No newline at end of file
diff --git a/models/nlp/llm/llm-benchmark/iluvatar_bench b/models/nlp/llm/llm-benchmark/iluvatar_bench
new file mode 100644
index 0000000000000000000000000000000000000000..fa6ab398bea7a1006c26b2acb6e0e5560b55fe05
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/iluvatar_bench
@@ -0,0 +1,181 @@
+#!/usr/local/bin/python3
+
+import argparse
+import sys
+from argparse import ArgumentParser
+
+from evalscope import __version__
+from evalscope.cli.base import CLICommand
+
+# bench_serving.py ships alongside this script; fall back to None so the
+# other subcommands keep working when it is missing.
+try:
+    from bench_serving import define_sgl_bench_args, run_benchmark
+except ImportError:
+    define_sgl_bench_args = None
+    run_benchmark = None
+
+
+class PerfBenchCMD(CLICommand):
+    name = 'perf'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        This method creates an instance of PerfBenchCMD from parsed arguments.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the perf command."""
+        from evalscope.perf.arguments import add_argument
+
+        parser = parsers.add_parser(PerfBenchCMD.name)
+        add_argument(parser)
+        parser.set_defaults(
+            max_tokens=1024,
+            min_tokens=1024,
+            min_prompt_length=1024,
+            max_prompt_length=1024,
+            api='openai',
+            url='http://127.0.0.1:30000/v1/completions'
+        )
+        parser.set_defaults(func=PerfBenchCMD.subparser_func)
+
+    def execute(self):
+        try:
+            from evalscope.perf.main import run_perf_benchmark
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import run_perf_benchmark from evalscope.perf.main, due to {e}. '
+                "Please run `pip install 'evalscope[perf]'`."
+            )
+
+        run_perf_benchmark(self.args)
+
+
+class EvalCMD(CLICommand):
+    name = 'eval'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        This method creates an instance of EvalCMD from parsed arguments.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the eval command."""
+        from evalscope.arguments import add_argument
+
+        parser = parsers.add_parser(EvalCMD.name)
+        add_argument(parser)
+        parser.set_defaults(
+            api_key='EMPTY',
+            api_url='http://127.0.0.1:30000/v1',
+            eval_type='server'
+        )
+        parser.set_defaults(func=EvalCMD.subparser_func)
+
+    def execute(self):
+        from evalscope.run import run_task
+
+        run_task(self.args)
+
+
+class StartAppCMD(CLICommand):
+    name = 'app'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        This method creates an instance of StartAppCMD from parsed arguments.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the app command."""
+        from evalscope.app import add_argument
+
+        parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
+        parser.set_defaults(func=StartAppCMD.subparser_func)
+
+    def execute(self):
+        try:
+            from evalscope.app import create_app
+        except ImportError as e:
+            raise ImportError(
+                f'Failed to import create_app from evalscope.app, due to {e}. '
+                "Please run `pip install 'evalscope[app]'`."
+            )
+
+        create_app(self.args)
+
+
+class SGLPerfCMD(CLICommand):
+    name = 'sgl-perf'
+
+    def __init__(self, args):
+        self.args = args
+
+    @classmethod
+    def subparser_func(cls, args):
+        """
+        Function which will be called for a specific sub parser.
+        """
+        return cls(args)
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """Define args for the sgl-perf command."""
+        parser = parsers.add_parser(SGLPerfCMD.name,
+                                    help='Run SGLang performance benchmark (bench_serving.py)')
+
+        # Only register the benchmark arguments when bench_serving is importable.
+        if define_sgl_bench_args is not None:
+            define_sgl_bench_args(parser)
+
+        parser.set_defaults(func=SGLPerfCMD.subparser_func)
+
+    def execute(self):
+        if run_benchmark is None:
+            raise ImportError(
+                "Failed to import 'run_benchmark' from 'bench_serving'. "
+                "Command 'sgl-perf' cannot execute."
+            )
+
+        run_benchmark(self.args)
+
+
+def run_cmd():
+    parser = argparse.ArgumentParser('EvalScope Command Line tool', usage='evalscope <command> [<args>]')
+    parser.add_argument('-v', '--version', action='version', version=f'evalscope {__version__}')
+    subparsers = parser.add_subparsers(help='EvalScope command line helper.')
+
+    PerfBenchCMD.define_args(subparsers)
+    EvalCMD.define_args(subparsers)
+    StartAppCMD.define_args(subparsers)
+
+    # sgl-perf
+    SGLPerfCMD.define_args(subparsers)
+
+    args = parser.parse_args()
+
+    if not hasattr(args, 'func'):
+        parser.print_help()
+        sys.exit(1)
+
+    cmd = args.func(args)
+    cmd.execute()
+
+
+if __name__ == '__main__':
+    run_cmd()
diff --git a/models/nlp/llm/llm-benchmark/requirements.txt b/models/nlp/llm/llm-benchmark/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d25d9047746399f23453ec9a29e29b92aa9db271
--- /dev/null
+++ b/models/nlp/llm/llm-benchmark/requirements.txt
@@ -0,0 +1,3 @@
+evalscope==1.0.2
+evalscope[perf]==1.0.2
+evalscope[app]==1.0.2
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md b/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d790fe79907a37f5ef61f0c8ab81f3e91c77ed82
--- /dev/null
+++ b/models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md
@@ -0,0 +1,56 @@
+# Qwen3-235B-A22B-Thinking-2507 (vLLM)
+
+## Model Description
+
+Qwen3-235B-A22B is a large Mixture-of-Experts (MoE) language model with 235B total parameters and 22B activated parameters. The "Thinking" version is optimized for complex logical reasoning, math, and coding tasks with enhanced reasoning capabilities.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_W8A8_MOE_USE_W4A8=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=4 --tensor-parallel-size=4 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable-chunked-prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input-len 128 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output-len 128
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md b/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d4b23188d6132f6fb1d89e3cbf245802fc98be1
--- /dev/null
+++ b/models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md
@@ -0,0 +1,56 @@
+# Qwen3-30B-A3B-Thinking-2507 (vLLM)
+
+## Model Description
+
+Qwen3-30B-A3B is a Mixture-of-Experts (MoE) large language model with 30B total parameters and 3B activated parameters. The "Thinking" version is optimized for complex logical reasoning, math, and coding tasks.
+
+This version supports W4A8 (Weight-4bit, Activation-8bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W4A8
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+export VLLM_W8A8_MOE_USE_W4A8=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=2 --max-model-len 4096 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable-chunked-prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input128, Output128, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input-len 128 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output-len 128
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-32b/vllm/README.md b/models/nlp/llm/qwen3-32b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8494587317da66ad862415d8ffd10a1c16142277
--- /dev/null
+++ b/models/nlp/llm/qwen3-32b/vllm/README.md
@@ -0,0 +1,55 @@
+# Qwen3-32B (vLLM)
+
+## Model Description
+
+Qwen3-32B is a dense large language model with 32B parameters, offering excellent performance on reasoning, instruction-following, and multilingual tasks. It supports seamless switching between thinking mode (for complex logical reasoning, math, and coding) and non-thinking mode (for efficient, general-purpose dialogue).
+
+This version supports W8A8 (Weight-8bit, Activation-8bit) and W4A16 (Weight-4bit, Activation-16bit) quantization for efficient inference.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with W8A8/W4A16
+
+#### Performance Test
+
+1. Use the pre-copied ``llm-benchmark``:
+```bash
+cd ../../llm-benchmark
+pip3 install -r requirements.txt
+```
+
+2. Set environment variables:
+```bash
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --trust-remote-code --pipeline-parallel-size=1 --tensor-parallel-size=2 --max-model-len 8192 --gpu-memory-utilization 0.9 --disable-cascade-attn --no-enable-prefix-caching --no-enable-chunked-prefill --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}'
+```
+
+4. Run client (Input2048, Output1024, BS1):
+```bash
+./iluvatar_bench sgl-perf --backend vllm --host 0.0.0.0 --port 8000 --model /path/to/model --dataset-name random --dataset-path /path/to/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1 --random-input-len 2048 --max-concurrency 1 --tokenize-prompt --random-range-ratio 1 --random-output-len 1024
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md b/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f462989e4b6f662e1a4f02f374157fb3e9c033eb
--- /dev/null
+++ b/models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md
@@ -0,0 +1,55 @@
+# Qwen3-Next-80B-A3B-Instruct (vLLM)
+
+## Model Description
+
+Qwen3-Next-80B-A3B-Instruct is a Mixture-of-Experts (MoE) large language model with 80B total parameters and 3B activated parameters. This is the next-generation Qwen model with enhanced reasoning capabilities and instruction following.
+
+This version runs in BF16 precision for maximum accuracy.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.4.0 | 26.03 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: 
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+## Model Inference
+
+### Inference with BF16
+
+#### Accuracy Test
+
+1. Install evalscope:
+```bash
+pip3 install 'evalscope[app,perf]' -U
+```
+
+2. Set environment variables:
+```bash
+export VLLM_USE_MODELSCOPE=True
+export VLLM_ENFORCE_CUDA_GRAPH=1
+```
+
+3. Start server:
+```bash
+vllm serve /path/to/model --served-model-name Qwen3-Next-80B-A3B-Instruct --trust-remote-code --pipeline-parallel-size 1 --tensor-parallel-size 8 --max-num-seqs 64 --max-model-len 40960 --disable-cascade-attn --gpu-memory-utilization 0.90 --compilation-config '{"cudagraph_mode": "FULL_DECODE_ONLY", "level": 0}' --port 9989
+```
+
+4. Run client (MMLU-Pro dataset):
+```bash
+evalscope eval --model Qwen3-Next-80B-A3B-Instruct --dataset-args '{"mmlu_pro": {"few_shot_num": 0}}' --generation-config '{"do_sample": true, "temperature": 0.7, "max_tokens": 32768, "n": 1, "top_p": 0.8, "top_k": 20}' --datasets mmlu_pro --eval-type openai_api --eval-batch-size 64 --api-url http://127.0.0.1:9989/v1 --timeout 12000000 --api-key EMPTY
+```
+
+## References
+
+- [Qwen3](https://github.com/QwenLM/Qwen3)
+- [vLLM](https://github.com/vllm-project/vllm)
\ No newline at end of file
diff --git a/tests/model_info.json b/tests/model_info.json
index 5fa94159b24ceded3fe8dcc876e4313390de496b..1eb8591fa3a56ccd51211c20095b16493a1344d7 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -10060,6 +10060,370 @@
         "type": "inference",
         "hasDemo": false,
         "demoType": ""
+      },
+      {
+        "display_name": "DeepSeek V3.1",
+        "model_name": "deepseek-v3.1",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": "4.4.0",
+        "latest_gpgpu": "",
+        "category": "nlp/llm",
+        "toolbox": "",
+        "mdims": "",
+        "dataset": "",
+        "license": "",
+        "model_path": "models/nlp/llm/deepseek-v3.1/vllm",
+        "readme_file": "models/nlp/llm/deepseek-v3.1/vllm/README.md",
+        "bitbucket_repo": "",
+        "bitbucket_branch": "",
+        "bitbucket_path": "",
+        "develop_owner": "",
+        "github_repo": "",
+        "github_branch": "",
+        "github_path": "",
+        "datasets": "",
+        "download_url": "https://www.modelscope.cn/models/deepseek-ai/DeepSeek-V3",
+        "need_third_part": false,
+        "precisions": [
+            "w4a8"
+        ],
+        "type": "inference",
+        "hasDemo": false,
+        "demoType": ""
+      },
+      {
+        "display_name": "Qwen3 32B",
+        "model_name": "qwen3-32b",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": "4.4.0",
+        "latest_gpgpu": "",
+        "category": "nlp/llm",
+        "toolbox": "",
+        "mdims": "",
+        "dataset": "",
+        "license": "",
+        "model_path": "models/nlp/llm/qwen3-32b/vllm",
+        "readme_file": "models/nlp/llm/qwen3-32b/vllm/README.md",
+        "bitbucket_repo": "",
+        "bitbucket_branch": "",
+        "bitbucket_path": "",
+        "develop_owner": "",
+        "github_repo": "",
+        "github_branch": "",
+        "github_path": "",
+        "datasets": "",
+        "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-32B",
+        "need_third_part": false,
+        "precisions": [
+            "w8a8",
+            "w4a16"
+        ],
+        "type": "inference",
+        "hasDemo": false,
+        "demoType": ""
+      },
+      {
+        "display_name": "Qwen3 30B A3B Thinking",
+        "model_name": "qwen3-30b-a3b-thinking",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": "4.4.0",
+        "latest_gpgpu": "",
+        "category": "nlp/llm",
+        "toolbox": "",
+        "mdims": "",
+        "dataset": "",
+        "license": "",
+        "model_path": "models/nlp/llm/qwen3-30b-a3b-thinking/vllm",
+        "readme_file": "models/nlp/llm/qwen3-30b-a3b-thinking/vllm/README.md",
+        "bitbucket_repo": "",
+        "bitbucket_branch": "",
+        "bitbucket_path": "",
+        "develop_owner": "",
+        "github_repo": "",
+        "github_branch": "",
+        "github_path": "",
+        "datasets": "",
+        "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-30B-A3B",
+        "need_third_part": false,
+        "precisions": [
+            "w4a8"
+        ],
+        "type": "inference",
+        "hasDemo": false,
+        "demoType": ""
+      },
+      {
+        "display_name": "Qwen3 235B A22B Thinking",
+        "model_name": "qwen3-235b-a22b-thinking",
+        "framework": "vllm",
+        "release_version": "26.03",
+        "release_sdk": "4.4.0",
+        "release_gpgpu": "MR-V100",
+        "latest_sdk": 
"4.4.0", + "latest_gpgpu": "", + "category": "nlp/llm", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/nlp/llm/qwen3-235b-a22b-thinking/vllm", + "readme_file": "models/nlp/llm/qwen3-235b-a22b-thinking/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-235B-A22B", + "need_third_part": false, + "precisions": [ + "w4a8" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Qwen3 Next 80B A3B", + "model_name": "qwen3-next-80b-a3b", + "framework": "vllm", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "nlp/llm", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/nlp/llm/qwen3-next-80b-a3b/vllm", + "readme_file": "models/nlp/llm/qwen3-next-80b-a3b/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://www.modelscope.cn/models/Qwen/Qwen3-Next-80B-A3B-Instruct", + "need_third_part": false, + "precisions": [ + "bf16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "FLUX.1 Dev", + "model_name": "flux.1-dev", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/flux.1-dev/xdit", + "readme_file": "models/multimodal/diffusion_model/flux.1-dev/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/black-forest-labs/FLUX.1-dev", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "HunyuanVideo", + "model_name": "hunyuan_video", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/hunyuan_video/xdit", + "readme_file": "models/multimodal/diffusion_model/hunyuan_video/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/Tencent-Hunyuan/HunyuanVideo", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Wan2.1 T2V 14B", + "model_name": "wan2.1-t2v-14b", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + 
"license": "", + "model_path": "models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit", + "readme_file": "models/multimodal/diffusion_model/wan2.1-t2v-14b/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/Wan-AI/Wan2.1-T2V-14B-Diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Wan2.2 TI2V 5B", + "model_name": "wan2.2-ti2v-5b", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit", + "readme_file": "models/multimodal/diffusion_model/wan2.2-ti2v-5b/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://www.modelscope.cn/models/Wan-AI/Wan2.2-TI2V-5B-Diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "HunyuanDiT v1.2", + "model_name": "hunyuandit-v1.2", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit", + "readme_file": "models/multimodal/diffusion_model/hunyuanDit-v1.2/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/dengcao/HunyuanDiT-v1.2-Diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "SD3 Medium", + "model_name": "stable-diffusion-3-medium", + "framework": "xdit", + "release_version": "26.03", + "release_sdk": "4.4.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "4.4.0", + "latest_gpgpu": "", + "category": "multimodal/diffusion_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit", + "readme_file": "models/multimodal/diffusion_model/stable-diffusion-3-medium/xdit/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://modelscope.cn/models/stabilityai/stable-diffusion-3-medium-diffusers", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" } ] } \ No newline at end of file