diff --git a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README.md b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README.md index bc151e2..3f7f16b 100644 --- a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README.md +++ b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README.md @@ -201,7 +201,7 @@ cd python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge agentkit config \ --agent_name vikingdb_agent \ --entry_point 'agent.py' \ ---runtime_envs DATABASE_TOS_BUCKET=agentkit-platform-2107625663 \ +--runtime_envs DATABASE_TOS_BUCKET=agentkit-platform- \ --runtime_envs DATABASE_VIKING_COLLECTION=agentkit_knowledge_app \ --launch_type cloud diff --git a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README_en.md b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README_en.md index 169229a..8ac19e0 100644 --- a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README_en.md +++ b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/README_en.md @@ -201,7 +201,7 @@ cd python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge agentkit config \ --agent_name vikingdb_agent \ --entry_point 'agent.py' \ ---runtime_envs DATABASE_TOS_BUCKET=agentkit-platform-2107625663 \ +--runtime_envs DATABASE_TOS_BUCKET=agentkit-platform- \ --runtime_envs DATABASE_VIKING_COLLECTION=agentkit_knowledge_app \ --launch_type cloud diff --git a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/agent.py b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/agent.py index 0048c58..37bb137 100644 --- a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/agent.py +++ b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/agent.py @@ -29,11 +29,14 @@ "售后服务政策:\n1. 质保期:所有电子产品提供1年免费质保。\n2. 退换货:购买后7天内无理由退货,15天内有质量问题换货。\n3. 
客服支持:提供7x24小时在线客服咨询。" ) - # 创建知识库 -kb = KnowledgeBase( - backend="viking", - app_name=os.getenv("DATABASE_VIKING_COLLECTION", "agentkit_knowledge_app"), -) +# 创建知识库 +knowledge_collection_name = os.getenv("DATABASE_VIKING_COLLECTION", "") +if knowledge_collection_name != "": + # 使用用户指定的知识库 + kb = KnowledgeBase(backend="viking", index=knowledge_collection_name) +else: + raise ValueError("DATABASE_VIKING_COLLECTION environment variable is not set") + kb.add_from_files( files=["/tmp/product_info.txt", "/tmp/service_policy.txt"], tos_bucket_name=os.environ.get("DATABASE_TOS_BUCKET"), diff --git a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/pyproject.toml b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/pyproject.toml index bd2ea47..59f742d 100644 --- a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/pyproject.toml +++ b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/pyproject.toml @@ -5,9 +5,9 @@ description = "Add your description here" readme = "README.md" requires-python = ">=3.12" dependencies = [ - "google-adk==1.18.0", - "veadk-python[extensions]==0.2.29", - "veadk-python==0.2.29", + "google-adk==1.19.0", + "veadk-python[extensions]==0.5.5", + "veadk-python==0.5.5", "agentkit-sdk-python==0.2.0", "python-dotenv>=1.0.0", ] \ No newline at end of file diff --git a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/requirements.txt b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/requirements.txt index a79890f..698c35e 100644 --- a/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/requirements.txt +++ b/python/01-tutorials/06-agentkit-knowledge/01_viking_knowledge/requirements.txt @@ -1,4 +1,4 @@ -veadk-python==0.2.29 -veadk-python[extensions]==0.2.29 -google-adk==1.18.0 +veadk-python==0.5.5 +veadk-python[extensions]==0.5.5 +google-adk==1.19.0 agentkit-sdk-python==0.2.0 diff --git a/python/02-use-cases/12_ad_video_gen_seq/README.md 
b/python/02-use-cases/12_ad_video_gen_seq/README.md new file mode 100644 index 0000000..056cfdf --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/README.md @@ -0,0 +1,197 @@ +# 电商营销视频生成 E-commerce Marketing Video Generation + +## 概述 + +> 本项目基于 VeADK 的 SequentialAgent(串行多 Agent 编排)实现电商营销视频生成:从“营销策划/分镜脚本/图片生成/质量评估/视频生成/合成发布”串联成一条稳定的工作流,适合快速生成商品展示类短视频(如单品宣传、活动促销物料)。 +> +> 该示例以单个 Root Agent 对外提供服务,内部由多个子 Agent 按固定顺序执行,便于本地调试与云端部署。 + +- 本项目是`04_ad_video_gen_a2a`的衍生,对其进行了agentkit平台部署的适配。 +- 本项目是sequential-agent,而multimedia是使用的是a2a 方式进行agent交互 +- 本项目能够在agentkit 平台进行部署 + +## 核心功能 + +本项目提供以下核心功能: + +- **营销策划与生成配置**:根据用户输入(商品名/卖点/素材链接)生成视频结构与生成参数 +- **分镜脚本生成**:自动输出分镜(shot)脚本,包含画面描述、动作与生成要点 +- **文生图/图生图批量生成**:按分镜批量生成多张候选首帧图,支持参考图输入 +- **图片/视频质量评估与筛选**:对候选图/视频打分并选优,减少“抽卡”成本 +- **文生视频/首帧引导视频生成**:基于选中的首帧为每个分镜生成多条视频候选 +- **本地合成与TOS上传**:本地拼接分镜视频为成片,并上传到 TOS 生成可访问 URL + +## Agent 能力 + +系统由一个 Root Agent 对外提供服务,内部按顺序编排以下子 Agent: + +- **营销策划 Agent (`market_agent`)**:解析用户输入、补全关键信息,生成视频生成配置与分镜数量要求 +- **分镜 Agent (`storyboard_agent`)**:根据配置输出分镜脚本(shots) +- **生图 Agent (`image_agent`)**:为每个分镜批量生成候选首帧图 +- **图片评估 Agent (`image_evaluate_agent`)**:对每个分镜候选图片进行打分并选优 +- **生视频 Agent (`video_agent`)**:基于选优图片生成分镜视频(支持批量、多条候选) +- **视频评估 Agent (`video_evaluate_agent`)**:对分镜视频进行质量评估并选优 +- **合成发布 Agent (`release_agent`)**:将选优分镜视频拼接成成片并上传到 TOS 输出链接 + +### 费用说明 + +| 相关服务 | 描述 | 计费说明 | +| --- | --- | --- | +| [Doubao-Seed-1.6](https://console.volcengine.com/ark/region:ark+cn-beijing/model/detail?Id=doubao-seed-1-6) | 负责理解用户信息并转化为工具调用。 | [多种计费方式](https://www.volcengine.com/docs/82379/1099320) | +| [Doubao-Seedance 1.5 pro](https://console.volcengine.com/ark/region:ark+cn-beijing/model/detail?Id=doubao-seedance-1-5-pro) | 负责将图片和文字描述转为视频。 | [多种计费方式](https://www.volcengine.com/docs/82379/1099320) | +| [Doubao-Seedream 4.5 pro](https://console.volcengine.com/ark/region:ark+cn-beijing/model/detail?Id=doubao-seedream-4-5) | 负责根据文字或参考图生成图片 | [多种计费方式](https://www.volcengine.com/docs/82379/1099320) | + +## 
本地运行 + +### 环境准备 + +开始前,请确保您的开发环境满足以下要求: + +- Python 3.12 或更高版本 +- veadk-python 0.5.5(见 `pyproject.toml`) +- 推荐使用 `uv` 进行依赖管理 +- 本地需要可用的 `ffmpeg`(用于 `moviepy` 合成视频) +- 获取火山方舟 API KEY +- 获取火山引擎 AK/SK + +### 快速入门 + +请按照以下步骤在本地部署和运行本项目。 + +#### 1. 下载代码并安装依赖 + +```bash +# 克隆代码仓库 +git clone https://github.com/volcengine/agentkit-samples.git +cd agentkit-samples/python/02-use-cases/12_ad_video_gen_seq + +# 安装项目依赖 +uv sync --index-url https://mirrors.aliyun.com/pypi/simple + +# mac or linux +source .venv/bin/activate +# windows powershell +.venv\Scripts\activate +``` + +#### 2. 配置环境变量 + +请参考 `config.yaml.example` 创建 `config.yaml`,并填入必要的密钥信息(模型、AK/SK、TOS bucket 等)。 + +```bash +# 复制配置文件 +cp config.yaml.example config.yaml +``` + +`config.yaml` 的关键字段包括: + +- `model.agent.*`:用于文本理解/规划/评估的模型配置 +- `model.agent.image.*`:用于生图的模型配置 +- `model.agent.video.*`:用于生视频的模型配置 +- `volcengine.access_key` / `volcengine.secret_key`:用于 TOS 上传鉴权 +- `database.tos.bucket`:用于存储生成视频、图片等产物的 bucket 名称 + - 你可以将bucket设置为agentkit-platform-{{your_account_id}} + - 其中 `{{your_account_id}}`需要替换为您的火山引擎账号 ID + - 示例: `DATABASE_TOS_BUCKET=agentkit-platform-12345678901234567890` + +#### 3.本地调试 + +- 本地调试时,可直接运行 `debug.py` 启动服务。 + + ```bash + python debug.py + ``` + +- 或者通过 `veadk web` 进行调试 + + 通过 `veadk web` 进行本地测试 + + ```bash + veadk web + ``` + +默认监听 `http://0.0.0.0:8000`。 + +#### 4. 
调试方法 + +推荐使用以下方式在本地快速调试完整链路: + +```bash +python debug.py +``` + +## AgentKit 部署 + +部署前请设置相关环境变量 + +```bash +export VOLCENGINE_ACCESS_KEY={your_ak} +export VOLCENGINE_SECRET_KEY={your_sk} +``` + +部署到运行时 + +```bash +agentkit config \ + --agent_name multimedia_seq \ + --entry_point main.py \ + --launch_type cloud \ + --runtime_envs DATABASE_TOS_BUCKET=agentkit-platform-{{your_account_id}} \ + --image_tag v1.0.0 + +agentkit launch +``` + +### 技术实现 + +本项目核心为一套基于 VeADK 构建的串行多 Agent 工作流,由 Root Agent 统一编排各子 Agent 顺序执行,形成稳定、可复现的视频生产链路: + +用户输入 → 营销策划 → 分镜生成 → 生图 → 图片评估 → 生视频 → 视频评估 → 合成与上传 + +## 目录结构说明 + +```plaintext +/ +├── README.md # 本文档 +├── app/ # Agent 与工具实现 +│ ├── root/ # Root 顺序编排入口(SequentialAgent) +│ ├── market/ # 营销策划(生成视频配置/分镜数量等) +│ ├── storyboard/ # 分镜脚本生成 +│ ├── image/ # 生图与图片结果结构化 +│ ├── eval/ # 图片/视频评估与选优 +│ ├── video/ # 生视频(支持批量生成) +│ ├── release/ # 视频拼接与上传 +│ └── utils.py # URL code 映射、TOS 上传等公共方法 +├── config.yaml.example # 配置文件示例 +├── debug.py # 本地调试脚本(不启动服务) +├── model.py # Agent Model +├── main.py # 本地启动服务入口(AgentkitAgentServerApp) +├── pyproject.toml # 依赖管理(uv) +└── requirements.txt # 依赖管理(pip/uv pip) +``` + +## 示例提示词 + +以下是一些常用的提示词示例: +- `请生成一条巧克力的圣诞节营销视频。商品名:圣诞限定黑巧克力礼盒装。适用场景和人群:适合所有巧克力爱好者,特别是追求圣诞节极致口感、甜蜜分享与能量补充的消费者;适用于圣诞下午茶时光、节日亲友聚会、温馨赠礼或任何需要增添节日氛围的愉悦时刻。主要成分:精选可可豆、纯可可脂、优质牛奶、天然香草,无添加人工色素和防腐剂,富含抗氧化剂。口味/特点:入口即化,丝滑醇厚,浓郁可可香,微苦回甘中带有节日限定的暖心回味 http://lf3-static.bytednsdoc.com/obj/eden-cn/lm_sth/ljhwZthlaukjlkulzlp/ark/assistant/images/ad_chocolate.png` +- `请生成一条面包营销视频。商品名:奶香松软拉丝吐司;适用场景和人群:场景:早餐配餐、下午茶点、日常代餐,人群:上班族、学生党、家庭群体(偏好松软口感的面包爱好者);主要成分:高筋面粉、牛奶、鸡蛋、黄油、酵母、白砂糖;口味 / 特点:口味:浓郁奶香,搭配黄油 + 蜂蜜后口感香甜柔润,特点:面包质地松软、切面蜂窝气孔均匀,烤后表皮带焦脆斑点,兼具松软内里与香脆外皮的双重口感 http://lf3-static.bytednsdoc.com/obj/eden-cn/lm_sth/ljhwZthlaukjlkulzlp/ark/assistant/images/ad_bread.jpeg` +- `生成一条侘寂风香薰蜡烛的商品图电商营销视频。商品名:侘寂风香薰蜡烛;适用场景和人群:场景:居家客厅装饰、卧室助眠、书房放松、极简风空间氛围营造,人群:喜欢极简 / 侘寂美学的家居爱好者、追求松弛感的都市白领、香薰收藏者;主要成分:天然大豆蜡、植物精油、水泥罐身、纸质标识贴;气味 / 特点:气味:可选木质调(雪松 / 檀香)、草本调(鼠尾草 / 
尤加利)等中性舒缓香型,特点:水泥罐身带原生肌理质感,搭配黑白极简花纹标识,烛火柔和不刺眼;罐身可重复利用,整体风格低调质朴,契合侘寂美学的调性 http://lf3-static.bytednsdoc.com/obj/eden-cn/lm_sth/ljhwZthlaukjlkulzlp/ark/assistant/images/ad_candle.jpeg` + +## 效果展示 + +系统能够: + +- ✅ 自动解析商品信息并生成营销策略 +- ✅ 创建高质量的视频脚本和分镜 +- ✅ 生成吸引人的营销文案 +- ✅ 制作专业的电商营销视频 +- ✅ 提供视频质量评估和优化 +- ✅ 支持一键发布到多个平台 + +## 常见问题 + +常见问题列表待补充。 + +## 代码许可 + +本项目采用开源许可证,详情请参考项目根目录下的 LICENSE 文件。 diff --git a/python/02-use-cases/12_ad_video_gen_seq/README_en.md b/python/02-use-cases/12_ad_video_gen_seq/README_en.md new file mode 100644 index 0000000..265f807 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/README_en.md @@ -0,0 +1,200 @@ +# E-commerce Marketing Video Generation + +## Overview + +> This project uses VeADK SequentialAgent (serial multi-agent orchestration) to generate e-commerce marketing videos. It builds a stable workflow by chaining “marketing planning / storyboard script / image generation / quality evaluation / video generation / final composition & publishing”. It’s suitable for quickly producing short product showcase videos (e.g., single-product promos, campaign materials). +> +> The example exposes a single Root Agent as the service entry point. Internally, multiple sub-agents execute in a fixed order, which is convenient for local debugging and cloud deployment. + +- This project is derived from `04_ad_video_gen_a2a`, adapted for AgentKit platform deployment. +- This project uses sequential-agent, while the multimedia example uses A2A for agent interaction. +- This project can be deployed on the AgentKit platform. 
+ +## Key Features + +This project provides the following capabilities: + +- **Marketing planning & generation configuration**: based on user input (product name / selling points / asset links), generates the video structure and generation parameters +- **Storyboard script generation**: automatically outputs shot scripts, including visual description, actions, and generation highlights +- **Text-to-image / image-to-image batch generation**: generates multiple candidate first-frame images per shot, with optional reference images +- **Image/video quality evaluation & selection**: scores candidate images/videos and selects the best to reduce trial-and-error cost +- **Text-to-video / first-frame guided video generation**: generates multiple video candidates per shot based on the selected first frame +- **Local composition & TOS upload**: stitches shot videos into a final video locally, then uploads to TOS and returns an accessible URL + +## Agent Capabilities + +The system exposes one Root Agent and orchestrates the following sub-agents in sequence: + +- **Marketing Planning Agent (`market_agent`)**: parses user inputs, fills missing key info, and generates video configuration and shot count requirements +- **Storyboard Agent (`storyboard_agent`)**: produces shot scripts based on the configuration +- **Image Agent (`image_agent`)**: batch-generates candidate first-frame images for each shot +- **Image Evaluation Agent (`image_evaluate_agent`)**: scores and selects the best image per shot +- **Video Agent (`video_agent`)**: generates shot videos from selected images (supports batch generation and multiple candidates) +- **Video Evaluation Agent (`video_evaluate_agent`)**: evaluates and selects the best shot videos +- **Release Agent (`release_agent`)**: stitches selected shot videos into a final output and uploads to TOS, returning a link + +### Cost Notes + +| Related Service | Description | Pricing | +| --- | --- | --- | +| 
[Doubao-Seed-1.6](https://console.volcengine.com/ark/region:ark+cn-beijing/model/detail?Id=doubao-seed-1-6) | Understands user inputs and converts them into tool calls. | [Multiple pricing options](https://www.volcengine.com/docs/82379/1099320) | +| [Doubao-Seedance 1.5 pro](https://console.volcengine.com/ark/region:ark+cn-beijing/model/detail?Id=doubao-seedance-1-5-pro) | Converts images and text descriptions into videos. | [Multiple pricing options](https://www.volcengine.com/docs/82379/1099320) | +| [Doubao-Seedream 4.5 pro](https://console.volcengine.com/ark/region:ark+cn-beijing/model/detail?Id=doubao-seedream-4-5) | Generates images from text or reference images. | [Multiple pricing options](https://www.volcengine.com/docs/82379/1099320) | + +## Run Locally + +### Prerequisites + +Before starting, make sure your environment meets these requirements: + +- Python 3.12 or later +- veadk-python 0.5.5 (see `pyproject.toml`) +- `uv` is recommended for dependency management +- `ffmpeg` available locally (used by `moviepy` for video composition) +- Get Volcengine Ark API KEY +- Get Volcengine AK/SK + +### Quick Start + +Follow these steps to set up and run the project locally. + +#### 1. Clone and install dependencies + +```bash +# Clone the repository +git clone https://github.com/volcengine/agentkit-samples.git +cd agentkit-samples/python/02-use-cases/12_ad_video_gen_seq + +# Install dependencies +uv sync --index-url https://mirrors.aliyun.com/pypi/simple + +# mac or linux +source .venv/bin/activate +# windows powershell +.venv\Scripts\activate +``` + +#### 2. Configure environment variables + +Create `config.yaml` by following `config.yaml.example`, and fill in required secrets (models, AK/SK, TOS bucket, etc.). 
+ +```bash +# Copy the config file +cp config.yaml.example config.yaml +``` + +Key fields in `config.yaml` include: + +- `model.agent.*`: model configuration for text understanding / planning / evaluation +- `model.agent.image.*`: model configuration for image generation +- `model.agent.video.*`: model configuration for video generation +- `volcengine.access_key` / `volcengine.secret_key`: used for TOS upload authentication +- `database.tos.bucket`: bucket name used to store generated videos, images, and other artifacts + - You can set the bucket to `agentkit-platform-{{your_account_id}}` + - Replace `{{your_account_id}}` with your Volcengine account ID + - Example: `DATABASE_TOS_BUCKET=agentkit-platform-12345678901234567890` + +#### 3. Local debugging + +- For local debugging, run `debug.py` to start the service. + + ```bash + python debug.py + ``` + +- Or debug via `veadk web` + + Use `veadk web` for local testing: + + ```bash + veadk web + ``` + +By default it listens on `http://0.0.0.0:8000`. + +#### 4. Debugging tips + +Recommended way to quickly debug the full pipeline locally: + +```bash +python debug.py +``` + +## AgentKit Deployment + +Set related environment variables before deployment: + +```bash +export VOLCENGINE_ACCESS_KEY={your_ak} +export VOLCENGINE_SECRET_KEY={your_sk} +``` + +Deploy to runtime: + +```bash +agentkit config \ + --agent_name multimedia_seq \ + --entry_point main.py \ + --launch_type cloud \ + --runtime_envs DATABASE_TOS_BUCKET=agentkit-platform-{{your_account_id}} \ + --image_tag v1.0.0 + +agentkit launch +``` + +### Technical Details + +At its core, this project is a serial multi-agent workflow built with VeADK. 
The Root Agent orchestrates sub-agents in a fixed sequence to form a stable, reproducible video production pipeline: + +User input → Marketing planning → Storyboard generation → Image generation → Image evaluation → Video generation → Video evaluation → Composition & upload + +## Directory Structure + +```plaintext +/ +├── README.md # Chinese documentation +├── README_en.md # English documentation +├── app/ # Agents and tool implementations +│ ├── root/ # Root orchestration entry (SequentialAgent) +│ ├── market/ # Marketing planning (video config / shot count, etc.) +│ ├── storyboard/ # Storyboard script generation +│ ├── image/ # Image generation and result structuring +│ ├── eval/ # Image/video evaluation and selection +│ ├── video/ # Video generation (supports batch) +│ ├── release/ # Video stitching and upload +│ └── utils.py # URL-code mapping, TOS upload, shared utilities +├── config.yaml.example # Example config +├── debug.py # Local debug script (does not start server) +├── model.py # Agent Model +├── main.py # Local service entry (AgentkitAgentServerApp) +├── pyproject.toml # Dependency management (uv) +└── requirements.txt # Dependency management (pip/uv pip) +``` + +## Example Prompts + +Here are some commonly used prompt examples: + +- `Please generate a Christmas marketing video for chocolate. Product name: Christmas limited dark chocolate gift box. Applicable scenarios and audience: suitable for all chocolate lovers, especially consumers seeking the ultimate Christmas taste, sweet sharing, and energy replenishment; suitable for Christmas afternoon tea, holiday gatherings with friends and family, warm gift-giving, or any moment that needs a festive atmosphere. Main ingredients: selected cocoa beans, pure cocoa butter, premium milk, natural vanilla, no artificial colorants or preservatives, rich in antioxidants. 
Flavor/features: melts in the mouth, silky and rich, intense cocoa aroma, slightly bitter with a sweet aftertaste and a warm holiday-limited finish http://lf3-static.bytednsdoc.com/obj/eden-cn/lm_sth/ljhwZthlaukjlkulzlp/ark/assistant/images/ad_chocolate.png` +- `Please generate a bread marketing video. Product name: milky soft pull-apart toast. Scenarios/audience: scenarios: breakfast pairing, afternoon tea snacks, daily meal replacement; audience: office workers, students, families (bread lovers who prefer soft texture). Main ingredients: high-gluten flour, milk, eggs, butter, yeast, sugar. Flavor/features: rich milky aroma; tastes sweet and smooth when paired with butter + honey; features: soft crumb with even honeycomb pores, toasted crust with slightly charred spots, combining a soft interior and a crispy crust http://lf3-static.bytednsdoc.com/obj/eden-cn/lm_sth/ljhwZthlaukjlkulzlp/ark/assistant/images/ad_bread.jpeg` +- `Generate an e-commerce marketing video from product images for a wabi-sabi style scented candle. Product name: wabi-sabi scented candle. Scenarios/audience: scenarios: living room decor, bedroom sleep aid, study relaxation, minimalist ambience; audience: home decor lovers who like minimalism / wabi-sabi aesthetics, urban professionals seeking a relaxed vibe, fragrance collectors. Main ingredients: natural soy wax, essential oils, cement jar, paper label sticker. 
Scents/features: wood scents (cedar/sandalwood) or herbal scents (sage/eucalyptus), etc.; features: cement jar with raw texture, black-and-white minimalist patterned label; soft candlelight; jar reusable; overall understated and rustic, matching wabi-sabi aesthetics http://lf3-static.bytednsdoc.com/obj/eden-cn/lm_sth/ljhwZthlaukjlkulzlp/ark/assistant/images/ad_candle.jpeg` + +## Demo Output + +The system can: + +- ✅ Automatically parse product information and generate marketing strategy +- ✅ Create high-quality video scripts and storyboards +- ✅ Generate engaging marketing copy +- ✅ Produce professional e-commerce marketing videos +- ✅ Provide video quality evaluation and optimization +- ✅ Support one-click publishing to multiple platforms + +## FAQ + +FAQ list to be added. + +## License + +This project is open-sourced. See the LICENSE file in the repository root for details. + diff --git a/python/02-use-cases/12_ad_video_gen_seq/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/__init__.py new file mode 100644 index 0000000..67771d2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/__init__.py new file mode 100644 index 0000000..b7466c9 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .root import get_root_agent + +root_agent = get_root_agent() diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/eval/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/eval/__init__.py new file mode 100644 index 0000000..645e9a0 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/eval/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .agent import get_eval_agent + +__all__ = ["get_eval_agent"] diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/eval/agent.py b/python/02-use-cases/12_ad_video_gen_seq/app/eval/agent.py new file mode 100644 index 0000000..de99ef8 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/eval/agent.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Literal + +from veadk import Agent + +from app.eval.hook import hook_url_id_mapping +from app.eval.prompt import PROMPT_EVALUATE_AGENT +from app.eval.tools.geval import evaluate_media +from app.model import ArkLlm + + +def get_eval_agent(eval_type: Literal["image", "video"]): + eval_agent = Agent( + name=f"{eval_type}_evaluate_agent", + enable_responses=True, + description="根据用户的需求,评估分镜图片或分镜视频的质量", + instruction=PROMPT_EVALUATE_AGENT, + after_tool_callback=[hook_url_id_mapping], + tools=[evaluate_media], + model_extra_config={ + "extra_body": { + "thinking": {"type": os.getenv("THINKING_EVALUATE_AGENT", "disabled")}, + "caching": { + "type": "disabled", + }, + } + }, + ) + eval_agent.model = ArkLlm( + model=f"{eval_agent.model_provider}/{eval_agent.model_name}", + api_key=eval_agent.model_api_key, + api_base=eval_agent.model_api_base, + **eval_agent.model_extra_config, + ) + return eval_agent diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/eval/hook.py 
b/python/02-use-cases/12_ad_video_gen_seq/app/eval/hook.py new file mode 100644 index 0000000..c04d0a3 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/eval/hook.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Any + +from google.adk.tools import BaseTool, ToolContext +from veadk.utils.logger import get_logger + +logger = get_logger(__name__) + + +def hook_url_id_mapping( + tool: BaseTool, args: dict[str, Any], tool_context: ToolContext, tool_response: Any +) -> Optional[Any]: + """ + Handle evaluation results and generate callback output. 
+ after_tool_callback + """ + agent_name = tool_context.agent_name + tool_name = tool.name + + if tool_name == "evaluate_media": + if agent_name == "image_evaluate_agent": + tool_context.state["cb_agent_state"] = ( + "\n✅首帧图评估生成任务已经完成,继续执行下一步视频生成任务\n" + ) + tool_context.state["cb_agent_output"] = "" + elif agent_name == "video_evaluate_agent": + tool_context.state["cb_agent_state"] = ( + "\n✅视频评估生成任务已经完成,继续执行下一步视频合成任务\n" + ) + tool_context.state["cb_agent_output"] = "" + + return tool_response + + return None diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/eval/prompt.py b/python/02-use-cases/12_ad_video_gen_seq/app/eval/prompt.py new file mode 100644 index 0000000..d0f88dc --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/eval/prompt.py @@ -0,0 +1,117 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PROMPT_EVALUATE_AGENT = """ +# 角色: +你是一位食品饮料行业的电商营销评审 evaluate_agent,对分镜图片和分镜视频进行质量评估。 +## 背景介绍 +你是电商营销视频生成流程的一部分,在你的前一步已经生成了四个分镜,每个分镜有多N个图片/视频, +你的任务是评估每个分镜中每个图片/视频的得分,进而选择出合适的图片/视频作为该分镜的素材(N->1) + +## Notice: +1. 生成内容不要使用单引号、双引号等字符。语音问中文,不要用英文。 +2. 输入输出以及运行过程中,任何涉及图片或视频的code(⌥code格式),不要做任何修改。 + +# 工具: +1. evaluate_media:为图片或视频打分。 + +# 任务描述: +你作为 evaluate_agent,可能会收到用户的两种不同任务:图片评分任务和视频评分任务。 +但他们本质上没有区别,都是需要你输入分镜信息进行评估。 +至于如何确定输入是图片还是视频,你根据你的名称来: + - 如果你叫做`image_eval_agent`那么你做的就是视频评估任务 + - 如果你叫做`video_eval_agent`那么你做的就是图片评估任务 + + +# 注意事项: +1. 
即使每个分镜只有一张图片/视频,你也要进行同样的处理逻辑,因为评分也很重要 +2. 你只需识别用户请求的是哪种任务,然后调用 `evaluate_media` 工具,根据 `evaluate_media` 工具返回的评估结果返回给用户。 +3. 输入输出中,任何涉及图片或视频的code(⌥code格式),不要做任何修改。 + +# 输出要求 +请你按照markdown格式输出,输出内容务必精简 + +## 输出字段说明 +- score:得分,范围为0~1分,保留两位小数 +- reason:评分理由,综合了美学、画质、一致性三个维度进行点评,具体的理由写法请根据工具返回结果来填写 +- code:图片/视频的 code(⌥code格式) +## 输出模板 +```markdown +## 图片/视频评估 + +### 评估结果 + +分镜1: +- 图片/视频1(「code」): 得分: 「score」, 理由:「reason」 +- 图片/视频2(「code」): 得分: 「score」, 理由: 「reason」 +// 注意,这里一定要输出`\n`分割,下面同理 +分镜2: +- 图片/视频1(「code」): 得分: 「score」, 理由: 「reason」 +- 图片/视频2(「code」): 得分: 「score」, 理由: 「reason」 + +分镜3: +- 图片/视频1(「code」): 得分: 「score」, 理由: 「reason」 +- 图片/视频2(「code」): 得分: 「score」, 理由: 「reason」 + +分镜4: +- 图片/视频1(「code」): 得分: 「score」, 理由: 「reason」 +- 图片/视频2(「code」): 得分: 「score」, 理由: 「reason」 + +### 选择结果 +根据评估结果,我们选取得分最高的「图片/视频」作为该分镜的素材。 + +| 分镜 | 选择的图片/视频code| 得分 | +| ---- | ----------------- | ----- | +| 分镜1 | 「图片/视频code」 | 「score」 | +| 分镜2 | 「图片/视频code」 | 「score」 | +| 分镜3 | 「图片/视频code」 | 「score」 | +| 分镜4 | 「图片/视频code」 | 「score」 | +``` + +# 注意 +1. 具体是图片还是视频,根据实际情况来 +3. 如果评分相同,默认选取编号靠前的那个,禁止两个都选这种情况发生 +""" + + +PROMPT_EVALUATE_ITEM_AGENT = """ +### 任务说明 +根据用户的需求,评估分镜图片或分镜视频的质量。 +### 背景介绍 +你是一个电商产品营销系统中的一部分,属于评估系统的核心,你的任务是完成对输入内容(可能是图片可能是视频)的评估。 +### 输入要求 +用户将会提供给你一个输入,输入包含两部分:`生成图片或视频列表`和`参考图片`,你需要对输入的图片进行点评 + +### 输出要求 +你的输出应该是一个json,包括三个部分 +```json +{ + "shot_id": "镜头编号", + "media_id": "媒体编号", + "reason": "评分理由,综合了美学、画质、一致性三个维度进行点评,具体的理由写法庆参考下文`理由要点`部分"(要求全程中文,包括标点符号也是中文), + "scores": "综合评分,综合了美学、画质、一致性三个维度进行评分", 评分范围为0~1分,保留两位小数 +} +``` +### 理由要点 +1. 一致性评估,用于评估生成的图像或视频与参考图像或视频的一致性。 +2. 美学评估,用于评估图像或视频的美学质量。 +3. 
画质评估,用于评估图像或视频的画质质量。 +针对提供的图像/视频,按以下要求完成多维度评估分析,输出需分模块呈现: +美学评分解释:从构图平衡度、色彩搭配(冷暖对比 / 和谐度 / 艺术感)、光影表现(通透感 / 细节还原 / 氛围营造)、创意突破性、情感共鸣深度等维度,分析图像的美学表现,说明其对应评分的合理性,明确是否处于高分段及核心原因; +画质评分解释:从色彩与光影(饱和度 / 层次感 / 真实性)、细节呈现(清晰度 / 锐度 / 微观纹理还原)、构图与质感(主体布局 / 背景协调性 / 材质区分度)、视觉完整性(无噪点 / 无失真 / 元素融合度)等维度,结合技术层面(如分辨率、光影合理性)分析画质优势,说明与高画质评分的逻辑一致性(若涉及具体模型,需关联模型名称); +一致性评估(仅对有参考图片的):对比生成图像与参考图像的关键视觉元素(瓶身造型、包装标签 / Logo、背景场景、主体摆放形式、核心视觉特征),给出一致性评分(精确到小数点后 1 位),并解释评分依据(关联关键元素差异与关联度); +各模块分析需紧扣评分逻辑,既说明优势维度,也指出不足(若有),语言需专业且贴合视觉审美与技术评估场景,模块间用分号分隔。 +注意:评估的原因部分,请全部使用中文,包括标点符号也要是中文版的。 +返回的三类评分,中间用\n换行符分割。 +""" diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/eval/schema.py b/python/02-use-cases/12_ad_video_gen_seq/app/eval/schema.py new file mode 100644 index 0000000..701bce4 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/eval/schema.py @@ -0,0 +1,94 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +from google.genai import types +from pydantic import BaseModel, Field + +json_response_config = types.GenerateContentConfig( + response_mime_type="application/json", max_output_tokens=18000 +) + +max_output_tokens_config = types.GenerateContentConfig(max_output_tokens=18000) + + +class Status(BaseModel): + """A status.""" + + success: bool = Field(description="如果结果成功则为True,否则为False") + message: str = Field(description="运行成功该字段为空,否则为错误信息") + + +class ImageItem(BaseModel): + """An image.""" + + id: int = Field(description="The shot id of the image") + code: str = Field(description="The code of the image") + score: float = Field(description="The score of the image") + reason: str = Field(description="The reason for the score") + + +class Image(BaseModel): + """Image list for a shot.""" + + shot_id: str = Field(description="The shot id") + prompt: str = Field(description="The description for generating image") + action: str = Field(description="The description for generating videos") + reference: str = Field(description="The reference url for the shot") + images: list[ImageItem] = Field(description="The list of images") + + +class ScoredImageList(BaseModel): + """Image list.""" + + scored_image_list: list[Image] = Field(description="The list of images") + status: Optional[Status] = Field(description="The status of the result") + + +class VideoItem(BaseModel): + """A video.""" + + id: int = Field(description="The shot id of the video") + code: str = Field(description="The code of the video") + score: float = Field(description="The score of the video") + reason: str = Field(description="The reason for the score") + + +class Video(BaseModel): + """Video list for a shot.""" + + shot_id: str = Field(description="The shot id") + prompt: str = Field(description="The description for generating image") + action: str = Field(description="The description for generating videos") + reference: str = Field(description="The reference url for the 
shot") + videos: list[VideoItem] = Field(description="The list of videos") + + +class ScoredVideoList(BaseModel): + """Video list.""" + + scored_video_list: list[Video] = Field(description="The list of videos") + status: Optional[Status] = Field(description="The status of the result") + + +class EvaluationResult(BaseModel): + shot_id: str = Field(..., description="镜头编号") + media_id: str = Field(..., description="媒体编号") + reason: str = Field(..., description="评分理由") + scores: float = Field(..., description="综合评分") + + +class EvaluationList(BaseModel): + evaluation: EvaluationResult = Field(..., description="评估结果列表") diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/eval/tools/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/eval/tools/__init__.py new file mode 100644 index 0000000..67771d2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/eval/tools/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/eval/tools/geval.py b/python/02-use-cases/12_ad_video_gen_seq/app/eval/tools/geval.py new file mode 100644 index 0000000..7b8f4b7 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/eval/tools/geval.py @@ -0,0 +1,285 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import json +import os +from typing import Any + +from openai import AsyncOpenAI +from veadk.utils.logger import get_logger + +from app.eval.prompt import PROMPT_EVALUATE_ITEM_AGENT +from app.eval.schema import EvaluationList, ScoredImageList, ScoredVideoList +from app.utils import url_shortener + +logger = get_logger(__name__) + +evaluate_agent_instruction = PROMPT_EVALUATE_ITEM_AGENT + + +def resolve_code2url(code: str) -> str: + # return media_url + return url_shortener.code2url(code) + + +async def repair_evaluate_input( + media_list: list[dict[str, Any]], media_type: str = "image" +) -> list[list[dict[str, Any]]]: + if media_type == "image": + MEDIA_URL_FIELD = "image_url" + MEDIA_TYPE_FIELD = "input_image" + MEDIA = "图片" + else: + MEDIA_URL_FIELD = "video_url" + MEDIA_TYPE_FIELD = "input_video" + MEDIA = "视频" + result = [] + for shot in media_list: + shot_id = shot.get("shot_id", "") + reference_media_list = shot.get("reference", []) + if isinstance(reference_media_list, str): + reference_media_list = [reference_media_list] + media_url_list = [image["code"] for image in shot.get("media", [])] + # First, construct the reference, which is common within the same shot + reference_part_list = [] + for reference_media in reference_media_list: + if len(reference_media.strip()) == 0: + continue + + reference_part = { + "type": "input_image", + "image_url": reference_media, + } # Only images will be 
referenced + reference_part_list.append(reference_part) + + for i, media_url in enumerate(media_url_list): + resolved_media_url = resolve_code2url(media_url) + + text_part = { + "type": "input_text", + "text": ( + f"本次{MEDIA}的shot_id={shot_id}, media_id={i},你一共收到{len(reference_media_list) + 1}份媒体素材,其中第1条{MEDIA}是你需要评价的{MEDIA}" + + ( + f", 后续的共{len(reference_media_list)}张图片均为参考图片。" + if len(reference_media_list) > 0 + else "" + ) + + "请按照要求对媒体素材进行评价并输出符合要求的结果。" + ), + } + + user_prompt = {"role": "user", "content": []} + media_part = {"type": MEDIA_TYPE_FIELD, MEDIA_URL_FIELD: resolved_media_url} + user_prompt["content"] = [text_part] + [media_part] + reference_part_list + + result.append(user_prompt) + + return result + + +async def evaluate_media( + media_list: list[dict[str, Any]], media_type: str = "image" +) -> dict: + """ + Evaluate a list of storyboard shots, each containing multiple generated media items, + and return a score list and reasoning for each shot. + + This tool is designed to perform qualitative or model-based evaluation of + storyboard media (e.g., generated images or videos from prompts or diffusion models) + based on visual quality, temporal consistency, and coherence with reference materials. + + Each element in `media_list` represents one storyboard shot and includes its + metadata, descriptive text, and a list of generated media for evaluation. + + Args: + media_list (List[Dict[str, Any]]): + A list of storyboard shot data. Each shot should include: + + - **shot_id** (str): The unique identifier for the storyboard shot. + - **prompt** (str): A detailed text description used to generate the media. + - **action** (str): The visual or narrative action happening in this shot. + - **reference** (str): A reference media URL (optional), used as visual guidance. + - **media** (List[Dict[str, Any]]): The list of generated media items for this shot, + each containing: + - **id** (int): The media ID.
+ - **code** (str): The code of the generated media (image or video), eg ⌥00001. + media_type (str): The type of media to be evaluated. Defaults to "image", only in ["image", "video"]. + Returns: + List[Dict[str, Any]]: A list of evaluation results, one per shot. + Each result includes the shot list: + - **shot_id** (str): The ID of the evaluated shot. + - **scores** (List[float]): A list of evaluation scores (one per media item) + indicating visual or semantic quality. + - **reason** (str): A textual explanation summarizing the evaluation, + such as prompt alignment, visual coherence, or artistic quality. + Example: + evaluate_media([ + ... { + ... "shot_id": "shot_1", + ... "prompt": "A samurai walking through cherry blossoms at sunset", + ... "action": "Character slowly moves from left to right", + ... "reference": "https://example.com/ref1.png", + ... "media": [ + ... {"id": 1, "code": "⌥00001"}, + ... {"id": 2, "code": "⌥00001"} + ... ] + ... } + ... ]) + """ + logger.debug(f"Start to evaluate {media_type} list: items={len(media_list)}") + m_content = await repair_evaluate_input(media_list, media_type=media_type) + logger.debug(f"Repaired {media_type} list: messages={len(m_content)}") + logger.info(f"media_list: \n\n {media_list} \n\n") + client = AsyncOpenAI( + base_url=os.getenv("MODEL_AGENT_API_BASE"), + api_key=os.getenv("MODEL_AGENT_API_KEY"), + ) + + async def process_message(msg): + response = await client.responses.create( + model=os.getenv("MODEL_EVALUATE_NAME", "doubao-seed-1-6-251015"), + instructions=evaluate_agent_instruction, + input=[msg], + text={ + "format": { + "type": "json_schema", + "name": "EvaluationList", + "schema": EvaluationList.model_json_schema(), + "strict": True, + } + }, + extra_body={"thinking": {"type": "disabled"}}, + ) + return json.loads(response.output_text).get("evaluation", {}) + + # Use asyncio.gather to process all messages concurrently + result = await asyncio.gather(*(process_message(msg) for msg in m_content)) + + 
logger.debug(f"Finish to evaluate {media_type} list: result_items={len(result)}") + # Post-processing: Merge results by shot_id and ensure the order of media_id + merged_result = {} + for item in result: + shot_id = item.get("shot_id") + media_id = int(item.get("media_id", 0)) + + if shot_id not in merged_result: + merged_result[shot_id] = { + "shot_id": shot_id, + "items": [], + } + merged_result[shot_id]["items"].append( + (media_id, item.get("scores"), item.get("reason")) + ) + + final_result = [] + for shot_id, data in merged_result.items(): + sorted_items = sorted(data["items"], key=lambda x: x[0]) + + scores = [item[1] for item in sorted_items] + reason = [item[2] for item in sorted_items] + + final_result.append({"shot_id": shot_id, "scores": scores, "reason": reason}) + + logger.debug( + f"Finish to evaluate {media_type} list: final_result_items={len(final_result)}" + ) + + # Processing return values: directly construct ScoredImageList / ScoredVideoList and convert to a dictionary + # Index the original input by shot_id to facilitate supplementing metadata + shot_index = {shot.get("shot_id", ""): shot for shot in media_list} + + def normalize_reference(ref_val): + if isinstance(ref_val, list): + return ",".join(ref_val) + return ref_val or "" + + # Assemble the corresponding output structure according to the media type + if media_type == "image": + scored_image_list = [] + for shot_id, data in merged_result.items(): + shot = shot_index.get(shot_id, {}) + media_entries = shot.get("media", []) + # Map the evaluation results to {media_id: (score, reason)} + eval_map = {mi: (score, reason) for mi, score, reason in data["items"]} + + images_items = [] + for idx, media in enumerate(media_entries): + if idx not in eval_map: + continue + score, reason = eval_map[idx] + images_items.append( + { + "id": media.get("id", idx), + "code": media.get("code", ""), + "score": float(score) if score is not None else 0.0, + "reason": reason or "", + } + ) + + image_obj = { + 
"shot_id": shot_id, + "prompt": shot.get("prompt", ""), + "action": shot.get("action", ""), + "reference": normalize_reference(shot.get("reference")), + "images": images_items, + } + scored_image_list.append(image_obj) + + output = { + "scored_image_list": scored_image_list, + "status": {"success": True, "message": ""}, + } + try: + model = ScoredImageList.model_validate(output) + return model.model_dump() + except Exception: + return output + + else: + scored_video_list = [] + for shot_id, data in merged_result.items(): + shot = shot_index.get(shot_id, {}) + media_entries = shot.get("media", []) + eval_map = {mi: (score, reason) for mi, score, reason in data["items"]} + + videos_items = [] + for idx, media in enumerate(media_entries): + if idx not in eval_map: + continue + score, reason = eval_map[idx] + videos_items.append( + { + "id": int(media.get("id", idx)), + "code": media.get("code", ""), + "score": float(score) if score is not None else 0.0, + "reason": reason or "", + } + ) + + video_obj = { + "shot_id": shot_id, + "prompt": shot.get("prompt", ""), + "action": shot.get("action", ""), + "reference": normalize_reference(shot.get("reference")), + "videos": videos_items, + } + scored_video_list.append(video_obj) + + output = { + "scored_video_list": scored_video_list, + "status": {"success": True, "message": ""}, + } + try: + model = ScoredVideoList.model_validate(output) + return model.model_dump() + except Exception: + return output diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/image/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/image/__init__.py new file mode 100644 index 0000000..70330f0 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/image/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .agent import get_image_agent + +__all__ = ["get_image_agent"] diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/image/agent.py b/python/02-use-cases/12_ad_video_gen_seq/app/image/agent.py new file mode 100644 index 0000000..96659a2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/image/agent.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from veadk import Agent + +from veadk.config import getenv + +from app.image.hook import hook_url_id_mapping +from app.image.schema import max_output_tokens_config +from app.image.tools.image_generate_gather import image_generate +from app.image.prompt import PROMPT_IMAGE_AGENT +from app.model import ArkLlm + + +def get_image_agent(): + image_agent = Agent( + name="image_agent", + enable_responses=True, + description="根据分镜脚本,为分镜生成图片", + instruction=PROMPT_IMAGE_AGENT, + tools=[image_generate], + after_tool_callback=[hook_url_id_mapping], # url -> id + generate_content_config=max_output_tokens_config, + model_extra_config={ + "extra_body": { + "thinking": {"type": getenv("THINKING_IMAGE_AGENT", "disabled")}, + "caching": { + "type": "disabled", + }, + } + }, + ) + image_agent.model = ArkLlm( + model=f"{image_agent.model_provider}/{image_agent.model_name}", + api_key=image_agent.model_api_key, + api_base=image_agent.model_api_base, + **image_agent.model_extra_config, + ) + return image_agent diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/image/hook.py b/python/02-use-cases/12_ad_video_gen_seq/app/image/hook.py new file mode 100644 index 0000000..39a3285 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/image/hook.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional, Any + +from google.adk.tools import BaseTool, ToolContext +from veadk.utils.logger import get_logger +from app.utils import url_shortener + +logger = get_logger(__name__) + + +def url_id_mapping(url: str) -> str: + return url_shortener.url2code(original_url=url) + + +def get_callback_agent_output(success_list: list[dict[str, Any]]) -> str: + """ + Get the callback agent output. + """ + url_list = [[], [], [], []] + for data in success_list: + try: + key_str = list(data.keys())[0] + value_str = list(data.values())[0] # url + prefix, item = key_str.split("_image_") + itx = prefix.split("task_")[1] + url_list[int(itx)].append(value_str) + except Exception as e: + logger.error(f"Error in get_callback_agent_output: {e}") + continue + + html_parts = [] + html_parts.append("\n\n### 图片生成结果") + for task_idx, urls in enumerate(url_list): + if not urls: + continue + html_parts.append(f"#### Shot_{task_idx}") + for img_idx, url in enumerate(urls): + html_parts.append(f"**Image_{img_idx + 1}:{url_id_mapping(url)}**") + html_parts.append(f'image') + html_parts.append("") + + return "\n\n".join(html_parts) + + +def hook_url_id_mapping( + tool: BaseTool, args: dict[str, Any], tool_context: ToolContext, tool_response: Any +) -> Optional[Any]: + """ + Shorten the URL. 
+ after_tool_callback + """ + tool_name = tool.name + if tool_name == "image_generate": + success_list = tool_response["success_list"] + + tool_context.state["cb_agent_state"] = ( + "\n✅首帧图生成任务已经完成,继续执行首帧图评估工作\n" + ) + tool_context.state["cb_agent_output"] = get_callback_agent_output(success_list) + for data in success_list: + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, str): + value = url_id_mapping(url=value) + data[key] = value + logger.debug(f"Shorten URL of `image_generate` successfully: {success_list}") + return tool_response + return None diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/image/prompt.py b/python/02-use-cases/12_ad_video_gen_seq/app/image/prompt.py new file mode 100644 index 0000000..6c68e1f --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/image/prompt.py @@ -0,0 +1,105 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +PROMPT_IMAGE_AGENT = """ +# 角色定位 +你是一个食品饮料行业的电商营销分镜图片生成器,生成电商营销分镜图片 + +## 背景信息 +你是电商营销视频生成流程的一部分,由于视频生成需要生成首帧图片,因此需要你来执行生成首帧图片的任务 +在你执行之前,已经执行完了营销策划生成、分镜脚本生成的任务,并且你已经收到了分镜脚本。 +在分镜脚本中描述了四个分镜的各种信息,你需要根据这些信息来调用工具生成具体的首帧图片。 +具体来说,在你的历史对话中`market_agent`生成了营销策划方案,其中的`相关配置`章节部分包含了分辨率和每个分镜生成图片的数量,你必须严格参考。 + +# 任务和要求 +1. 根据分镜脚本中的图片描述字段,生成更详细的图片描述,包括物体、颜色、背景等 +2. reference 字段,作为图片生成的参考图 +3. 调用图片生成工具,生成图片,每个分镜需要生成若干个图片,具体每个分镜的图片的数量由`market_agent`告知,以供用户进行选择。 +4. 
不同分镜作为单独的 task,组成 task 列表,调用一次图片生成工具,不要一个分镜调用一次绘图工具 +5. 生成多图时,数量在 max_images 中指定 +6. image_generate 工具的 prompt 字段中,严格禁止出现`生成x张图片这样的字段`,这样会导致`一张图片`变成`一张X宫格图片`,而非给你四张图片。 +7. 当遇到 Agent 执行异常,如缺少内容,运行出错,结果不完整,用户输入内容不足以完成任务时,请在 status 字段中反馈,而不是在业务字段中反馈描述,如有上述问题,业务字段可以为空。只反馈错误即可 + +# 输出规范 +请输出 markdown 文本,参考模板如下(被「」括号括起来的内容是你需要填写的部分): +## 输出字段说明(注意:这段是给你了解明确的,不是让你输出给用户的!) +- shot_id:分镜的唯一标识,使用 shot_X 即可 +- prompt:如何生成分镜图片的详细描述(禁止在这里描述任何`带有文字内容的促销视觉元素`) +- action:如何生成分镜视频的详细描述(禁止在这里描述任何`带有文字内容的促销视觉元素`) +- reference:作为图片生成的参考图 +- images:每个分镜里的图片列表,图片生成工具返回 + - id:图片 id + - code:图片 url + + +## 输出模板 +请按照以下的模板进行输出: + +```markdown +## 分镜首帧图生成 + +### 分镜1 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选图编号**: // 具体数量请参考实际情况 + - 「image_code_1」 + - 「image_code_2」 + - 「image_code_3」 + - 「image_code_4」 + + +### 分镜2 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选图编号**: // 具体数量请参考实际情况 + - 「image_code_1」 + - 「image_code_2」 + - 「image_code_3」 + - 「image_code_4」 + +### 分镜3 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选图编号**: // 具体数量请参考实际情况 + - 「image_code_1」 + - 「image_code_2」 + - 「image_code_3」 + - 「image_code_4」 + +### 分镜4 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选图编号**: // 具体数量请参考实际情况 + - 「image_code_1」 + - 「image_code_2」 + - 「image_code_3」 + - 「image_code_4」 +``` + +# 注意事项 +1. 生成内容不要使用单引号、双引号等字符。语言默认使用中文,不要用英文。 +2. 输入输出以及运行过程中,任何涉及图片或视频的链接 url,不要做任何修改。 +3. 图片风格方面,只要推荐的东西跟动画无关,你就禁止在图片生成工具中提到任何跟动画风格有关的任何内容。 +4. 如果用户的输入不符合要求,或执行过程出现意外,请及时返回错误提示,而不是蛮干 +5. 
【‼️重要】候选图code由图片生成工具提供,该code应该是一个以⌥开头的字符串,包括⌥总长度为6位,形如`⌥Az12K`,请勿丢弃⌥符号,否则无法识别。 +""" diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/image/schema.py b/python/02-use-cases/12_ad_video_gen_seq/app/image/schema.py new file mode 100644 index 0000000..0bac1c1 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/image/schema.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from google.genai import types + + +max_output_tokens_config = types.GenerateContentConfig(max_output_tokens=18000) diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/image/tools/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/image/tools/__init__.py new file mode 100644 index 0000000..67771d2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/image/tools/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/image/tools/image_generate_gather.py b/python/02-use-cases/12_ad_video_gen_seq/app/image/tools/image_generate_gather.py new file mode 100644 index 0000000..c674f43 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/image/tools/image_generate_gather.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import Dict + +from google.adk.tools import ToolContext +from veadk.tools.builtin_tools.image_generate import ( + image_generate as image_generate_builtin, +) +from veadk.utils.logger import get_logger + +logger = get_logger(__name__) + + +async def image_generate(tasks: list[dict], tool_context: ToolContext) -> Dict: + """Generate images with Seedream 4.5. + + Commit batch image generation requests via tasks. + + Args: + tasks (list[dict]): + A list of image-generation tasks. Each task is a dict. + Per-task schema + --------------- + Required: + - task_type (str): + One of: + * "multi_image_to_group" # 多图生组图 + * "single_image_to_group" # 单图生组图 + * "text_to_group" # 文生组图 + * "multi_image_to_single" # 多图生单图 + * "single_image_to_single" # 单图生单图 + * "text_to_single" # 文生单图 + - prompt (str) + Text description of the desired image(s). 
中文/English 均可。 + 注意:这里禁止在prompt字段输入类似:`生成x张图片`这样的描述,请使用 `max_images` 字段来控制生成的图片数量。 + Optional: + - size (str) + 指定生成图像的大小: + - 1:1 → 2048x2048 + - 4:3 → 2384x1728 + - 3:4 → 1728x2304 + - 16:9 → 2560x1440 + - 9:16 → 1440x2560 + 默认值: "2048x2048" + - watermark (bool) + Add watermark. Default: true. + - image (str | list[str]) # (**对应reference字段**) + Reference image(s) as URL or Base64. + * 生成“单图”的任务:传入 string(exactly 1 image)。 + * 生成“组图”的任务:传入 array(2–10 images)。 + - sequential_image_generation (str) + 控制是否生成“组图”。Default: "disabled". + * 若要生成组图:必须设为 "auto"。 + - max_images (int) + 仅当生成组图时生效。控制模型能生成的张数。 + Model 行为说明(如何由参数推断模式) + --------------------------------- + 1) 文生单图: 不提供 image 且 (S 未设置或 S="disabled") → 1 张图。 + 2) 文生组图: 不提供 image 且 S="auto" → 组图,数量由 max_images 控制。 + 3) 单图生单图: image=string 且 (S 未设置或 S="disabled") → 1 张图。 + 4) 单图生组图: image=string 且 S="auto" → 组图,数量 ≤14。 + 5) 多图生单图: image=array (2–10) 且 (S 未设置或 S="disabled") → 1 张图。 + 6) 多图生组图: image=array (2–10) 且 S="auto" → 组图,需满足总数 ≤15。 + 返回结果 + -------- + Dict with generation summary. 
+ Example: + { + "status": "success", + "success_list": [ + {"image_name": "url"} + ], + "error_list": ["image_name"] + } + Notes: + - 组图任务必须 sequential_image_generation="auto"。 + - size 推荐使用 2048x2048 或表格里的标准比例,确保生成质量。 + """ + logger.debug(f"image_generate_gather tasks: {tasks}") + new_tasks = [] + task_origin_info = [] # Stores (original_task_index, sub_index_within_group) + + for original_idx, task in enumerate(tasks): + task_type = task.get("task_type", "") + is_group_task = task_type in { + "single_image_to_group", + "text_to_group", + "multi_image_to_group", + } + + if is_group_task: + num_images = task.get("max_images", 1) + base_task_type = task_type.replace("_group", "_single") + for i in range(num_images): + new_task = task.copy() + new_task["task_type"] = base_task_type + new_task.pop("sequential_image_generation", None) + new_task.pop("max_images", None) + new_tasks.append(new_task) + task_origin_info.append((original_idx, i)) + else: + new_tasks.append(task.copy()) + task_origin_info.append((original_idx, 0)) + + for task in new_tasks: + # 规避prompt中包含"张图片"的情况,这种情况会导致单图变成四宫格或者六宫格之类的图片 + if "prompt" in task and isinstance(task["prompt"], str): + # 匹配阿拉伯数字和中文数字 + task["prompt"] = re.sub( + r"[\d一二三四五六七八九十百千万]+张图片", "图片", task["prompt"] + ) + task["watermark"] = False + + # Handling the reference field: The model often incorrectly uses reference instead of image + # Priority: image > reference + if "reference" in task: + if "image" not in task or not task.get("image"): + task["image"] = task["reference"] + task.pop("reference", None) + + aspect_ratio_map = { + "1:1": "2048x2048", + "4:3": "2384x1728", + "3:4": "1728x2304", + "16:9": "2560x1440", + "9:16": "1440x2560", + "3:2": "2496x1664", + "2:3": "1664x2496", + "21:9": "3024x1296", + } + if "size" in task and task["size"] in aspect_ratio_map: + task["size"] = aspect_ratio_map[task["size"]] + + # Call the underlying image_generate function with the flattened list of tasks + 
logger.debug(f"image_generate_gather new_tasks: {new_tasks}") + raw_result = await image_generate_builtin(new_tasks, tool_context) + logger.debug(f"image_generate_gather raw_result: {raw_result}") + + # Remap the results to match the original task structure + remapped_success = [] + remapped_errors = set() + + for success_item in raw_result.get("success_list", []): + for key, url in success_item.items(): + # Key is like 'task_{idx}_image_{i}' + match = re.match(r"task_(\d+)_image_(\d+)", key) + if not match: + continue + + new_task_idx = int(match.group(1)) + if new_task_idx >= len(task_origin_info): + continue + + original_idx, original_sub_idx = task_origin_info[new_task_idx] + new_key = f"task_{original_idx}_image_{original_sub_idx}" + remapped_success.append({new_key: url}) + + for error_item in raw_result.get("error_list", []): + # Error item is like 'task_{idx}' + match = re.match(r"task_(\d+)", error_item) + if match: + new_task_idx = int(match.group(1)) + if new_task_idx < len(task_origin_info): + original_idx, _ = task_origin_info[new_task_idx] + remapped_errors.add(f"task_{original_idx}") + else: + remapped_errors.add(error_item) # Keep original error if mapping fails + else: + remapped_errors.add(error_item) + logger.debug(f"image_generate_gather remapped_success: {remapped_success}") + logger.debug(f"image_generate_gather remapped_errors: {remapped_errors}") + + result = { + "status": raw_result.get("status"), + "success_list": remapped_success, + "error_list": list(remapped_errors), + } + return result diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/market/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/market/__init__.py new file mode 100644 index 0000000..4ccf04c --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/market/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .agent import get_market_agent + +__all__ = ["get_market_agent"] diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/market/agent.py b/python/02-use-cases/12_ad_video_gen_seq/app/market/agent.py new file mode 100644 index 0000000..ca61a94 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/market/agent.py @@ -0,0 +1,48 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from veadk import Agent +from veadk.config import getenv +from veadk.tools.builtin_tools.web_search import web_search + +from app.market.hook import hook_input_urls, hook_inline_data_transform +from app.market.prompt import PROMPT_MARKET_AGENT + +from app.model import ArkLlm + + +def get_market_agent(): + market_agent = Agent( + name="market_agent", + enable_responses=True, # Enable responses to facilitate image understanding + description="根据用户的需求,生成视频配置脚本", + instruction=PROMPT_MARKET_AGENT, + tools=[web_search], + before_agent_callback=[hook_inline_data_transform], + before_model_callback=[hook_input_urls], + model_extra_config={ + "extra_body": { + "thinking": {"type": getenv("THINKING_MARKET_AGENT", "disabled")}, + "caching": { + "type": "disabled", + }, + } + }, + ) + market_agent.model = ArkLlm( + model=f"{market_agent.model_provider}/{market_agent.model_name}", + api_key=market_agent.model_api_key, + api_base=market_agent.model_api_base, + **market_agent.model_extra_config, + ) + return market_agent diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/market/hook.py b/python/02-use-cases/12_ad_video_gen_seq/app/market/hook.py new file mode 100644 index 0000000..63a3d26 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/market/hook.py @@ -0,0 +1,271 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +import os +import re +import ipaddress +from typing import Optional, Tuple, List, Dict +import requests + +from google.adk.agents.callback_context import CallbackContext +from google.adk.agents.run_config import StreamingMode +from google.adk.models.llm_request import LlmRequest +from google.adk.models.llm_response import LlmResponse +from google.genai import types + +from app.utils import upload_file_to_tos + + +def is_internal_ip(hostname: str) -> bool: + """ + 检查主机名是否为内网IP地址(防止SSRF攻击) + 参数: + hostname: 主机名或IP地址 + 返回: + bool: 如果是内网IP返回True,否则返回False + """ + try: + ip = ipaddress.ip_address(hostname) + return ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved + except ValueError: + return False + + +def get_url_mime_type(url: str) -> Optional[str]: + """ + 获取URL的MIME类型 + 参数: + url: 要检查的URL + 返回: + Optional[str]: MIME类型,如果不是图片或获取失败返回None + """ + extension_to_mime = { + "jpg": "image/jpeg", + "jpeg": "image/jpeg", + "png": "image/png", + "gif": "image/gif", + "webp": "image/webp", + "bmp": "image/bmp", + "svg": "image/svg+xml", + "tiff": "image/tiff", + "tif": "image/tiff", + "ico": "image/x-icon", + } + + try: + from urllib.parse import urlparse, unquote + + parsed = urlparse(url) + path = unquote(parsed.path) + + extension = path.split(".")[-1].lower() if "." 
in path else "" + if extension in extension_to_mime: + return extension_to_mime[extension] + + response = requests.head(url, timeout=5, allow_redirects=True) + content_type = response.headers.get("Content-Type", "") + + if content_type: + mime_type = content_type.split(";")[0].strip().lower() + image_mime_types = [ + "image/jpeg", + "image/png", + "image/gif", + "image/webp", + "image/bmp", + "image/svg+xml", + "image/tiff", + "image/x-icon", + ] + if mime_type in image_mime_types: + return mime_type + return None + except Exception: + return None + + +def is_safe_url(url: str) -> bool: + """ + 检查URL是否安全(非内网IP) + 参数: + url: 要检查的URL + 返回: + bool: 如果URL安全返回True,否则返回False + """ + try: + from urllib.parse import urlparse + + parsed = urlparse(url) + hostname = parsed.hostname + + if not hostname: + return False + + return not is_internal_ip(hostname) + except Exception: + return False + + +def process_urls_with_mime_types(text: str) -> Tuple[List[Dict[str, str]], str]: + """ + 处理文本中的URL,提取图片类型的URL并修改文本 + 参数: + text: 原始文本 + 返回: + Tuple[List[Dict[str, str]], str]: + - URL列表,每个item包含url和mime_type + - 修改后的文本(在URL后添加"(图片x)"标记) + """ + if not isinstance(text, str) or text.strip() == "": + return [], text + + url_start_pattern = re.compile(r"https?://", re.IGNORECASE) + + urls = [] + for match in url_start_pattern.finditer(text): + start_pos = match.start() + url_pattern = re.compile( + r"https?://" + r"(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+(?:[a-zA-Z]{2,6}\.?|[a-zA-Z0-9-]{2,}\.?)" + r"(?::\d+)?" 
+ r"(?:/[a-zA-Z0-9\-._~%!$&\'()*+,;=:@/]*|/%[0-9A-Fa-f]{2})*" + r"(?:\?[a-zA-Z0-9\-._~%!$&\'()*+,;=:@/?%]*)?", + re.IGNORECASE, + ) + url_match = url_pattern.match(text, start_pos) + + if url_match: + url = url_match.group() + if url not in urls: + urls.append(url) + + image_urls = [] + modified_text = text + image_idx = 0 + + for url in urls: + if not is_safe_url(url): + continue + + mime_type = get_url_mime_type(url) + if mime_type: + image_idx += 1 + image_urls.append({"url": url, "mime_type": mime_type}) + modified_text = modified_text.replace(url, f"{url} (图片{image_idx})") + else: + modified_text = modified_text.replace(url, f"{url} (识别为非图片)") + + return image_urls, modified_text + + +def hook_inline_data_transform( + callback_context: CallbackContext, +) -> Optional[types.Content]: + user_content = callback_context.user_content + new_parts = [] + image_idx = 0 + + for part in user_content.parts: + if part.text: + new_parts.append( + types.Part( + text=part.text, + ) + ) + if part.inline_data: + with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as tmp_file: + tmp_file.write(part.inline_data.data) + tmp_file_path = tmp_file.name + + try: + file_uri = upload_file_to_tos(tmp_file_path) + if file_uri: + image_idx += 1 + new_parts.append( + types.Part( + text=f"图片URL: {file_uri}", + ) + ) + + finally: + if os.path.exists(tmp_file_path): + os.unlink(tmp_file_path) + + user_content.parts = new_parts + + +def hook_input_urls( + callback_context: CallbackContext, llm_request: LlmRequest +) -> Optional[LlmResponse]: + callback_context.state["cb_agent_state"] = ( + "\n✅营销策略分析完成,继续执行分镜设计\n" + ) + # before_agent_callback + if callback_context.agent_name == "market_agent": + new_parts = [] + # user_content = callback_context.user_content + if len(llm_request.contents) > 0: + for part in llm_request.contents[0].parts: + if part.text: + url_list, new_text = process_urls_with_mime_types(part.text) + new_parts.append( + types.Part( + text=new_text, + ) + ) + for url 
in url_list: + new_parts.append( + types.Part( + file_data=types.FileData( + mime_type=url["mime_type"], file_uri=url["url"] + ) + ) + ) + llm_request.contents[0].parts = new_parts + + # 查看图片数量是否超出要求 + image_parts_count = 0 + for part in llm_request.contents[0].parts: + if part.file_data: + image_parts_count += 1 + if part.inline_data: + image_parts_count += 1 + + if image_parts_count > 1: + callback_context.state["end_invocation"] = True + if callback_context.run_config.streaming_mode != StreamingMode.NONE: + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + text="❌检测到您提供的图片数量超过一张,不符合任务逻辑限制,请您重新输入。" + ) + ], + ), + partial=True, + ) + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + text="❌检测到您提供的图片数量超过一张,不符合任务逻辑限制,请您重新输入。" + ) + ], + ) + ) + + return None diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/market/prompt.py b/python/02-use-cases/12_ad_video_gen_seq/app/market/prompt.py new file mode 100644 index 0000000..7073cf3 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/market/prompt.py @@ -0,0 +1,85 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +PROMPT_MARKET_AGENT = """ +# 角色定位 +你是一个资深的电商营销视频策划专家,你将理解用户提供的商品素材,并给出营销建议 +## 背景信息 +你是电商营销生成视频整个流程的第一部分,在你之前有一个预处理,会标记用户提供的素材,包括图片url的识别等工作。 +因此你收到的内容已经是过滤过的了,不需要你在做过滤工作 + +# 任务和要求 +用户会告诉你一些信息,包括他的商品素材和想要投放的平台,请你使用 web_search 工具给出建议。 +你的建议包括以下几个要点: +1. 成片类型建议;并给出理由,并告诉他这个平台的营销特征 +2. 商品卖点解析: +3. 商品适用人群: +4. 分镜策划建议:简略说一下视频画面要怎么展示商品卖点,不超过3个,简要说明重点,不需要有太具体的信息,不要有文字特效 + +# 工具 +- web_search:联网搜索工具 +## 注意事项 +1. 最多使用3次web_search工具!! + +# 用户输入 +用户包括两部分,图片部分和文本部分,你需要理解图片和文本内容,生成相关的营销建议并按照规定输出 + +# 输出规范 +请输出markdown文本,参考模板如下(被「」括号括起来的内容是你需要填写的部分): +## 输出字段说明 +- product_name:商品名称 +- suggest:商品卖点解析,最多3个 +- plan:分镜策划建议,最多3个 +- target_audiences:商品适用人群,最多3个 +- reference_url:参考图片url(如果用户提供了,则只允许使用用户的,如果没提供,则无需此部分) +- resolution: 视频分辨率,例如 1080p、720p、480p 等,默认为720p +- 视频比例:视频比例,支持["9:16","1:1","16:9"],默认为9:16(如用户无指定要求,默认为9:16) +- first_image_generate_number: 首帧图生成数量,默认为2(这里指的是每个分镜生成多少张首帧图,分镜数量固定为4) +- video_generate_number: 视频生成数量,默认为2 (这里指的是每个分镜生成多少个视频,分镜数量固定为4) + +## 输出模板 +```markdown +## 营销策划 + +### 商品信息 +我们将以「product_name」为商品名称的视频,视频内容描述主要为 + +#### 商品卖点解析 +- 「suggest[1]」 +- 「suggest[2]」 // 由你决定,最多3个 + +#### 分镜策划建议 +1. 「plan[1]」 +2. 「plan[2]」 +3. 「plan[3]」 // 由你决定,最多3个 + +#### 商品适用人群 +商品主要目标受众为「target_audiences」。 +商品卖点天然杨梅原料、酸甜清爽口感、国潮复古包装、冰镇饮用解腻解辣 + +### 参考图片 +image + +### 相关配置 +- 图片/视频分辨率:「resolution」 +- 图片/视频比例:「video_ratio」 +- 每个分镜的首帧图生成数量:「first_image_generate_number」 +- 每个分镜的视频生成数量:「video_generate_number」 +``` + +# 注意: +1. 生成内容不要使用单引号、双引号等字符。语言默认使用中文,不要用英文。 +2. 输入输出以及运行过程中,任何涉及图片或视频的链接url,不要做任何修改。 +3. 如果用户的输入不符合要求,或执行过程出现意外,请及时返回错误提示,而不是蛮干 +""" diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/model.py b/python/02-use-cases/12_ad_video_gen_seq/app/model.py new file mode 100644 index 0000000..7d7ce3f --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/model.py @@ -0,0 +1,738 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# adapted from Google ADK models adk-python/blob/main/src/google/adk/models/lite_llm.py at f1f44675e4a86b75e72cfd838efd8a0399f23e24 · google/adk-python + +import base64 +import json +import time +from typing import Any, Dict, Union, AsyncGenerator, Tuple, List, Optional, Literal +from typing_extensions import override + +from google.adk.models import LlmRequest, LlmResponse, Gemini +from google.genai import types +from pydantic import Field, BaseModel +from volcenginesdkarkruntime import AsyncArk +from volcenginesdkarkruntime._streaming import AsyncStream +from volcenginesdkarkruntime.types.responses import ( + Response as ArkTypeResponse, + ResponseStreamEvent, + FunctionToolParam, + ResponseTextConfigParam, + ResponseReasoningItem, + ResponseOutputMessage, + ResponseOutputText, + ResponseFunctionToolCall, + ResponseReasoningSummaryTextDeltaEvent, + ResponseTextDeltaEvent, + ResponseCompletedEvent, +) +from volcenginesdkarkruntime.types.responses.response_input_message_content_list_param import ( + ResponseInputTextParam, + ResponseInputImageParam, + ResponseInputVideoParam, + ResponseInputFileParam, + ResponseInputContentParam, +) +from volcenginesdkarkruntime.types.responses.response_input_param import ( + ResponseInputItemParam, + ResponseFunctionToolCallParam, + EasyInputMessageParam, + FunctionCallOutput, +) + +from veadk.config import settings +from veadk.consts import DEFAULT_VIDEO_MODEL_API_BASE +from 
veadk.utils.logger import get_logger + +logger = get_logger(__name__) + + +_ARK_TEXT_FIELD_TYPES = {"json_object", "json_schema"} + +_FINISH_REASON_MAPPING = { + "incomplete": { + "length": types.FinishReason.MAX_TOKENS, + "content_filter": types.FinishReason.SAFETY, + }, + "completed": { + "other": types.FinishReason.STOP, + }, +} + +ark_supported_fields = [ + "input", + "model", + "stream", + "background", + "include", + "instructions", + "max_output_tokens", + "parallel_tool_calls", + "previous_response_id", + "thinking", + "store", + "caching", + "stream", + "temperature", + "text", + "tool_choice", + "tools", + "top_p", + "max_tool_calls", + "expire_at", + "extra_headers", + "extra_query", + "extra_body", + "timeout", + "reasoning" + # auth params + "api_key", + "api_base", +] + + +def _to_ark_role(role: Optional[str]) -> Literal["user", "assistant"]: + if role in ["model", "assistant"]: + return "assistant" + return "user" + + +def _safe_json_serialize(obj) -> str: + try: + return json.dumps(obj, ensure_ascii=False) + except (TypeError, OverflowError): + return str(obj) + + +def _schema_to_dict(schema: types.Schema | dict[str, Any]) -> dict: + schema_dict = ( + schema.model_dump(exclude_none=True) + if isinstance(schema, types.Schema) + else dict(schema) + ) + enum_values = schema_dict.get("enum") + if isinstance(enum_values, (list, tuple)): + schema_dict["enum"] = [value for value in enum_values if value is not None] + + if "type" in schema_dict and schema_dict["type"] is not None: + t = schema_dict["type"] + schema_dict["type"] = (t.value if isinstance(t, types.Type) else str(t)).lower() + + if "items" in schema_dict: + items = schema_dict["items"] + schema_dict["items"] = ( + _schema_to_dict(items) if isinstance(items, (types.Schema, dict)) else items + ) + + if "properties" in schema_dict: + new_props = {} + for key, value in schema_dict["properties"].items(): + if isinstance(value, (types.Schema, dict)): + new_props[key] = _schema_to_dict(value) + else: 
+ new_props[key] = value + schema_dict["properties"] = new_props + + return schema_dict + + +# ----------------------------------------------------------------- +# inputs param transform ------------------------------------------ +def _file_data_to_content_param( + part: types.Part, +) -> ResponseInputContentParam: + file_uri = part.file_data.file_uri + mime_type = part.file_data.mime_type + fps = 1.0 + if getattr(part, "video_metadata", None): + video_metadata = part.video_metadata + if isinstance(video_metadata, dict): + fps = video_metadata.get("fps") + else: + fps = getattr(video_metadata, "fps", 1) + + is_file_id = file_uri.startswith("file_id://") + value = file_uri[10:] if is_file_id else file_uri + # video + if mime_type.startswith("video/"): + param = {"file_id": value} if is_file_id else {"video_url": value} + if fps is not None: + param["fps"] = fps + return ResponseInputVideoParam( + type="input_video", + **param, + ) + # image + if mime_type.startswith("image/"): + return ResponseInputImageParam( + type="input_image", + detail="auto", + **({"file_id": value} if is_file_id else {"image_url": value}), + ) + # file + param = {"file_id": value} if is_file_id else {"file_url": value} + return ResponseInputFileParam( + type="input_file", + **param, + ) + + +def _inline_data_to_content_param(part: types.Part) -> ResponseInputContentParam: + mime_type = ( + part.inline_data.mime_type if part.inline_data else None + ) or "application/octet-stream" + base64_string = base64.b64encode(part.inline_data.data).decode("utf-8") + data_uri = f"data:{mime_type};base64,{base64_string}" + + if mime_type.startswith("image"): + return ResponseInputImageParam( + type="input_image", + image_url=data_uri, + detail="auto", + ) + if mime_type.startswith("video"): + param: Dict[str, Any] = {"video_url": data_uri} + if getattr(part, "video_metadata", None): + video_metadata = part.video_metadata + if isinstance(video_metadata, dict): + fps = video_metadata.get("fps") + else: + fps 
= getattr(video_metadata, "fps", None) + if fps is not None: + param["fps"] = fps + return ResponseInputVideoParam( + type="input_video", + **param, + ) + + file_param: Dict[str, Any] = {"file_data": data_uri} + return ResponseInputFileParam( + type="input_file", + **file_param, + ) + + +def _get_content( + parts: List[types.Part], + role: Literal["user", "system", "developer", "assistant"], +) -> Optional[EasyInputMessageParam]: + content = [] + for part in parts: + if part.text: + content.append( + ResponseInputTextParam( + type="input_text", + text=part.text, + ) + ) + elif part.inline_data and part.inline_data.data: + content.append(_inline_data_to_content_param(part)) + elif part.file_data: # file_id和file_url + content.append(_file_data_to_content_param(part)) + if len(content) > 0: + return EasyInputMessageParam(type="message", role=role, content=content) + else: + return None + + +def _content_to_input_item( + content: types.Content, +) -> Union[ResponseInputItemParam, List[ResponseInputItemParam]]: + role = _to_ark_role(content.role) + + # 1. FunctionResponse:`Tool` messages cannot be mixed with other content + input_list = [] + for part in content.parts: + if part.function_response: # FunctionCallOutput + input_list.append( + FunctionCallOutput( + call_id=part.function_response.id, + output=_safe_json_serialize(part.function_response.response), + type="function_call_output", + ) + ) + if input_list: + return input_list if len(input_list) > 1 else input_list[0] + + input_content = _get_content(content.parts, role=role) or None + + if role == "user": + # 2. Process the user's message + if input_content: + return input_content + else: # model + # 3. 
Processing model messages + for part in content.parts: + if part.function_call: + input_list.append( + ResponseFunctionToolCallParam( + arguments=_safe_json_serialize(part.function_call.args), + call_id=part.function_call.id, + name=part.function_call.name, + type="function_call", + ) + ) + elif part.text or part.inline_data: + if input_content: + input_list.append(input_content) + return input_list + + +def _function_declarations_to_tool_param( + function_declaration: types.FunctionDeclaration, +) -> FunctionToolParam: + assert function_declaration.name + + parameters = {"type": "object", "properties": {}} + if function_declaration.parameters and function_declaration.parameters.properties: + properties = {} + for key, value in function_declaration.parameters.properties.items(): + properties[key] = _schema_to_dict(value) + + parameters = { + "type": "object", + "properties": properties, + } + elif function_declaration.parameters_json_schema: + parameters = function_declaration.parameters_json_schema + + tool_params = FunctionToolParam( + name=function_declaration.name, + parameters=parameters, + type="function", + description=function_declaration.description, + ) + + return tool_params + + +def _responses_schema_to_text( + response_schema: types.SchemaUnion, +) -> Optional[ResponseTextConfigParam | dict]: + schema_name = "" + if isinstance(response_schema, dict): + schema_type = response_schema.get("type") + if ( + isinstance(schema_type, str) + and schema_type.lower() in _ARK_TEXT_FIELD_TYPES + ): + return response_schema + schema_dict = dict(response_schema) + elif isinstance(response_schema, type) and issubclass(response_schema, BaseModel): + schema_name = response_schema.__name__ + schema_dict = response_schema.model_json_schema() + elif isinstance(response_schema, BaseModel): + if isinstance(response_schema, types.Schema): + # GenAI Schema instances already represent JSON schema definitions. 
+ schema_name = response_schema.__name__ + schema_dict = response_schema.model_dump(exclude_none=True, mode="json") + else: + schema_name = response_schema.__name__ + schema_dict = response_schema.__class__.model_json_schema() + elif hasattr(response_schema, "model_dump"): + schema_name = response_schema.__name__ + schema_dict = response_schema.model_dump(exclude_none=True, mode="json") + else: + logger.warning( + "Unsupported response_schema type %s for LiteLLM structured outputs.", + type(response_schema), + ) + return None + + return ResponseTextConfigParam( + format={ # noqa + "type": "json_schema", + "name": schema_name, + "schema": schema_dict, + "strict": True, + } + ) + + +def _get_responses_inputs( + llm_request: LlmRequest, +) -> Tuple[ + Optional[str], + Optional[List[ResponseInputItemParam]], + Optional[List[FunctionToolParam]], + Optional[ResponseTextConfigParam], + Optional[Dict], +]: + # 0. instructions(system prompt) + instructions: Optional[str] = None + if llm_request.config and llm_request.config.system_instruction: + instructions = llm_request.config.system_instruction + # 1. input + input_params: Optional[List[ResponseInputItemParam]] = [] + for content in llm_request.contents or []: + # Each content represents `one conversation`. + # This `one conversation` may contain `multiple pieces of content`, + # but it cannot contain `multiple conversations`. + input_item_or_list = _content_to_input_item(content) + if isinstance(input_item_or_list, list): + input_params.extend(input_item_or_list) + elif input_item_or_list: + input_params.append(input_item_or_list) + + # 2. Convert tool declarations + tools: Optional[List[FunctionToolParam]] = None + if ( + llm_request.config + and llm_request.config.tools + and llm_request.config.tools[0].function_declarations + ): + tools = [ + _function_declarations_to_tool_param(tool) + for tool in llm_request.config.tools[0].function_declarations + ] + + # 3. 
Handle `output-schema` -> `text` + text: Optional[ResponseTextConfigParam] = None + if llm_request.config and llm_request.config.response_schema: + text = _responses_schema_to_text(llm_request.config.response_schema) + + # 4. Extract generation parameters + generation_params: Optional[Dict] = None + if llm_request.config: + config_dict = llm_request.config.model_dump(exclude_none=True) + generation_params = {} + for key in ("temperature", "max_output_tokens", "top_p"): + if key in config_dict: + generation_params[key] = config_dict[key] + + if not generation_params: + generation_params = None + return instructions, input_params, tools, text, generation_params + + +def get_model_without_provider(request_data: dict) -> dict: + model = request_data.get("model") + + if not isinstance(model, str): + raise ValueError( + "Unsupported Responses API request: 'model' must be a string in the OpenAI-style format, e.g. 'openai/gpt-4o'." + ) + + if "/" not in model: + raise ValueError( + "Unsupported Responses API request: only OpenAI-style model names are supported (use 'openai/')." + ) + + provider, actual_model = model.split("/", 1) + if provider != "openai": + raise ValueError( + f"Unsupported model prefix '{provider}'. Responses API request format only supports 'openai/'." 
+ ) + + request_data["model"] = actual_model + + return request_data + + +def filtered_inputs( + inputs: List[ResponseInputItemParam], +) -> List[ResponseInputItemParam]: + # Keep the first message and all consecutive user messages from the end + # Collect all consecutive user messages from the end + new_inputs = [] + for m in reversed(inputs): # Skip the first message + if m.get("type") == "function_call_output" or m.get("role") == "user": + new_inputs.append(m) + else: + break # Stop when we encounter a non-user message + + return new_inputs[::-1] + + +def _is_caching_enabled(request_data: dict) -> bool: + extra_body = request_data.get("extra_body") + if not isinstance(extra_body, dict): + return False + caching = extra_body.get("caching") + if not isinstance(caching, dict): + return False + return caching.get("type") == "enabled" + + +def _remove_caching(request_data: dict) -> None: + extra_body = request_data.get("extra_body") + if isinstance(extra_body, dict): + extra_body.pop("caching", None) + request_data.pop("caching", None) + + +def request_reorganization_by_ark(request_data: Dict) -> Dict: + # 1. model provider + request_data = get_model_without_provider(request_data) + + # 2. filtered input + request_data["input"] = filtered_inputs(request_data["input"]) + + # 3. filter not support data + request_data = { + key: value for key, value in request_data.items() if key in ark_supported_fields + } + + extra_body = request_data.get("extra_body") + if not isinstance(extra_body, dict): + extra_body = {} + request_data["extra_body"] = extra_body + extra_body["expire_at"] = int(time.time()) + 259200 + + # [Note: Ark Limitations] caching and text + # After enabling caching, output_schema(text) cannot be used. Caching must be disabled. + if _is_caching_enabled(request_data) and request_data.get("text") is not None: + logger.warning( + "Caching is enabled, but text is provided. Ark does not support caching with text. Caching will be disabled." 
+ ) + _remove_caching(request_data) + + # [Note: Ark Limitations] tools and previous_response_id + # Remove tools in subsequent rounds (when previous_response_id is present) + if ( + "tools" in request_data + and "previous_response_id" in request_data + and request_data["previous_response_id"] is not None + ): + # Remove tools in subsequent rounds regardless of caching status + del request_data["tools"] + + # [Note: Ark Limitations] caching and store + # Ensure store field is true or default when caching is enabled + if _is_caching_enabled(request_data): + # Set store to true when caching is enabled for writing + if "store" not in request_data: + request_data["store"] = True + elif request_data["store"] is False: + # Override false to true for cache writing + request_data["store"] = True + + # [NOTE Ark Limitations] instructions -> input (because of caching) + # Due to the Volcano Ark settings, there is a conflict between the cache and the instructions field. + # If a system prompt is needed, it should be placed in the system role message within the input, instead of using the instructions parameter. 
+ # https://www.volcengine.com/docs/82379/1585128 + instructions: Optional[str] = request_data.pop("instructions", None) + if instructions and not request_data.get("previous_response_id"): + request_data["input"].insert( + 0, + EasyInputMessageParam( + role="system", + type="message", + content=[ + ResponseInputTextParam( + type="input_text", + text=instructions, + ) + ], + ), + ) + + return request_data + + +# --------------------------------------- +# output transfer ----------------------- +def event_to_generate_content_response( + event: Union[ArkTypeResponse, ResponseStreamEvent], + *, + is_partial: bool = False, + model_version: str = None, +) -> Optional[LlmResponse]: + parts = [] + if not is_partial: + for output in event.output: + if isinstance(output, ResponseReasoningItem): + parts.append( + types.Part( + text="\n".join([summary.text for summary in output.summary]), + thought=True, + ) + ) + elif isinstance(output, ResponseOutputMessage): + text = "" + if isinstance(output.content, list): + for item in output.content: + if isinstance(item, ResponseOutputText): + text += item.text + parts.append(types.Part(text=text)) + + elif isinstance(output, ResponseFunctionToolCall): + part = types.Part.from_function_call( + name=output.name, args=json.loads(output.arguments or "{}") + ) + part.function_call.id = output.call_id + parts.append(part) + + else: + if isinstance(event, ResponseReasoningSummaryTextDeltaEvent): + parts.append(types.Part(text=event.delta, thought=True)) + elif isinstance(event, ResponseTextDeltaEvent): + parts.append(types.Part.from_text(text=event.delta)) + elif isinstance(event, ResponseCompletedEvent): + raw_response = event.response + llm_response = ark_response_to_generate_content_response(raw_response) + return llm_response + else: + return None + return LlmResponse( + content=types.Content(role="model", parts=parts), + partial=is_partial, + model_version=model_version, + ) + + +def ark_response_to_generate_content_response( + 
raw_response: ArkTypeResponse, +) -> LlmResponse: + """ + ArkTypeResponse -> LlmResponse + instead of `_model_response_to_generate_content_response`, + """ + outputs = raw_response.output + status = raw_response.status + incomplete_details = getattr( + raw_response.incomplete_details or None, "reason", "other" + ) + + finish_reason = _FINISH_REASON_MAPPING.get(status, {}).get( + incomplete_details, types.FinishReason.OTHER + ) + + if not outputs: + raise ValueError("No message in response") + + llm_response = event_to_generate_content_response( + raw_response, model_version=raw_response.model, is_partial=False + ) + llm_response.finish_reason = finish_reason + if raw_response.usage: + llm_response.usage_metadata = types.GenerateContentResponseUsageMetadata( + prompt_token_count=raw_response.usage.input_tokens, + candidates_token_count=raw_response.usage.output_tokens, + total_token_count=raw_response.usage.total_tokens, + cached_content_token_count=raw_response.usage.input_tokens_details.cached_tokens, + ) + + # previous_response_id + llm_response.interaction_id = raw_response.id + + return llm_response + + +class ArkLlmClient: + async def aresponse( + self, **kwargs + ) -> Union[ArkTypeResponse, AsyncStream[ResponseStreamEvent]]: + # 1. Get request params + api_base = kwargs.pop("api_base", DEFAULT_VIDEO_MODEL_API_BASE) + api_key = kwargs.pop("api_key", settings.model.api_key) + + # 2. 
Call openai responses + client = AsyncArk( + base_url=api_base, + api_key=api_key, + ) + + raw_response = await client.responses.create(**kwargs) + return raw_response + + +class ArkLlm(Gemini): + model: str + llm_client: ArkLlmClient = Field(default_factory=ArkLlmClient) + _additional_args: Dict[str, Any] = None + use_interactions_api: bool = True + + def __init__(self, **kwargs): + # adk version check + if "previous_interaction_id" not in LlmRequest.model_fields: + raise ImportError( + "If using the ResponsesAPI, " + "please upgrade the version of google-adk to `1.21.0` or higher with the command: " + "`pip install -U 'google-adk>=1.21.0'`" + ) + super().__init__(**kwargs) + drop_params = kwargs.pop("drop_params", None) + self._additional_args = dict(kwargs) + self._additional_args.pop("llm_client", None) + self._additional_args.pop("messages", None) + self._additional_args.pop("tools", None) + self._additional_args.pop("stream", None) + if drop_params is not None: + self._additional_args["drop_params"] = drop_params + + async def generate_content_async( + self, llm_request: LlmRequest, stream: bool = False + ) -> AsyncGenerator[LlmResponse, None]: + """Generates content asynchronously. + + Args: + llm_request: LlmRequest, the request to send to the LiteLlm model. + stream: bool = False, whether to do streaming call. + + Yields: + LlmResponse: The model response. + """ + self._maybe_append_user_content(llm_request) + # logger.debug(_build_request_log(llm_request)) + + instructions, input_param, tools, text_format, generation_params = ( + _get_responses_inputs(llm_request) + ) + + if "functions" in self._additional_args: + # LiteLLM does not support both tools and functions together. 
+ tools = None + # ------------------------------------------------------ # + # get previous_response_id + previous_response_id = None + if llm_request.previous_interaction_id: + previous_response_id = llm_request.previous_interaction_id + responses_args = { + "model": self.model, + "instructions": instructions, + "input": input_param, + "tools": tools, + "text": text_format, + "previous_response_id": previous_response_id, # supply previous_response_id + } + # ------------------------------------------------------ # + responses_args.update(self._additional_args) + + if generation_params: + responses_args.update(generation_params) + + responses_args = request_reorganization_by_ark(responses_args) + + if stream: + responses_args["stream"] = True + async for part in await self.llm_client.aresponse(**responses_args): + llm_response = event_to_generate_content_response( + event=part, is_partial=True, model_version=self.model + ) + if llm_response: + yield llm_response + else: + raw_response = await self.llm_client.aresponse(**responses_args) + llm_response = ark_response_to_generate_content_response(raw_response) + yield llm_response + + @classmethod + @override + def supported_models(cls) -> list[str]: + return [ + # For OpenAI models (e.g., "openai/gpt-4o") + r"openai/.*", + ] diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/release/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/release/__init__.py new file mode 100644 index 0000000..67771d2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/release/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/release/agent.py b/python/02-use-cases/12_ad_video_gen_seq/app/release/agent.py new file mode 100644 index 0000000..f074330 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/release/agent.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from veadk import Agent + +from app.release.hook import hook_tool_execute +from app.release.prompt import PROMPT_RELEASE_AGENT +from app.release.tools.upload import upload_file_to_tos +from app.release.tools.video_combine import video_combine +from app.model import ArkLlm + + +def get_release_agent() -> Agent: + agent = Agent( + name="release_agent", + description="将分镜视频合成最终的视频", + instruction=PROMPT_RELEASE_AGENT, + tools=[video_combine, upload_file_to_tos], + after_tool_callback=[hook_tool_execute], + model_extra_config={ + "extra_body": { + "thinking": {"type": os.getenv("THINKING_RELEASE_AGENT", "disabled")}, + "caching": { + "type": "disabled", + }, + } + }, + ) + + agent.model = ArkLlm( + model=f"{agent.model_provider}/{agent.model_name}", + api_key=agent.model_api_key, + api_base=agent.model_api_base, + **agent.model_extra_config, + ) + return agent diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/release/hook.py b/python/02-use-cases/12_ad_video_gen_seq/app/release/hook.py new file mode 100644 index 0000000..a427af9 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/release/hook.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import shutil +from typing import Optional, Any + +from google.adk.tools import BaseTool, ToolContext +from veadk.utils.logger import get_logger + +logger = get_logger(__name__) + + +def hook_tool_execute( + tool: BaseTool, args: dict[str, Any], tool_context: ToolContext, tool_response: Any +) -> Optional[Any]: + if tool.name == "video_combine": + output_path = tool_response + if output_path: + tool_context.state["release_agent_local_dir"] = os.path.dirname(output_path) + + elif tool.name == "upload_file_to_tos": + local_dir = tool_context.state.get("release_agent_local_dir", None) + if local_dir and os.path.exists(local_dir): + shutil.rmtree(local_dir) + tool_context.state["cb_agent_state"] = "\n✅任务完成。\n" + tool_context.state["cb_agent_output"] = "" diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/release/prompt.py b/python/02-use-cases/12_ad_video_gen_seq/app/release/prompt.py new file mode 100644 index 0000000..1589d0b --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/release/prompt.py @@ -0,0 +1,42 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PROMPT_RELEASE_AGENT = """ +# 角色: +你是一位食品饮料行业的电商营销视频合成Agent,将分镜视频合成最终的视频。 +## 背景介绍 +在你执行之前已经至少完成了这两个关键步骤 +1. 生成四个分镜,每个分镜备选多个视频 +2. 对每个分镜的视频进行评估,评估结果可见`video_evaluate_agent`的输出 + +# 任务说明 +你的任务非常简单,你需要将分镜视频合成最终的视频并展示URL。 + +## 任务分步解释 +1. 
分析:你需要根据`video_agent`和`video_evaluate_agent`的输出,来确定使用什么视频,进而来生成最终的视频。 +2. 调用视频合成工具`video_combine`,合成视频,你将获得一个本地路径。 +3. 调用上传工具`upload_file_to_tos`,上传视频到云对象存储,你将获得一个视频的URL。 + +注意:出于安全考虑,中间产物的本地路径请你不要输出,你可以表示你已经在本地完成处理,但不要告知路径。 + +# 输出说明 +你只需要输出markdown格式的视频url即可 + +样例如下: + +## 视频合成 + + + +""" diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/__init__.py new file mode 100644 index 0000000..67771d2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/upload.py b/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/upload.py new file mode 100644 index 0000000..8f13030 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/upload.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TOS file upload utility +Provides functionality to upload files to Volcano Engine TOS object storage and returns a signed access URL +Implemented using the tos library directly +""" + +from typing import Optional + +from veadk.utils.logger import get_logger + +logger = get_logger(__name__) + + +def upload_file_to_tos( + file_path: str, + object_key: Optional[str] = None, + region: str = "cn-beijing", + expires: int = 604800, # 7-day validity +) -> Optional[str]: + """ + Upload a file to TOS object storage and return a signed accessible URL + + The target bucket is taken from the DATABASE_TOS_BUCKET environment variable. + + Args: + file_path: Local file path + object_key: Object storage key name; if empty, uses the filename + region: TOS region, defaults to cn-beijing + expires: Signed URL validity period (seconds), defaults to 7 days (604800 seconds) + + Returns: + str: Signed TOS URL that can be accessed directly + None: Returns None if upload fails + + Environment variables required: + VOLCENGINE_ACCESS_KEY: Volcano Engine access key + VOLCENGINE_SECRET_KEY: Volcano Engine secret key + + Usage example: + >>> url = upload_file_to_tos("./video.mp4") + >>> print(url) + https://bucket.tos-cn-beijing.volces.com/video.mp4?X-Tos-Signature=... 
+ """ + from app.utils import upload_file_to_tos as upload_file + + return upload_file(file_path, object_key, region, expires) diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/video_combine.py b/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/video_combine.py new file mode 100644 index 0000000..864ba01 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/release/tools/video_combine.py @@ -0,0 +1,216 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import urllib.parse +import os +import random +import tempfile +import uuid +from typing import List +from typing import Optional + +import aiohttp +from moviepy import CompositeVideoClip, VideoFileClip +from veadk.config import veadk_environments # noqa +from veadk.utils.logger import get_logger + +from app.utils import url_shortener + +logger = get_logger(__name__) + + +def resolve_short_url(code: str) -> str: + return url_shortener.code2url(code) + + +async def video_combine(video_codes: List[str]) -> Optional[str]: + """ + 合并多个视频URL为一个视频文件 + + Args: + video_codes: 视频code列表(⌥code格式) + + Returns: + 合并后的视频文件路径,如果合并失败则返回None + """ + + # 获取项目根目录 + current_dir = os.path.abspath(__file__) + project_root = os.path.dirname(current_dir) + for _ in range(3): # 向上三级目录到达项目根目录 + project_root = os.path.dirname(project_root) + + # 创建输出目录在项目根目录下 + output_dir = os.path.join(project_root, "merged_videos") + os.makedirs(output_dir, exist_ok=True) + temp_dir = tempfile.mkdtemp(dir=output_dir) + logger.info(f"Created temporary directory: {temp_dir}") + + # 解析短链接 + resolved_urls = [] + for code in video_codes: + resolved_url = resolve_short_url(code) + # 仅允许 http/https 协议,降低 SSRF 风险 + parsed = urllib.parse.urlparse(resolved_url) + if parsed.scheme not in {"http", "https"}: + logger.warning(f"Skip non-http(s) URL: {resolved_url}") + continue + resolved_urls.append(resolved_url) + + # 下载视频文件 + downloaded_files = [] + + async with aiohttp.ClientSession() as session: + for idx, code in enumerate(resolved_urls): + try: + # 下载视频 + logger.info( + f"Downloading video {idx + 1}/{len(resolved_urls)} from {code}" + ) + + async with session.get(code, allow_redirects=True) as response: + response.raise_for_status() + # 预检查内容大小,防止极端大文件下载 + content_length = response.headers.get("content-length") + max_file_size = 512 * 1024 * 1024 # 512MB 上限 + if content_length is not None: + try: + if int(content_length) > max_file_size: + logger.error( + f"Video size {int(content_length)} exceeds limit 
{max_file_size}." + ) + return None + except Exception: + # 如果 content-length 无法解析,继续按流式大小校验 + pass + + # 从content-type提取文件扩展名 + content_type = response.headers.get("content-type", "") + file_extension = ".mp4" # 默认扩展名 + if "video" in content_type: + if "mp4" in content_type: + file_extension = ".mp4" + elif "webm" in content_type: + file_extension = ".webm" + elif "ogg" in content_type: + file_extension = ".ogg" + elif "mov" in content_type: + file_extension = ".mov" + + # 生成简单的随机文件名 + temp_file_path = os.path.join( + temp_dir, + f"video_{random.randint(100000, 999999)}{file_extension}", + ) + + # 按流式传输进行大小限制(兜底) + max_file_size = 512 * 1024 * 1024 # 512MB + total_size = 0 + + with open(temp_file_path, "wb") as f: + async for chunk in response.content.iter_chunked(8192): + if chunk: + total_size += len(chunk) + if total_size > max_file_size: + logger.error( + "Video size exceeds 512MB. Download stopped." + ) + return None + f.write(chunk) + + if ( + os.path.exists(temp_file_path) + and os.path.getsize(temp_file_path) > 0 + ): + downloaded_files.append(temp_file_path) + logger.info( + f"Successfully downloaded video {idx + 1} to {temp_file_path}, size: {total_size / 1024 / 1024:.2f} MB" + ) + else: + logger.error( + f"Failed to download video {idx + 1}: file is empty or doesn't exist" + ) + return None + + except Exception as e: + logger.error(f"Error downloading video {idx + 1} from {code}: {e}") + return None + + if not downloaded_files: + logger.error("No videos were successfully downloaded") + return None + + try: + # 合并视频 + logger.info(f"Starting to merge {len(downloaded_files)} videos") + + # 加载所有视频片段 + video_clips = [] + start_times = [] + clip_start_time = 0.0 + + try: + for file_path in downloaded_files: + start_times.append(clip_start_time) + + clip = VideoFileClip(file_path) + video_clips.append(clip) + + clip_start_time += clip.duration + + clips = [] + for video_clip, start_time in zip(video_clips, start_times): + positioned_clip = 
video_clip.with_start(start_time).with_position( + "center" + ) + clips.append(positioned_clip) + final_clip = CompositeVideoClip(clips) + + output_file_name = f"merged_video_{uuid.uuid4()}.mp4" + output_file_path = os.path.join(temp_dir, output_file_name) + + logger.info(f"Saving merged video to {output_file_path}") + final_clip.write_videofile( + output_file_path, codec="libx264", audio_codec="aac", threads=4 + ) + finally: + for clip in video_clips: + try: + if hasattr(clip, "reader") and clip.reader: + clip.reader.close() + if hasattr(clip, "audio_reader") and clip.audio_reader: + clip.audio_reader.close_proc() + clip.audio_reader.close() + clip.close() + except Exception as e: + logger.error(f"Error closing video clip: {e}") + if "final_clip" in locals(): + try: + if hasattr(final_clip, "close"): + final_clip.close() + except Exception as e: + logger.error(f"Error closing final clip: {e}") + + if os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0: + logger.info(f"Successfully merged video to local path: {output_file_path}") + return output_file_path + else: + logger.error( + f"Merged video file is empty or doesn't exist: {output_file_path}" + ) + return None + + except Exception as e: + logger.error(f"Error merging videos: {e}") + return None diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/root/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/root/__init__.py new file mode 100644 index 0000000..1577a76 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/root/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .agent import get_root_agent + +__all__ = ["get_root_agent"] diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/root/agent.py b/python/02-use-cases/12_ad_video_gen_seq/app/root/agent.py new file mode 100644 index 0000000..3514057 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/root/agent.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import override, AsyncGenerator + +from google.adk.agents import InvocationContext, BaseAgent +from google.adk.agents.run_config import StreamingMode +from google.adk.events import Event +from google.genai import types +from veadk.agents.sequential_agent import SequentialAgent + +from app.eval import get_eval_agent +from app.image.agent import get_image_agent +from app.market import get_market_agent +from app.release.agent import get_release_agent +from app.storyboard import get_storyboard_agent +from app.video.agent import get_video_agent + + +class CallBackAgent(BaseAgent): + async def _yield_event( + self, ctx: InvocationContext, text: str + ) -> AsyncGenerator[Event, None]: + if ctx.run_config.streaming_mode != StreamingMode.NONE: + stream_event = Event( + invocation_id=ctx.invocation_id, + author="callback_agent", + content=types.Content( + parts=[ + types.Part( + text=text, + ) + ], + role="model", + ), + partial=True, + ) + yield stream_event + + event = Event( + invocation_id=ctx.invocation_id, + author="callback_agent", + content=types.Content( + parts=[ + types.Part( + text=text, + ) + ], + role="model", + ), + ) + yield event + + @override + async def _run_async_impl( + self, ctx: InvocationContext + ) -> AsyncGenerator[Event, None]: + cb_agent_output = ctx.session.state.get("cb_agent_output", "") + cb_agent_state = ctx.session.state.get("cb_agent_state", "") + + if isinstance(cb_agent_output, str): + if cb_agent_output: + async for event in self._yield_event(ctx, cb_agent_output): + yield event + elif isinstance(cb_agent_output, list): + for output_item in cb_agent_output: + if output_item: + async for event in self._yield_event(ctx, output_item): + yield event + + if cb_agent_state: + async for event in self._yield_event(ctx, cb_agent_state): + yield event + + +class MMSequentialAgent(SequentialAgent): + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.sub_agents: + new_sub_agents = [] + for i, sub_agent in 
enumerate(self.sub_agents): + # 间隔插入一个callback_agent + new_sub_agents.append(sub_agent) + new_sub_agents.append(CallBackAgent(name=f"callback_agent_{i}")) + + self.sub_agents = new_sub_agents + pass + + @override + async def _run_async_impl( + self, ctx: InvocationContext + ) -> AsyncGenerator[Event, None]: + async for event in super()._run_async_impl(ctx): + if isinstance(event, Event): + yield event + if ctx.session.state.get("end_invocation", False): + break + + +def get_root_agent() -> MMSequentialAgent: + import os + + os.environ["MODEL_AGENT_CACHING"] = "disabled" + root_agent = MMSequentialAgent( + name="root_agent", + description="根据用户的需求,生成电商视频", + sub_agents=[ + get_market_agent(), + get_storyboard_agent(), + get_image_agent(), + get_eval_agent(eval_type="image"), + get_video_agent(), + get_eval_agent(eval_type="video"), + get_release_agent(), + ], + ) + return root_agent diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/__init__.py new file mode 100644 index 0000000..60baa2a --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .agent import get_storyboard_agent + +__all__ = ["get_storyboard_agent"] diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/agent.py b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/agent.py new file mode 100644 index 0000000..200cbed --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/agent.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from veadk import Agent +from veadk.config import getenv + +from app.storyboard.hook import hook_for_callback +from app.storyboard.schema import ( + max_output_tokens_config, +) + +from app.storyboard.prompt import PROMPT_STORYBOARD_AGENT +from app.model import ArkLlm + + +def get_storyboard_agent(): + storyboard_agent = Agent( + name="storyboard_agent", + # model_name="doubao-seed-1-6-251015", + enable_responses=True, + description="根据视频配置脚本,生成分镜脚本", + instruction=PROMPT_STORYBOARD_AGENT, + generate_content_config=max_output_tokens_config, + before_model_callback=[hook_for_callback], + model_extra_config={ + "extra_body": { + "thinking": {"type": getenv("THINKING_STORYBOARD_AGENT", "disabled")}, + "caching": { + "type": "disabled", + }, + } + }, + ) + + storyboard_agent.model = ArkLlm( + model=f"{storyboard_agent.model_provider}/{storyboard_agent.model_name}", + api_key=storyboard_agent.model_api_key, + api_base=storyboard_agent.model_api_base, + **storyboard_agent.model_extra_config, + ) + return storyboard_agent diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/hook.py b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/hook.py new file mode 100644 index 0000000..d8fe6b1 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/hook.py @@ -0,0 +1,26 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +from google.adk.agents.callback_context import CallbackContext +from google.adk.models import LlmRequest, LlmResponse + + +def hook_for_callback( + callback_context: CallbackContext, llm_request: LlmRequest +) -> Optional[LlmResponse]: + callback_context.state["cb_agent_state"] = ( + "\n✅分镜脚本生成工作完成,继续执行首帧图生成任务。\n" + ) diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/prompt.py b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/prompt.py new file mode 100644 index 0000000..632346b --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/prompt.py @@ -0,0 +1,119 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +PROMPT_STORYBOARD_AGENT = """ +# 角色定位 +你是一位食品饮料行业的电商营销分镜师,生成富有创意的电商营销视频分镜脚本,语言为中文 + +## 背景信息 +你是电商营销生成视频整个流程的第二部分,你已经收获了策划专家提供的视频策划, +你需要根据这个策划来生成视频分镜脚本,并使用 markdown 语言来输出你的脚本。 +「reference」字段只能是一张图片,且只能是用户提出的那张图片,不能是其他图片。这个后需要用到的。 + +# 任务和要求 +1. 根据 视频脚本配置 中的素材,充分理解产品核心卖点、使用场景等关键信息 +2. 根据`AIDA营销模型`,结构化设计4个分镜 +分镜1 - 注意(Attention) +画面:(图生图)吸睛开头;通过运镜特效展示高颜值商品场景图,形成强视觉冲击 + +分镜2 - 兴趣(Interest) +画面:(图生图)场景化演示;构思高频强相关场景或人群(例如健身房里流汗后、减脂期间嘴馋时),提供解决其需求或激发兴趣的产品 + +分镜3 - 欲望(Desire) +画面:(图生图)细节特写;特写展示产品 原料、成分、口味等卖点(例如 天然果肉的饱满、冰爽气泡的翻腾等),刺激消费者的购买欲 + +分镜4 - 行动(Action) +画面:(图生图)以产品包装运镜特效作为结尾,引导用户下单行动 + +3. 
输出分镜脚本,每个分镜是5-10s的视频,你需要设计画面内容与运镜,最后得到一个充满创意的电商视频,重点是突出商品的卖点 +(1)镜号:分镜1-4 +(2)image:画面设计,描述主体、背景环境、氛围、光线等画面设计;镜头要有景别变化:全景、中景、近景、特写都要有,增加画面节奏感。 + - 分镜1:主体为用户上传的图片素材,替换背景为合适创意场景 + - 分镜2:根据商品信息,构思相关场景或人群的展示画面。 + - 分镜3:进行原料/产地细节特写,生成创意且带有视觉冲击的画面,例如果汁原料的碰撞等 + - 分镜4:主体为用户上传的图片素材,替换背景为合适创意场景 +(3)action:为每个分镜image设计运镜与动作描述 +(4)reference:只要内容中出现了对该产品的描述,就必须加上reference,除非是描述跟本产品无关的场景,例如:天气、时间、竞品等。 +# 输出规范 +请输出 markdown 文本,参考模板如下(被「」括号括起来的内容是你需要填写的部分): + +## 输出字段说明 +- shot_id:分镜的唯一标识,比如 "shot_1"、"shot_2" +- image:画面描述,用于生成静态图像,要求具体、可视化 +- action:视频运动/内容描述,比如镜头运动、人物动作、节奏等 +- reference:参考图片链接 + +## 输出模板 +```markdown +## 分镜脚本生成 + +### 分镜1 +- **shot_id**: 「shot_id」 +- **image**: 「image」 +- **action**: 「action」 +- **reference**: 「reference」 + +### 分镜2 +- **shot_id**: 「shot_id」 +- **image**: 「image」 +- **action**: 「action」 +- **reference**: 「reference」 + +### 分镜3 +- **shot_id**: 「shot_id」 +- **image**: 「image」 +- **action**: 「action」 +- **reference**: 「reference」 + +### 分镜4 +- **shot_id**: 「shot_id」 +- **image**: 「image」 +- **action**: 「action」 +- **reference**: 「reference」 +``` + +# 参考示例 + +视频标题:过完年有数字管理需求的姐妹们,wonderlab专属破价机制就等你来! #减脂救星 #公主请喝 + +### 分镜1 +- **shot_id**: shot_1 +- **image**: 西梅饮料瓶身;导出紫色的果汁,周围是一些西梅,紫色背景 +- **action**: 缓慢的旋转推镜头,有辉光效果,紫色的水流环绕瓶身 +- **reference**: image url + +### 分镜2 +- **shot_id**: shot_2 +- **image**: 一个在办公室身材纤细的女性;紫色背景 +- **action**: 女孩转过身微笑,镜头推进 +- **reference**: image url + +### 分镜3 +- **shot_id**: shot_3 +- **image**: 饱满的紫色西梅在水中有许多泡泡包裹 +- **action**: 掉入水中;汁水飞溅;围绕主体运镜 +- **reference**: image url + +### 分镜4 +- **shot_id**: shot_4 +- **image**: 瓶身在水面中;周围是一些西梅 +- **action**: 推镜头,水花炸裂,西梅向两边飞溅 +- **reference**: image url + +# 注意事项 +1. 生成内容不要使用单引号、双引号等字符。语言默认使用中文,不要用英文。 +2. 输入输出以及运行过程中,任何涉及图片或视频的链接 url,不要做任何修改。 +3. 
如果用户的输入不符合要求,或执行过程出现意外,请及时返回错误提示,而不是蛮干 +""" diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/schema.py b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/schema.py new file mode 100644 index 0000000..39bdcb3 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/storyboard/schema.py @@ -0,0 +1,21 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from google.genai import types + +json_response_config = types.GenerateContentConfig( + response_mime_type="application/json", max_output_tokens=18000 +) + +max_output_tokens_config = types.GenerateContentConfig(max_output_tokens=18000) diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/utils.py b/python/02-use-cases/12_ad_video_gen_seq/app/utils.py new file mode 100644 index 0000000..2ffdf22 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/utils.py @@ -0,0 +1,255 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from datetime import datetime +from typing import Optional +import threading +import re + +import tos +from google.adk.agents.callback_context import CallbackContext +from google.adk.models import LlmResponse +from tos import HttpMethodType +from veadk.auth.veauth.utils import get_credential_from_vefaas_iam +from veadk.utils.logger import get_logger + +logger = get_logger(__name__) + + +# --- Define the Callback Function --- +def callback_for_debug(callback_context: CallbackContext) -> Optional[LlmResponse]: + pass + + +# --- URL Shortener Singleton --- +class UrlShortener: + _instance = None + _lock = threading.Lock() + + # 进制转换字符集 + CHAR_SET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + BASE = len(CHAR_SET) + + PREFIX = "⌥" + + def __new__(cls): + if not cls._instance: + with cls._lock: + if not cls._instance: + cls._instance = super(UrlShortener, cls).__new__(cls) + cls._instance._initialize() + return cls._instance + + def _initialize(self): + self._id_lock = threading.Lock() + self._current_id = 0 + self._short_to_long = {} # key: short_url, value: original_url + self._long_to_short = {} # key: original_url, value: short_url + + def _encode(self, num: int) -> str: + if num == 0: + return self.CHAR_SET[0] + + encoded = [] + while num > 0: + num, remainder = divmod(num, self.BASE) + encoded.append(self.CHAR_SET[remainder]) + result = "".join(reversed(encoded)) + + # Pad to 5 characters + return result.rjust(5, "0") + + def url2code(self, original_url: str) -> str: + """ + 输入一个url字符串,换出来一个短ID + """ + try: + # 1. 
Check if already exists (Deduplication) + with self._id_lock: + if original_url in self._long_to_short: + return self._long_to_short[original_url] + + # 2. Increment ID and Encode + self._current_id += 1 + current_id = self._current_id + short_code = self._encode(current_id) + + # 3. Construct short ID + short_id = f"{self.PREFIX}{short_code}" + + # 4. Store mappings + with self._id_lock: + # Double check in case another thread inserted it + self._short_to_long[short_id] = original_url + self._long_to_short[original_url] = short_id + + return short_id + except Exception: + return original_url + + def code2url(self, short_id: str) -> Optional[str]: + """ + 输入这个短ID,换出原始的url + """ + return self._short_to_long.get(short_id, short_id) + + def replace_in_text(self, text: str) -> str: + """ + 给你一个长字符串,提取短ID并无缝替换回原始URL + """ + # Pattern matches ⌥ where code is 5 characters + pattern = r"⌥([0-9a-zA-Z]{5})" + + def replace_match(match): + full_short_id = match.group(0) + original_url = self.code2url(full_short_id) + return original_url if original_url != full_short_id else full_short_id + + return re.sub(pattern, replace_match, text) + + def extract_ids_to_urls(self, text: str) -> list[str]: + """ + 从字符串中提取所有短ID并转换为URL列表 + """ + # Pattern matches ⌥ where code is 5 characters + pattern = r"⌥([0-9a-zA-Z]{5})" + matches = re.findall(pattern, text) + + urls = [] + for code in matches: + short_id = f"⌥{code}" + original_url = self.code2url(short_id) + if original_url != short_id: + urls.append(original_url) + + return urls + + +# Global instance +url_shortener = UrlShortener() + + +def upload_file_to_tos( + file_path: str, + object_key: Optional[str] = None, + region: str = "cn-beijing", + expires: int = 604800, # 7-day validity +) -> Optional[str]: + bucket_name = os.getenv("DATABASE_TOS_BUCKET") + + # Check if file exists + if not os.path.exists(file_path): + logger.info(f"Error: File does not exist: {file_path}") + return None + + if not os.path.isfile(file_path): + 
logger.info(f"Error: Path is not a file: {file_path}") + return None + + # Retrieve STS from IAM Role + access_key = os.getenv("VOLCENGINE_ACCESS_KEY") + secret_key = os.getenv("VOLCENGINE_SECRET_KEY") + session_token = "" + + if not (access_key and secret_key): + # try to get from vefaas iam + cred = get_credential_from_vefaas_iam() + access_key = cred.access_key_id + secret_key = cred.secret_access_key + session_token = cred.session_token + + if not access_key or not secret_key: + logger.info( + "Error: VOLCENGINE_ACCESS_KEY and VOLCENGINE_SECRET_KEY are not provided or IAM Role is not configured." + ) + return None + + # Auto-generate object_key (using filename) + if not object_key: + # Combine timestamp and original filename to avoid overwriting + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = os.path.basename(file_path) + name, ext = os.path.splitext(filename) + object_key = f"upload/{name}_{timestamp}{ext}" + + # Create TOS client + client = None + try: + # Initialize TOS client + endpoint = f"tos-{region}.volces.com" + client = tos.TosClientV2( + ak=access_key, + sk=secret_key, + security_token=session_token, + endpoint=endpoint, + region=region, + ) + + logger.info(f"Starting file upload: {file_path}") + logger.info(f"Target Bucket: {bucket_name}") + logger.info(f"Object Key: {object_key}") + + # Ensure bucket exists (create if not) + try: + client.head_bucket(bucket_name) + logger.info(f"Bucket {bucket_name} already exists") + except tos.exceptions.TosServerError as e: + if e.status_code == 404: + logger.info(f"Bucket {bucket_name} does not exist, creating...") + else: + raise e + + # Upload file + result = client.put_object_from_file( + bucket=bucket_name, key=object_key, file_path=file_path + ) + + logger.info("File uploaded successfully!") + logger.info(f"ETag: {result.etag}") + logger.info(f"Request ID: {result.request_id}") + + # Generate signed URL + signed_url_output = client.pre_signed_url( + 
http_method=HttpMethodType.Http_Method_Get, + bucket=bucket_name, + key=object_key, + expires=expires, + ) + + signed_url = signed_url_output.signed_url + logger.info(f"Signed URL generated successfully (valid for {expires} seconds)") + logger.info(f"Access URL: {signed_url}") + + return signed_url + + except tos.exceptions.TosClientError as e: + logger.info(f"TOS client error: {e}") + return None + except tos.exceptions.TosServerError as e: + logger.info(f"TOS server error: {e}") + logger.info(f"Status code: {e.status_code}") + logger.info(f"Error code: {e.code}") + logger.info(f"Error message: {e.message}") + return None + except Exception as e: + logger.info(f"File upload failed: {e}") + import traceback + + traceback.print_exc() + return None + finally: + # Close client + if client: + client.close() diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/video/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/video/__init__.py new file mode 100644 index 0000000..67771d2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/video/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/video/agent.py b/python/02-use-cases/12_ad_video_gen_seq/app/video/agent.py new file mode 100644 index 0000000..8d53c43 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/video/agent.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from google.genai import types +from veadk import Agent + +# from veadk.tools.builtin_tools.video_generate import video_generate +from app.video.tools.video_generate_by_code import video_generate +from app.video.hook import hook_short_image_url_to_long, hook_url_id_mapping +from app.video.prompt import PROMPT_VIDEO_AGENT +from app.model import ArkLlm + +max_output_tokens_config = types.GenerateContentConfig(max_output_tokens=18000) + + +def get_video_agent(): + video_agent = Agent( + name="video_agent", + enable_responses=True, + description="根据分镜脚本,生成分镜视频", + instruction=PROMPT_VIDEO_AGENT, + tools=[video_generate], + before_tool_callback=[hook_short_image_url_to_long], + after_tool_callback=[hook_url_id_mapping], + generate_content_config=max_output_tokens_config, + model_extra_config={ + "extra_body": { + "thinking": {"type": os.getenv("THINKING_VIDEO_AGENT", "disabled")}, + "caching": { + "type": "disabled", + }, + }, + }, + ) + video_agent.model = ArkLlm( + model=f"{video_agent.model_provider}/{video_agent.model_name}", + 
api_key=video_agent.model_api_key, + api_base=video_agent.model_api_base, + **video_agent.model_extra_config, + ) + return video_agent diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/video/hook.py b/python/02-use-cases/12_ad_video_gen_seq/app/video/hook.py new file mode 100644 index 0000000..a2a8baa --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/video/hook.py @@ -0,0 +1,94 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional +from google.adk.tools.tool_context import ToolContext +from google.adk.tools.base_tool import BaseTool +from typing import Dict, Any +from veadk.utils.logger import get_logger + +from app.utils import url_shortener + +logger = get_logger(__name__) + + +def hook_short_image_url_to_long( + tool: BaseTool, args: Dict[str, Any], tool_context: ToolContext +) -> Optional[Dict]: + """ + before_tool_callback hook function. + """ + if tool.name == "video_generate": + params = args["params"] + for param in params: + if param.get("first_frame", None): + param["first_frame"] = url_shortener.code2url( + short_id=param["first_frame"] + ) + + +def get_callback_agent_output(success_list: list[dict[str, Any]]) -> str: + """ + Get the callback agent output for video generation. 
+ """ + code_list = [[], [], [], []] + for data in success_list: + try: + key_str = list(data.keys())[0] + value_str = list(data.values())[0] # url + prefix, item = key_str.split("_video_") + shot_num = prefix.split("shot_")[1] + code_list[int(shot_num) - 1].append(value_str) + except Exception as e: + logger.error(f"Error in get_callback_agent_output: {e}") + continue + + html_parts = [] + html_parts.append("\n\n### 视频展示") + for shot_idx, codes in enumerate(code_list): + if not codes: + continue + html_parts.append(f"#### Shot_{shot_idx + 1}\n") + for video_idx, code in enumerate(codes): + video_url = url_shortener.code2url(code) + html_parts.append(f"{video_url} \n\n") + html_parts.append("") + + return "\n\n".join(html_parts) + + +def hook_url_id_mapping( + tool: BaseTool, args: dict[str, Any], tool_context: ToolContext, tool_response: Any +) -> Optional[Any]: + """ + Shorten the URL. + after_tool_callback + """ + tool_name = tool.name + if tool_name == "video_generate": + success_list = tool_response["success_list"] + + tool_context.state["cb_agent_state"] = ( + "\n分镜视频生成任务已经完成,继续执行分镜视频评估工作\n" + ) + tool_context.state["cb_agent_output"] = get_callback_agent_output(success_list) + for data in success_list: + if isinstance(data, dict): + for key, value in data.items(): + if isinstance(value, str): + value = url_shortener.url2code(original_url=value) + data[key] = value + logger.debug(f"Shorten URL of `video_generate` successfully: {success_list}") + return tool_response + return None diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/video/prompt.py b/python/02-use-cases/12_ad_video_gen_seq/app/video/prompt.py new file mode 100644 index 0000000..6b6772e --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/video/prompt.py @@ -0,0 +1,133 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PROMPT_VIDEO_AGENT = """ +# 角色: +你是一个食品饮料行业的电商营销分镜视频生成器,生成电商营销分镜视频 +## 背景介绍 +你属于一个电商营销视频生成流程的一部分,你的任务是最核心的————生成分镜视频 +在你执行之前,已经完成了首帧图片的生成,并且挑选完毕,每个视频的首帧图都已经被挑选出来了。 +你需要根据`image_agent`和`image_evaluate_agent`的输出,来确定使用什么首帧图,进而来生成视频。 +其次,你需要根据`market_agent`的输出,来确定每个分镜需要生成多少个视频,以供用户进行选择。 +(这里我做一些解释,本任务中,每个分镜都会生成多个视频,然后再进行评估挑选,最后将这些最好的合并出来,你的任务就是生成,挑选是后续工作) + +Notice: +1. 生成内容不要使用单引号、双引号等字符。语言用中文,不要用英文。 +2. 输入输出以及运行过程中,任何涉及图片或视频的code(⌥code格式),不要做任何修改。 + +# 任务描述: +1. 你会在历史对话中收到分镜图片,里面包含了每个分镜的图片url和视频描述action字段。 +2. 根据分镜图片列表中的视频描述action字段,生成更详细的视频描述,包括物体、颜色、背景、运镜等。 +按照结构撰写提示词: +动作指令: 主体/其他物体 +动作,按照主体动作发生的先后顺序,条理清晰地描述多个动作,动作流程需要严格符合 +基础运镜: 对推、拉、摇、移、环绕、跟随、升、降、变焦等各类运镜指令做出准确响应,保证运镜效果符合预期。通过有创意的基础运镜且合理 +景别和视角: 运用远景、全景、中景、近景、特写等专业景别描述来精确控制画面展示范围。同时,可选取水下镜头、航拍镜头、高机位俯拍、低机位仰拍、微距摄影等丰富的镜头视角 + +# 参考示例: +(1)大远景, [ 主体 ]静静地放置在用藤蔓编织的秋千上,秋千悬挂于热带雨林中,微风吹过,秋千缓缓自然摆动,绳索随风微微摇晃。阳光和细雨从树叶间洒落,在[ 主体 ]和秋千上形成斑驳的光影,画面安静、写实,氛围温暖、富有节奏感,藤蔓细节清晰,背景虚化的绿色植物随着镜头轻轻晃动。 +(2)一个热带海洋的广角镜头,碧绿透明的海水波光粼粼。[ 主体 ]轻轻漂浮在水面上,背景是白色沙滩和摇曳的椰子树。镜头缓慢推进靠近[ 主体 ],海豚在四周欢快跃出水面,阳光照耀下水面闪闪发光,轻风带来细腻的水波。 +(3)轻柔微风吹动叶片轻柔摆动。镜头从产品标签特写开始,缓慢拉远展现完整场景。斑驳阳光透过百叶窗过滤,形成动态光影图案。浅景深配合散景效果。 + +3. 使用分镜图片中的image url,作为视频生成的首帧图。 +4. 调用`视频生成工具`,生成视频,每个分镜需要生成若干个视频,以供用户进行选择。 + 具体解释一下这一条,当你调用`video_generate`工具的时候,请根据`image_evaluate_agent`所选出的图片进行生成,并且每个分镜根据`market_agent`要求的数量进行生成。 + 比如`market_agent`中每个分镜的视频生成数量为2,那么你就要每个分镜生成2个视频,一共是2*4 = 8个视频。 +同时需要注意,每个视频作为单独的task,组成task列表,调用一次视频生成工具,不要一个视频调用一次视频生成工具。 +5. 
返回分镜视频列表 +(1)shot_id: str, 使用shot_X即可,标识分镜的id +(2)prompt: str, 如何生成分镜图片的详细描述(禁止出现任何声音描述,只能有画面描述) +(3)action: str, 如何生成分镜视频的详细描述 +(4)reference: str, 分镜图片参考,code(⌥code格式) +(5)videos: list, 每个分镜里的视频列表,视频生成工具返回 + 每个视频需要有id和code + id: int, 视频id + code: str, 视频的code(⌥code格式) + +# 注意 +水印:生成的视频必须要开启水印:`--wm true` +注意:当遇到Agent执行异常,如缺少内容,运行出错,结果不完整,用户输入内容不足以完成任务时,请在最后的状态反馈中说明,而不是在业务字段中反馈描述,如有上述问题,业务字段可以为空。只反馈错误即可 + +# 输出规范 +请输出 markdown 文本,参考模板如下(被「」括号括起来的内容是你需要填写的部分): + +## 输出字段说明 +- shot_id:分镜的唯一标识,使用 shot_X 即可 +- prompt:如何生成分镜图片的详细描述(禁止出现任何声音描述,只能有画面描述) +- action:如何生成分镜视频的详细描述 +- reference:分镜图片参考,code(⌥code格式) +- videos:每个分镜里的视频列表,视频生成工具返回 + - id:视频 id + - code: 视频的code(⌥code格式) # 每个分镜有多个视频,请按照分镜顺序来生成。 + +## 输出模板 +```markdown +## 分镜视频生成 + +### 分镜1 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选视频编号**: // 具体数量请参考实际情况 + - 「video_code_1」 + - 「video_code_2」 + - 「video_code_3」 + - 「video_code_4」 + + +### 分镜2 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选视频编号**: // 具体数量请参考实际情况 + - 「video_code_1」 + - 「video_code_2」 + - 「video_code_3」 + - 「video_code_4」 + + +### 分镜3 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选视频编号**: // 具体数量请参考实际情况 + - 「video_code_1」 + - 「video_code_2」 + - 「video_code_3」 + - 「video_code_4」 + + +### 分镜4 +- **shot_id**: 「shot_id」 +- **prompt**: 「prompt」 +- **action**: 「action」 +- **reference**: 「reference」 +- **候选视频编号**: // 具体数量请参考实际情况 + - 「video_code_1」 + - 「video_code_2」 + - 「video_code_3」 + - 「video_code_4」 + +``` + +# 注意事项 +1. 生成内容不要使用单引号、双引号等字符。语言默认使用中文,不要用英文。 +2. 输入输出以及运行过程中,任何涉及图片或视频的code(⌥code格式),不要做任何修改。 +3. 视频风格方面,只要推荐的东西跟动画无关,你就禁止在视频生成工具中提到任何跟动画风格有关的任何内容。 +4. 如果用户的输入不符合要求,或执行过程出现意外,请及时返回错误提示,而不是蛮干 +5. 【‼️重要】候选视频code由视频生成工具提供,该code应该是一个以⌥开头的字符串,包括⌥总长度为6位,形如`⌥Az12K`,请勿丢弃⌥符号,否则无法识别。 +6. 
视频生成工具的`generate_audio`请设置为开启。 +""" diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/video/tools/__init__.py b/python/02-use-cases/12_ad_video_gen_seq/app/video/tools/__init__.py new file mode 100644 index 0000000..67771d2 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/video/tools/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/python/02-use-cases/12_ad_video_gen_seq/app/video/tools/video_generate_by_code.py b/python/02-use-cases/12_ad_video_gen_seq/app/video/tools/video_generate_by_code.py new file mode 100644 index 0000000..337660d --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/app/video/tools/video_generate_by_code.py @@ -0,0 +1,152 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict + +from google.adk.tools import ToolContext +from veadk.tools.builtin_tools.video_generate import ( + video_generate as video_generate_builtin, +) + + +async def video_generate( + params: list, + tool_context: ToolContext, + batch_size: int = 10, + max_wait_seconds: int = 1200, +) -> Dict: + """ + Generate videos in **batch** from text prompts, optionally guided by a first/last frame, + and fine-tuned via *model text commands* (a.k.a. `parameters` appended to the prompt). + + This API creates video-generation tasks. Each item in `params` describes a single video. + The function submits all items in one call and returns task metadata for tracking. + + Args: + params (list[dict]): + A list of video generation requests. Each item supports the fields below. + batch_size (int): + The number of videos to generate in a batch. Defaults to 10. + max_wait_seconds (int): + Maximum time in seconds to wait for all video tasks in each batch. + Default is 20 minutes (1200 seconds). When the timeout is reached, + unfinished tasks will be marked as timeout errors. + + Required per item: + - video_name (str): + Name/identifier of the output video file. + + - prompt (str): + Text describing the video to generate. Supports zh/EN. + You may append **model text commands** after the prompt to control resolution, + aspect ratio, duration, fps, watermark, seed, camera lock, etc. + Format: `... --rs --rt --dur --fps --wm --seed --cf ` + Example: + "小猫骑着滑板穿过公园。 --rs 720p --rt 16:9 --dur 5 --fps 24 --wm true --seed 11 --cf false" + + Optional per item: + - first_frame (str | None): + Code (⌥code格式) or Base64 string (data URL) for the **first frame** (role = `first_frame`). + Use when you want the clip to start from a specific image. + + - last_frame (str | None): + Code (⌥code格式) or Base64 string (data URL) for the **last frame** (role = `last_frame`). + Use when you want the clip to end on a specific image. 
+ + - generate_audio (bool | None): + Boolean value, used to determine whether the generated video should have sound. + If this field is not configured (None) or its value is `False`, no sound will be generated. + If it is configured as `True`, sound can be generated. + If you want to describe the sound content in detail, you can do so in the `prompt` field. + + Notes on first/last frame: + * When both frames are provided, **match width/height** to avoid cropping; if they differ, + the tail frame may be auto-cropped to fit. + * If you only need one guided frame, provide either `first_frame` or `last_frame` (not both). + + Image input constraints (for first/last frame): + - Formats: jpeg, png, webp, bmp, tiff, gif + - Aspect ratio (宽:高): 0.4–2.5 + - Width/Height (px): 300–6000 + - Size: < 30 MB + - Base64 data URL example: `data:image/png;base64,` + + Model text commands (append after the prompt; unsupported keys are ignored by some models): + --rs / --resolution Video resolution. Common values: 480p, 720p, 1080p. + Default depends on model (e.g., doubao-seedance-1-0-pro: 1080p, + some others default 720p). + + --rt / --ratio Aspect ratio. Typical: 16:9 (default), 9:16, 4:3, 3:4, 1:1, 2:1, 21:9. + Some models support `keep_ratio` (keep source image ratio) or `adaptive` + (auto choose suitable ratio). + + --dur / --duration Clip length in seconds. Seedance supports **3–12 s**; + Wan2.1 仅支持 5 s。Default varies by model. + + --fps / --framespersecond Frame rate. Common: 16 or 24 (model-dependent; e.g., seaweed=24, wan2.1=16). + + --wm / --watermark Whether to add watermark. Default: **false** (per doc). + + --seed Random seed in [-1, 2^32-1]. Default **-1** = auto seed. + Same seed may yield similar (not guaranteed identical) results across runs. + + --cf / --camerafixed Lock camera movement. Some models support this flag. + true: try to keep camera fixed; false: allow movement. Default: **false**. 
+ + Returns: + Dict: + API response containing task creation results for each input item. A typical shape is: + { + "status": "success", + "success_list": [{"video_name": "video_url"}], + "error_list": [] + } + + Constraints & Tips: + - Keep prompt concise and focused (建议 ≤ 500 字); too many details may distract the model. + - If using first/last frames, ensure their **aspect ratio matches** your chosen `--rt` to minimize cropping. + - If you must reproduce results, specify an explicit `--seed`. + - Unsupported parameters are ignored silently or may cause validation errors (model-specific). + + Minimal examples: + 1) Text-only batch of two 5-second clips at 720p, 16:9, 24 fps: + params = [ + { + "video_name": "cat_park.mp4", + "prompt": "小猫骑着滑板穿过公园。 --rs 720p --rt 16:9 --dur 5 --fps 24 --wm false" + }, + { + "video_name": "city_night.mp4", + "prompt": "霓虹灯下的城市延时摄影风。 --rs 720p --rt 16:9 --dur 5 --fps 24 --seed 7" + }, + ] + + 2) With guided first/last frame (square, 6 s, camera fixed): + params = [ + { + "video_name": "logo_reveal.mp4", + "first_frame": "⌥abc12", + "last_frame": "⌥xyz34", + "prompt": "品牌 Logo 从线稿到上色的变化。 --rs 1080p --rt 1:1 --dur 6 --fps 24 --cf true" + } + ] + """ + for item in params: + if "prompt" in item: + item["prompt"] = ( + f"(可以有极其轻度的动作音,但禁止任何人声,禁止背景音乐,禁止音效,禁止旁白,禁止解说){item['prompt']}" + ) + return await video_generate_builtin( + params, tool_context, batch_size, max_wait_seconds + ) diff --git a/python/02-use-cases/12_ad_video_gen_seq/config.yaml.example b/python/02-use-cases/12_ad_video_gen_seq/config.yaml.example new file mode 100644 index 0000000..9f2a450 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/config.yaml.example @@ -0,0 +1,24 @@ +model: + agent: + provider: openai + name: doubao-seed-1-6-251015 + api_base: https://ark.cn-beijing.volces.com/api/v3/ + api_key: + image: + name: doubao-seedream-4-5-251128 + api_base: https://ark.cn-beijing.volces.com/api/v3/ + api_key: + video: + name: doubao-seedance-1-5-pro-251215 + 
api_base: https://ark.cn-beijing.volces.com/api/v3/ + api_key: + evaluate: + name: doubao-seed-1-6-251015 # 必须是一个支持图片输入的模型 +volcengine: + access_key: + secret_key: + +database: + tos: + bucket: + diff --git a/python/02-use-cases/12_ad_video_gen_seq/debug.py b/python/02-use-cases/12_ad_video_gen_seq/debug.py new file mode 100644 index 0000000..4360317 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/debug.py @@ -0,0 +1,128 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +os.environ["LOGGING_LEVEL"] = "ERROR" +import time +import asyncio +from google.adk.sessions import Session +from google.adk.agents import RunConfig +from google.adk.agents.run_config import StreamingMode +from google.adk.events import Event +from google.genai import types +from veadk import Runner +from veadk.memory import ShortTermMemory + +from app.root import get_root_agent + + +async def export_session(session_service, app_name, user_id, session_id, file_path): + session = await session_service.get_session( + app_name=app_name, user_id=user_id, session_id=session_id + ) + if session: + with open(file_path, "w", encoding="utf-8") as f: + f.write(session.model_dump_json(indent=2, exclude_none=True, by_alias=True)) + + +async def import_session(session_service, app_name, user_id, file_path): + with open(file_path, "r", encoding="utf-8") as f: + loaded_session = Session.model_validate_json(f.read()) + + # 创建新session并复制状态 + new_session = await session_service.create_session( + app_name=app_name, + user_id=user_id, + session_id=SESSION_ID, + state=loaded_session.state, + ) + + # Append all events + for event in loaded_session.events: + await session_service.append_event(new_session, event) + + return new_session + + +APP_NAME = "debug_app" +USER_ID = "debug_user" +SESSION_ID = "debug_session" + +root_agent = get_root_agent() + +short_term_memory = ShortTermMemory( + backend="local", +) + +runner = Runner( + agent=root_agent, + short_term_memory=short_term_memory, + app_name=APP_NAME, + user_id=USER_ID, +) + + +async def main(prompt: types.Content, stream: bool = False): + # await import_session(runner.session_service, APP_NAME, USER_ID, "session.json") + await short_term_memory.create_session( + app_name=APP_NAME, + user_id=USER_ID, + session_id=SESSION_ID, + ) + start_time = time.time() + event_author = "" + async for event in runner.run_async( + user_id=USER_ID, + session_id=SESSION_ID, + new_message=prompt, + run_config=RunConfig( + 
streaming_mode=StreamingMode.SSE if stream else StreamingMode.NONE, + ), + ): + if isinstance(event, Event): + if event_author != event.author: + print(f"Author: {event.author} ---------------------------------------") + event_author = event.author + if stream: + if ( + event.partial + and event.content.parts + and event.content.parts[0].text + ): + print(event.content.parts[0].text, end="", flush=True) + elif not event.partial: + print() + else: + if event.content.parts: + for part in event.content.parts: + if not part.thought and part.text: + print(part.text) + print() + end_time = time.time() + print(f"Execution time: {end_time - start_time} seconds") + # await export_session(runner.session_service, APP_NAME, USER_ID, SESSION_ID, "session.json") + + +if __name__ == "__main__": + prompt = "帮我生成杨梅饮料的宣传视频(商品展示视频),图片素材为:https://ark-tutorial.tos-cn-beijing.volces.com/multimedia/%E6%9D%A8%E6%A2%85%E9%A5%AE%E6%96%99.jpg 每个分镜两个首帧图,两条视频" + request = types.Content( + role="user", + parts=[ + types.Part( + text=prompt, + ) + ], + ) + asyncio.run(main(request, stream=False)) diff --git a/python/02-use-cases/12_ad_video_gen_seq/main.py b/python/02-use-cases/12_ad_video_gen_seq/main.py new file mode 100644 index 0000000..ca49814 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/main.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +from agentkit.apps import AgentkitAgentServerApp +from veadk.memory import ShortTermMemory +from app import root_agent + +short_term_memory = ShortTermMemory(backend="local") + +agent_server_app = AgentkitAgentServerApp( + agent=root_agent, + short_term_memory=short_term_memory, +) + +fastapi_app = getattr(agent_server_app, "app", None) +if fastapi_app is not None and os.getenv("DISABLE_OPENAPI", "true").lower() in { + "1", + "true", + "yes", + "on", +}: + openapi_url = getattr(fastapi_app, "openapi_url", "/openapi.json") + docs_url = getattr(fastapi_app, "docs_url", "/docs") + redoc_url = getattr(fastapi_app, "redoc_url", "/redoc") + oauth2_redirect_url = getattr( + fastapi_app, "swagger_ui_oauth2_redirect_url", "/docs/oauth2-redirect" + ) + blocked_paths = { + openapi_url, + docs_url, + redoc_url, + oauth2_redirect_url, + } + + if hasattr(fastapi_app, "router") and hasattr(fastapi_app.router, "routes"): + fastapi_app.router.routes = [ + route + for route in fastapi_app.router.routes + if getattr(route, "path", None) not in blocked_paths + ] + + fastapi_app.openapi_url = None + fastapi_app.docs_url = None + fastapi_app.redoc_url = None + +if __name__ == "__main__": + agent_server_app.run(host="0.0.0.0", port=8000) diff --git a/python/02-use-cases/12_ad_video_gen_seq/pyproject.toml b/python/02-use-cases/12_ad_video_gen_seq/pyproject.toml new file mode 100644 index 0000000..b50291d --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/pyproject.toml @@ -0,0 +1,17 @@ +[project] +name = "veadk-multimedia-sample" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "veadk-python==0.5.5", + "uvicorn>=0.38.0", + "moviepy>=2.2.1", + "requests>=2.32.5", +] + +[dependency-groups] +dev = [ + "pre-commit>=4.5.1", +] diff --git a/python/02-use-cases/12_ad_video_gen_seq/requirements.txt b/python/02-use-cases/12_ad_video_gen_seq/requirements.txt new file mode 100644 index 
0000000..8647556 --- /dev/null +++ b/python/02-use-cases/12_ad_video_gen_seq/requirements.txt @@ -0,0 +1,257 @@ +a2a-sdk==0.3.22 +agent-pilot-sdk==0.1.2 +agentkit-sdk-python==0.3.3 +aiohappyeyeballs==2.6.1 +aiohttp==3.13.3 +aiomysql==0.3.2 +aiosignal==1.4.0 +aiosqlite==0.22.1 +alembic==1.17.2 +annotated-doc==0.0.4 +annotated-types==0.7.0 +antlr4-python3-runtime==4.9.3 +anyio==4.12.0 +arrow==1.4.0 +asyncpg==0.31.0 +attrs==25.4.0 +authlib==1.6.6 +banks==2.2.0 +beartype==0.22.9 +beautifulsoup4==4.14.3 +binaryornot==0.4.4 +cachetools==6.2.4 +certifi==2026.1.4 +cffi==2.0.0 +chardet==5.2.0 +charset-normalizer==3.4.4 +chevron==0.14.0 +click==8.3.1 +cloudpickle==3.1.2 +colorama==0.4.6 +cookiecutter==2.6.0 +crcmod==1.7 +cryptography==46.0.3 +cyclopts==4.4.4 +dataclasses-json==0.6.7 +decorator==5.2.1 +defusedxml==0.7.1 +deprecated==1.2.18 +dirtyjson==1.0.8 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.8.0 +docker==7.1.0 +docstring-parser==0.17.0 +docutils==0.22.4 +email-validator==2.3.0 +et-xmlfile==2.0.0 +events==0.5 +exceptiongroup==1.3.1 +fakeredis==2.33.0 +fastapi==0.123.10 +fastmcp==2.14.0 +fastuuid==0.14.0 +filelock==3.20.2 +filetype==1.2.0 +frozenlist==1.8.0 +fsspec==2025.12.0 +google==3.0.0 +google-adk==1.21.0 +google-api-core==2.28.1 +google-api-python-client==2.187.0 +google-auth==2.45.0 +google-auth-httplib2==0.3.0 +google-cloud-aiplatform==1.132.0 +google-cloud-appengine-logging==1.7.0 +google-cloud-audit-log==0.4.0 +google-cloud-bigquery==3.39.0 +google-cloud-bigquery-storage==2.36.0 +google-cloud-bigtable==2.35.0 +google-cloud-core==2.5.0 +google-cloud-discoveryengine==0.13.12 +google-cloud-logging==3.13.0 +google-cloud-monitoring==2.28.0 +google-cloud-resource-manager==1.15.0 +google-cloud-secret-manager==2.26.0 +google-cloud-spanner==3.61.0 +google-cloud-speech==2.35.0 +google-cloud-storage==3.7.0 +google-cloud-trace==1.17.0 +google-crc32c==1.8.0 +google-genai==1.56.0 +google-resumable-media==2.8.0 +googleapis-common-protos==1.72.0 +graphviz==0.21 
+greenlet==3.3.0 +griffe==1.15.0 +grpc-google-iam-v1==0.14.3 +grpc-interceptor==0.15.4 +grpcio==1.76.0 +grpcio-status==1.76.0 +h11==0.16.0 +hf-xet==1.2.0 +httpcore==1.0.9 +httplib2==0.31.0 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface-hub==0.36.0 +idna==3.11 +imageio==2.37.2 +imageio-ffmpeg==0.6.0 +importlib-metadata==8.7.1 +jaraco-classes==3.4.0 +jaraco-context==6.0.2 +jaraco-functools==4.4.0 +jinja2==3.1.6 +jiter==0.12.0 +joblib==1.5.3 +json-repair==0.55.0 +jsonpickle==4.1.1 +jsonschema==4.25.1 +jsonschema-path==0.3.4 +jsonschema-specifications==2025.9.1 +keyring==25.7.0 +litellm==1.80.0 +llama-cloud==0.1.35 +llama-cloud-services==0.6.54 +llama-index==0.14.0 +llama-index-cli==0.5.1 +llama-index-core==0.14.12 +llama-index-embeddings-openai==0.5.1 +llama-index-embeddings-openai-like==0.2.2 +llama-index-indices-managed-llama-cloud==0.9.4 +llama-index-instrumentation==0.4.2 +llama-index-llms-openai==0.5.6 +llama-index-llms-openai-like==0.5.1 +llama-index-readers-file==0.5.6 +llama-index-readers-llama-parse==0.5.1 +llama-index-vector-stores-opensearch==0.6.1 +llama-index-workflows==2.11.6 +llama-parse==0.6.54 +loguru==0.7.3 +lupa==2.6 +mako==1.3.10 +markdown-it-py==4.0.0 +markupsafe==3.0.3 +marshmallow==3.26.2 +mcp==1.25.0 +mdurl==0.1.2 +mmh3==5.2.0 +more-itertools==10.8.0 +moviepy==2.2.1 +multidict==6.7.0 +mypy-extensions==1.1.0 +nest-asyncio==1.6.0 +networkx==3.6.1 +nltk==3.9.2 +numpy==2.4.0 +omegaconf==2.3.0 +openai==1.99.9 +openapi-pydantic==0.5.1 +openpyxl==3.1.5 +opensearch-py==2.8.0 +opentelemetry-api==1.37.0 +opentelemetry-exporter-gcp-logging==1.11.0a0 +opentelemetry-exporter-gcp-monitoring==1.11.0a0 +opentelemetry-exporter-gcp-trace==1.11.0 +opentelemetry-exporter-otlp==1.37.0 +opentelemetry-exporter-otlp-proto-common==1.37.0 +opentelemetry-exporter-otlp-proto-grpc==1.37.0 +opentelemetry-exporter-otlp-proto-http==1.37.0 +opentelemetry-exporter-prometheus==0.58b0 +opentelemetry-instrumentation==0.58b0 +opentelemetry-instrumentation-logging==0.58b0 
+opentelemetry-proto==1.37.0 +opentelemetry-resourcedetector-gcp==1.11.0a0 +opentelemetry-sdk==1.37.0 +opentelemetry-semantic-conventions==0.58b0 +packaging==25.0 +pandas==2.3.3 +pathable==0.4.4 +pathvalidate==3.3.1 +pillow==11.3.0 +platformdirs==4.5.1 +proglog==0.1.12 +prometheus-client==0.23.1 +prompt-toolkit==3.0.52 +propcache==0.4.1 +proto-plus==1.27.0 +protobuf==6.33.2 +psycopg2-binary==2.9.10 +py==1.11.0 +py-key-value-aio==0.3.0 +py-key-value-shared==0.3.0 +pyarrow==22.0.0 +pyasn1==0.6.1 +pyasn1-modules==0.4.2 +pycparser==2.23 +pycryptodome==3.23.0 +pydantic==2.12.5 +pydantic-core==2.41.5 +pydantic-settings==2.10.1 +pydocket==0.15.4 +pyfiglet==1.0.4 +pygments==2.19.2 +pyjwt==2.10.1 +pymysql==1.1.1 +pyparsing==3.3.1 +pypdf==6.5.0 +pyperclip==1.11.0 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==4.0.0 +python-multipart==0.0.21 +python-slugify==8.0.4 +pytz==2025.2 +pyyaml==6.0.3 +redis==7.1.0 +referencing==0.36.2 +regex==2025.11.3 +requests==2.32.5 +retry==0.9.2 +rich==14.2.0 +rich-rst==1.3.2 +rpds-py==0.30.0 +rsa==4.9.1 +safetensors==0.7.0 +setuptools==80.9.0 +shapely==2.1.2 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.8.1 +sqlalchemy==2.0.45 +sqlalchemy-spanner==1.17.2 +sqlparse==0.5.5 +sse-starlette==3.1.2 +starlette==0.50.0 +striprtf==0.0.26 +tenacity==9.1.2 +text-unidecode==1.3 +tiktoken==0.12.0 +tokenizers==0.22.2 +tos==2.8.7 +tqdm==4.67.1 +transformers==4.57.3 +trustedmcp==0.0.5 +typer==0.21.0 +typing-extensions==4.15.0 +typing-inspect==0.9.0 +typing-inspection==0.4.2 +tzdata==2025.3 +tzlocal==5.3.1 +uritemplate==4.2.0 +urllib3==2.6.2 +uv==0.9.21 +uvicorn==0.40.0 +veadk-python==0.5.5 +vikingdb-python-sdk==0.1.3 +volcengine==1.0.212 +volcengine-python-sdk==5.0.3 +watchdog==6.0.0 +wcwidth==0.2.14 +websockets==15.0.1 +wrapt==1.17.2 +yarl==1.22.0 +zipp==3.23.0