代码

import base64
import wave
from pathlib import Path
from typing import Iterator

from agno.agent import Agent, RunResponse
from agno.models.openai import OpenAIChat

# Audio parameters applied to the output WAV file.
SAMPLE_RATE = 24_000  # samples per second (24 kHz)
CHANNELS = 1  # mono
SAMPLE_WIDTH = 16 // 8  # bytes per sample (16-bit audio)

# Chat model configured to request both text and audio modalities.
audio_model = OpenAIChat(
    id="gpt-4o-audio-preview",
    modalities=["text", "audio"],
    # The "pcm16" format is required for streaming.
    audio={"voice": "alloy", "format": "pcm16"},
)

# Agent wrapping the audio model, with debug output and message history on.
agent = Agent(
    model=audio_model,
    debug_mode=True,
    add_history_to_messages=True,
)

# Ask the question with streaming enabled so the answer arrives in chunks.
output_stream: Iterator[RunResponse] = agent.run(
    "金毛寻回犬适合做家庭犬吗?",
    stream=True,
)

# wave.open() does not create missing directories, so make sure the output
# folder exists before opening the file for writing.
output_path = Path("tmp/answer_1.wav")
output_path.parent.mkdir(parents=True, exist_ok=True)

# wave.open() is given a str because older Python versions only treat str
# arguments as file names.
with wave.open(str(output_path), "wb") as wav_file:
    wav_file.setnchannels(CHANNELS)
    wav_file.setsampwidth(SAMPLE_WIDTH)
    wav_file.setframerate(SAMPLE_RATE)

    for response in output_stream:
        audio = response.response_audio
        if not audio:
            continue

        # Echo the transcript fragment as soon as it arrives.
        if audio.transcript:
            print(audio.transcript, end="", flush=True)

        if not audio.content:
            continue

        # Only the decode step is guarded: a malformed chunk is reported and
        # skipped, while file-write errors are allowed to propagate.
        # binascii.Error (raised on bad base64) is a ValueError subclass.
        try:
            pcm_bytes = base64.b64decode(audio.content)
        except (ValueError, TypeError) as e:
            print(f"解码音频时出错: {e}")
            continue

        wav_file.writeframes(pcm_bytes)

# Terminate the transcript line printed without trailing newlines above.
print()

用法

1

创建虚拟环境

打开终端并创建一个 Python 虚拟环境。
python3 -m venv .venv
source .venv/bin/activate
2

设置您的 API 密钥

export OPENAI_API_KEY=xxx
3

安装库

pip install -U openai agno
4

运行代理

python cookbook/agent_concepts/multimodal/audio_streaming.py