🔀 构建企业级AI API网关:多模型调度与成本优化实战
当你的AI应用接入3个以上LLM提供商、日调用量超过10万次时,直接调用各厂商API的原始模式会迅速崩溃:API Key管理混乱、成本无法追踪、故障无自动切换、延迟无法优化。本文将手把手教你构建企业级AI API网关,解决这些痛点。
一、为什么需要AI API网关?
一个典型的中型AI应用可能同时使用:
- GPT-4o 处理复杂推理任务
- Claude 3.5 Sonnet 处理长文本分析
- Qwen2.5-72B 处理中文对话
- DeepSeek-V3 处理代码生成
- Gemini 2.0 Flash 处理多模态和低成本任务
直接管理这些API会遇到四大问题:
- Key散落:每个服务各自管理API Key,泄露风险高
- 成本黑洞:无法按项目/团队/模型维度追踪Token消耗
- 单点故障:某个提供商宕机,整个系统瘫痪
- 调度低效:不同任务应路由到不同模型,但缺少智能分发机制
二、主流开源方案对比
2026年最成熟的三个AI API网关方案:
LiteLLM Proxy:最活跃的开源项目,支持100+模型提供商,内置负载均衡和故障转移,Python生态友好。
OneAPI:国人开发,中文文档完善,支持多令牌管理和自定义渠道分组,适合国内团队。
New API:基于OneAPI重构,UI更现代化,增加了额度管理、兑换码等功能,适合SaaS场景。
LiteLLM Proxy 核心配置
# litellm_config.yaml
#
# LiteLLM Proxy configuration: five model aliases, latency-based routing with
# retries/cooldown, and a Postgres-backed key store.
# Secrets are never written inline; every credential is resolved from the
# process environment through the "os.environ/<NAME>" syntax.
model_list:
  # GPT-4o - complex reasoning tasks
  - model_name: "gpt-4o"
    litellm_params:
      model: "openai/gpt-4o"
      api_key: "os.environ/OPENAI_API_KEY"
      rpm: 60  # requests-per-minute limit for this deployment
      max_tokens: 4096

  # GPT-4o standby deployment (different region). It shares the "gpt-4o"
  # alias with the entry above, so the router can balance / fail over
  # between the two.
  # NOTE(review): Azure deployments usually also need `api_version` - confirm.
  - model_name: "gpt-4o"
    litellm_params:
      model: "azure/gpt-4o-eastus"
      api_base: "os.environ/AZURE_ENDPOINT"
      api_key: "os.environ/AZURE_API_KEY"
      rpm: 60

  # Claude 3.5 Sonnet
  - model_name: "claude-sonnet"
    litellm_params:
      model: "anthropic/claude-3-5-sonnet-20241022"
      api_key: "os.environ/ANTHROPIC_API_KEY"
      rpm: 50

  # Qwen2.5-72B (through Alibaba Cloud DashScope's OpenAI-compatible endpoint)
  - model_name: "qwen-72b"
    litellm_params:
      model: "openai/qwen2.5-72b-instruct"
      api_key: "os.environ/DASHSCOPE_API_KEY"
      api_base: "https://dashscope.aliyuncs.com/compatible-mode/v1"
      rpm: 120

  # DeepSeek-V3
  - model_name: "deepseek-v3"
    litellm_params:
      model: "deepseek/deepseek-chat"
      api_key: "os.environ/DEEPSEEK_API_KEY"
      rpm: 100

router_settings:
  routing_strategy: "latency-based-routing"  # route by observed latency
  num_retries: 3
  timeout: 30
  allowed_fails: 2
  cooldown_time: 30  # seconds a deployment is benched after failing
  enable_pre_call_checks: true
  context_window_fallbacks:
    - gpt-4o: ["claude-sonnet"]  # fall back when the context window overflows

general_settings:
  # Keep the master key and the database DSN (which embeds a password) out of
  # version control: both are read from the environment at start-up.
  master_key: "os.environ/LITELLM_MASTER_KEY"
  database_url: "os.environ/DATABASE_URL"
  store_model_in_db: true
三、智能路由策略实现
核心路由策略有三种,你可以组合使用:
1. 基于成本的路由
import asyncio

from litellm import Router

# All three deployments are registered under the single alias "smart-router",
# so the router is free to choose among them for every request.
router = Router(
    model_list=[
        {"model_name": "smart-router", "litellm_params": {"model": "gpt-4o"}},
        {"model_name": "smart-router", "litellm_params": {"model": "deepseek/deepseek-chat"}},
        {"model_name": "smart-router", "litellm_params": {"model": "openai/qwen2.5-72b-instruct"}},
    ],
    routing_strategy="cost-based-routing",  # prefer the cheapest deployment
    # NOTE(review): confirm that the installed LiteLLM version's Router
    # accepts `budget_limits`; it is kept exactly as in the original snippet.
    budget_limits={
        "daily_budget": 100.0,     # USD per day
        "monthly_budget": 2000.0,  # USD per month
    },
)


async def main():
    """Send one chat request through the "smart-router" alias and return the reply.

    `await` is only legal inside a coroutine, so the call lives here instead
    of at module level (where it would be a SyntaxError in a normal script).
    """
    return await router.acompletion(
        model="smart-router",
        messages=[{"role": "user", "content": "你好"}],
    )


# Routed automatically to the cheapest available deployment.
response = asyncio.run(main())
2. 基于任务类型的路由
import re
from litellm import Router
class TaskBasedRouter:
    """Choose a model alias by keyword-matching the most recent chat message."""

    def __init__(self):
        # The Ellipsis is the article's placeholder: supply the real
        # deployment list before using this class.
        self.router = Router(model_list=[...])
        # Patterns are tried in insertion order and the first hit wins, so
        # "code" takes precedence over "creative" for text such as "写代码".
        self.task_patterns = {
            "code": re.compile(r"(代码|函数|debug|编程|python|java|bug|修复)", re.I),
            "creative": re.compile(r"(写|创作|故事|文案|诗歌|小说)", re.I),
            "analysis": re.compile(r"(分析|总结|对比|评估|研究)", re.I),
            "qa": re.compile(r"(什么是|怎么|为什么|如何|请解释)", re.I),
        }
        self.model_mapping = {
            "code": "deepseek-v3",        # coding tasks -> DeepSeek
            "creative": "claude-sonnet",  # creative writing -> Claude
            "analysis": "gpt-4o",         # analysis -> GPT-4o
            "qa": "qwen-72b",             # question answering -> Qwen
            "default": "qwen-72b",        # anything else -> cheapest option
        }

    @staticmethod
    def _as_text(content) -> str:
        """Flatten message content to plain text so it can be regex-matched.

        Strings are returned unchanged. For a list of content parts, bare
        strings and the "text" field of dict parts are joined with spaces.
        Anything else (None, numbers, ...) becomes an empty string.
        """
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            pieces = []
            for part in content:
                if isinstance(part, str):
                    pieces.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    pieces.append(part["text"])
            return " ".join(pieces)
        return ""

    def classify_and_route(self, user_message: str) -> str:
        """Return the model alias for *user_message*.

        The alias of the first pattern in ``task_patterns`` that matches is
        returned; if none matches, the "default" alias is returned. Non-string
        content is flattened with ``_as_text`` first instead of raising
        ``TypeError`` inside ``pattern.search``.
        """
        text = self._as_text(user_message)
        for task_type, pattern in self.task_patterns.items():
            if pattern.search(text):
                return self.model_mapping[task_type]
        return self.model_mapping["default"]

    async def chat(self, messages: list):
        """Classify the last message, then forward *messages* to the chosen model.

        An empty *messages* list is classified as empty text (default alias)
        rather than raising ``IndexError``; the request is still forwarded so
        the underlying router decides how to treat it.
        """
        user_msg = messages[-1].get("content", "") if messages else ""
        model = self.classify_and_route(user_msg)
        print(f"[路由] 任务类型识别 → 模型: {model}")
        return await self.router.acompletion(model=model, messages=messages)
3. 基于质量评分的路由(A/B测试)
class QualityAwareRouter:
"""基于质量评分动态调整路由权重"""
def __init__(self, router: Router):
self.router = router
self.quality_scores = {
"gpt-4o": {"score": 0.95, "cost_per_1k": 0.005},
"claude-sonnet": {"score": 0.93, "cost_per_1k": 0.003},
"qwen-72b": {"score": 0.88, "cost_per_1k": 0.001},
"deepseek-v3": {"score": 0.90, "cost_per_1k": 0.0005},
}
def get_best_model(self, quality_threshold: float = 0.85,
budget_per_1k: float = 0.003) -> str:
"""在质量阈值内选择最便宜的模型"""
candidates = [
(name, info) for name, info in self.quality_scores.items()
if info["score"] >= quality_threshold
]
if not candidates:
return "gpt-4o" # 兜底
# 在满足预算的候选中选质量最高的
budget_candidates = [
(name, info) for name, info in candidates
if info["cost_per_1k"] <= budget_per_1k
]
if budget_candidates:
return max(budget_candidates, key=lambda x: x[1]["score"])[0]
return min(candidates, key=lambda x: x[1]["cost_per_1k"])[0]
四、语义缓存:降低重复调用成本
对于相似的用户查询,语义缓存可以避免重复调用LLM,节省30-60%的API成本:
import hashlib
import numpy as np
from sentence_transformers import SentenceTransformer
import redis
class SemanticCache:
def __init__(self, redis_url="redis://localhost:6379",
similarity_threshold=0.92):
self.redis = redis.from_url(redis_url)
self.encoder = SentenceTransformer("BAAI/bge-m3")
self.threshold = similarity_threshold
def _get_embedding(self, text: str) -> np.ndarray:
return self.encoder.encode(text, normalize_embeddings=True)
def get(self, query: str) -> str | None:
"""查找语义相似的缓存结果"""
query_embedding = self._get_embedding(query)
# 从Redis中检索所有缓存的embedding
for key in self.redis.scan_iter("llm_cache:*"):
cached = self.redis.hgetall(key)
cached_embedding = np.frombuffer(cached[b"embedding"], dtype=np.float32)
similarity = np.dot(query_embedding, cached_embedding)
if similarity >= self.threshold:
print(f"[缓存命中] 相似度: {similarity:.4f}")
return cached[b"response"].decode("utf-8")
return None
def set(self, query: str, response: str, model: str, ttl: int = 3600):
"""缓存查询结果"""
embedding = self._get_embedding(query)
cache_key = f"llm_cache:{hashlib.md5(query.encode()).hexdigest()}"
self.redis.hset(cache_key, mapping={
"query": query,
"response": response,
"model": model,
"embedding": embedding.tobytes()
})
self.redis.expire(cache_key, ttl)
五、监控与告警
from prometheus_client import Histogram, Counter, Gauge
# Metric definitions.

# Request latency in seconds, labelled by model and request status.
REQUEST_LATENCY = Histogram(
    name="llm_request_latency_seconds",
    documentation="LLM请求延迟",
    labelnames=["model", "status"],
)

# Cumulative token consumption; the "type" label is either prompt or completion.
TOKEN_USAGE = Counter(
    name="llm_tokens_total",
    documentation="Token消耗总量",
    labelnames=["model", "type"],
)

# Number of in-flight requests per model.
ACTIVE_REQUESTS = Gauge(
    name="llm_active_requests",
    documentation="当前活跃请求数",
    labelnames=["model"],
)

# Cumulative error count (a counter, despite the name), by model and error type.
ERROR_RATE = Counter(
    name="llm_errors_total",
    documentation="错误请求数",
    labelnames=["model", "error_type"],
)
# Alerting rules in Prometheus rule-file format. The rules are evaluated by
# the Prometheus server; Alertmanager only routes the alerts that fire.
#
# - HighLLMLatency: histogram_quantile() must be fed per-bucket rates grouped
#   by `le`; applying it to the bare metric name yields no usable result.
# - HighErrorRate: both sides are aggregated to the `model` label so the
#   division matches series (the raw series carry different label sets:
#   error_type vs. status).
# - BudgetExceeded: `llm_monthly_cost_usd` is not defined among the metrics
#   visible here - NOTE(review): confirm it is exported elsewhere.
alert_rules = """
groups:
  - name: llm_alerts
    rules:
      - alert: HighLLMLatency
        expr: histogram_quantile(0.95, sum by (le) (rate(llm_request_latency_seconds_bucket[5m]))) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "LLM P95延迟超过10秒"
      - alert: HighErrorRate
        expr: sum by (model) (rate(llm_errors_total[5m])) / sum by (model) (rate(llm_request_latency_seconds_count[5m])) > 0.1
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "LLM错误率超过10%"
      - alert: BudgetExceeded
        expr: llm_monthly_cost_usd > 2000
        labels:
          severity: critical
        annotations:
          summary: "月度LLM费用超出预算"
"""
六、Token预算管理
class TokenBudgetManager:
    """Track token consumption per project and model in Redis counters."""

    def __init__(self, redis_client):
        """Keep the Redis client used for all counter reads and writes."""
        self.redis = redis_client

    def check_budget(self, project_id: str, model: str,
                     estimated_tokens: int) -> bool:
        """Return True if *estimated_tokens* still fits in the monthly budget.

        Raises:
            BudgetExceededError: when the current monthly usage plus
                *estimated_tokens* would exceed the limit.

        NOTE(review): ``_get_limit`` and ``BudgetExceededError`` are not
        defined in this snippet - confirm they are provided elsewhere.
        """
        monthly_key = f"budget:{project_id}:{model}:monthly"
        current_usage = int(self.redis.get(monthly_key) or 0)
        monthly_limit = self._get_limit(project_id, model)
        if current_usage + estimated_tokens > monthly_limit:
            raise BudgetExceededError(
                f"项目 {project_id} 的 {model} 月度额度已用尽 "
                f"({current_usage}/{monthly_limit} tokens)"
            )
        return True

    @staticmethod
    def _period_end_timestamps(now=None):
        """Return ``(next_month_start, next_day_start)`` as UTC Unix seconds.

        Args:
            now: Timezone-aware datetime to measure from; defaults to the
                current UTC time.
        """
        from datetime import datetime, timedelta, timezone

        if now is None:
            now = datetime.now(timezone.utc)
        else:
            now = now.astimezone(timezone.utc)
        day_start = now.replace(hour=0, minute=0, second=0, microsecond=0)
        next_day = day_start + timedelta(days=1)
        # Day 28 exists in every month and 28 + 4 days always lands in the
        # following month, so snapping to day 1 gives the next month's start.
        next_month = (day_start.replace(day=28) + timedelta(days=4)).replace(day=1)
        return int(next_month.timestamp()), int(next_day.timestamp())

    def record_usage(self, project_id: str, model: str,
                     prompt_tokens: int, completion_tokens: int):
        """Add the tokens of one request to the monthly and daily counters.

        Each counter is set to expire at the end of the current UTC month /
        day. A relative TTL refreshed on every write (the previous behaviour)
        keeps getting pushed forward under steady traffic, so the counters
        would never reset; an absolute deadline is idempotent and makes the
        counters roll over at the period boundary.
        """
        monthly_key = f"budget:{project_id}:{model}:monthly"
        daily_key = f"budget:{project_id}:{model}:daily"
        total = prompt_tokens + completion_tokens
        month_end, day_end = self._period_end_timestamps()

        pipe = self.redis.pipeline()
        pipe.incrby(monthly_key, total)
        pipe.incrby(daily_key, total)
        pipe.expireat(monthly_key, month_end)
        pipe.expireat(daily_key, day_end)
        pipe.execute()
总结
构建企业级AI API网关的核心要素:
- 统一接入:用LiteLLM Proxy或OneAPI统一管理所有模型提供商
- 智能路由:基于成本、任务类型、质量三个维度组合调度,并可叠加基于延迟的路由
- 语义缓存:对相似查询缓存结果,节省30-60%成本
- 预算管控:按团队/项目/模型精细管理Token额度
- 监控告警:Prometheus + Grafana实时监控延迟、错误率和成本
这套架构已经在多个日调用量百万级的AI产品中验证,平均降低40%的API成本,同时将可用性提升到99.9%以上。