
在生产环境中部署大语言模型
在生产环境中运行大语言模型(LLM)成本高昂且对延迟敏感。本指南介绍降低成本和提升性能的实用技术。
使用 vLLM 进行模型服务
# Install vLLM.
pip install vllm

# Serve the model through vLLM's OpenAI-compatible API server.
# Each flag sits on its own continuation line so it can be changed in isolation.
python -m vllm.entrypoints.openai.api_server \
    --model=meta-llama/Llama-3-8B-Instruct \
    --tensor-parallel-size=2 \
    --max-model-len=4096 \
    --gpu-memory-utilization=0.90 \
    --max-num-seqs=256
from vllm import LLM, SamplingParams

# Offline (in-process) inference: load the model once, then generate for a
# list of prompts in a single call.
#
# NOTE: `quantization="awq"` is deliberately NOT passed here. That option only
# works with a checkpoint that was already quantized with AWQ (its config must
# carry the AWQ quantization settings); passing it together with a regular
# full-precision checkpoint such as the one below makes model loading fail.
# To run quantized, point `model` at an AWQ checkpoint and add
# `quantization="awq"`.
# NOTE(review): confirm the exact Hugging Face repo id before running.
llm = LLM(
    model="meta-llama/Llama-3-8B-Instruct",
    tensor_parallel_size=2,
)

prompts = ["解释 Python 中的 async/await", "什么是 Docker?"]
sampling = SamplingParams(temperature=0.7, max_tokens=512)
outputs = llm.generate(prompts, sampling)

提示缓存
import hashlib
import redis
import json
class CachedLLMClient:
    """Exact-match response cache in front of an OpenAI-style chat client.

    A request is identified by the SHA-256 of its messages plus every
    generation keyword argument (including the model name). Identical
    requests within the TTL are answered from Redis instead of calling the
    model again. Note that this also applies when sampling with a non-zero
    temperature: repeats get the first sampled answer back.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        redis_client: Object exposing ``get(key)`` and
            ``setex(key, ttl, value)``.
        ttl: Lifetime of a cached entry in seconds (default: one hour).
        model: Model used when a call does not pass ``model=`` itself.
    """

    def __init__(self, client, redis_client, ttl=3600, model="gpt-4o-mini"):
        self.client = client
        self.redis = redis_client
        self.ttl = ttl
        self.model = model

    def _cache_key(self, messages, **kwargs):
        """Return the Redis key for a request.

        The model is always part of the hashed payload so that two models
        never share a cache entry for the same prompt.
        """
        kwargs.setdefault("model", self.model)
        content = json.dumps({"messages": messages, "kwargs": kwargs}, sort_keys=True)
        return f"llm:{hashlib.sha256(content.encode()).hexdigest()}"

    def generate(self, messages, **kwargs):
        """Return the completion text for ``messages``, using the cache.

        Args:
            messages: Chat messages in the format the client expects.
            **kwargs: Extra arguments forwarded to
                ``chat.completions.create``. ``model`` may be passed here to
                override the client-wide default for a single call.

        Returns:
            The content of the first choice, either freshly generated or
            decoded from the cache.
        """
        # Resolve the model once so the cache key and the API call agree, and
        # so a caller-supplied `model=` does not collide with a hard-coded one.
        kwargs.setdefault("model", self.model)
        cache_key = self._cache_key(messages, **kwargs)

        cached = self.redis.get(cache_key)
        if cached is not None:
            return json.loads(cached)

        response = self.client.chat.completions.create(messages=messages, **kwargs)
        result = response.choices[0].message.content

        self.redis.setex(cache_key, self.ttl, json.dumps(result))
        return result
请求批处理
import asyncio
from dataclasses import dataclass
@dataclass
class BatchRequest:
    """One queued request together with the future its caller awaits."""

    messages: list
    future: asyncio.Future


class BatchedLLMClient:
    """Collects concurrent requests briefly and issues them as a group.

    The first request to land in an empty queue starts a flush task. That
    task waits ``max_wait_ms`` so other requests can join, then drains the
    queue in chunks of at most ``max_batch_size``, running the calls of each
    chunk concurrently.

    Args:
        max_batch_size: Maximum number of requests issued together (>= 1).
        max_wait_ms: How long the flush task waits for more requests.
        client: Optional async client exposing an awaitable
            ``chat.completions.create(...)``; used by the default
            ``_call_single``. NOTE(review): assumed to be an async
            OpenAI-style client -- confirm against the caller.
        model: Model name passed by the default ``_call_single``.

    Raises:
        ValueError: If ``max_batch_size`` is smaller than 1.
    """

    def __init__(self, max_batch_size=10, max_wait_ms=50, client=None, model="gpt-4o-mini"):
        if max_batch_size < 1:
            raise ValueError("max_batch_size must be at least 1")
        self.max_batch_size = max_batch_size
        self.max_wait_ms = max_wait_ms
        self.client = client
        self.model = model
        self.queue = []
        self.lock = asyncio.Lock()
        # Strong references to running flush tasks: the event loop itself only
        # keeps weak ones, so an unreferenced task may be garbage-collected.
        self._tasks = set()

    async def generate(self, messages):
        """Queue ``messages`` and return the response once its batch ran.

        Re-raises whatever exception the underlying call raised for this
        request.
        """
        future = asyncio.get_running_loop().create_future()
        async with self.lock:
            self.queue.append(BatchRequest(messages, future))
            # Only the request that makes the queue non-empty starts a flush;
            # the flush task keeps draining until the queue is empty again.
            if len(self.queue) == 1:
                task = asyncio.create_task(self._process_batch())
                self._tasks.add(task)
                task.add_done_callback(self._tasks.discard)
        return await future

    async def _call_single(self, messages):
        """Issue one request; override to plug in a different backend."""
        if self.client is None:
            raise NotImplementedError(
                "pass client=... to BatchedLLMClient or override _call_single"
            )
        response = await self.client.chat.completions.create(
            model=self.model,
            messages=messages,
        )
        return response.choices[0].message.content

    async def _process_batch(self):
        """Wait for the batching window, then drain the queue chunk by chunk."""
        await asyncio.sleep(self.max_wait_ms / 1000)
        while True:
            async with self.lock:
                batch = self.queue[:self.max_batch_size]
                self.queue = self.queue[self.max_batch_size:]
            if not batch:
                # Queue observed empty: the next request will start a new flush.
                return

            # return_exceptions=True keeps one failing call from leaving the
            # other callers of the batch waiting forever.
            responses = await asyncio.gather(
                *(self._call_single(req.messages) for req in batch),
                return_exceptions=True,
            )
            for req, resp in zip(batch, responses):
                if req.future.done():
                    # The caller gave up (e.g. was cancelled) in the meantime.
                    continue
                if isinstance(resp, BaseException):
                    req.future.set_exception(resp)
                else:
                    req.future.set_result(resp)

量化以降低成本
from vllm import LLM

# AWQ 4-bit quantization cuts VRAM requirements about 4x with minimal
# quality loss.
#
# Memory comparison for a 7B model:
#   full precision (float16): 14 GB VRAM
#   AWQ 4-bit:                ~4 GB VRAM
llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ", quantization="awq")
成本优化:模型路由
class LLMRouter:
    """Routes each prompt to a cheap or a strong model via a simple heuristic.

    A prompt counts as "complex" only when it is at least
    ``short_prompt_chars`` characters long AND contains one of
    ``complex_keywords`` (case-insensitive). Everything else is "simple".

    Args:
        client: Object exposing ``chat.completions.create(...)``. May be
            omitted and assigned to ``self.client`` later.
        simple_model: Model for "simple" prompts (default ``gpt-4o-mini``).
        complex_model: Model for "complex" prompts (default ``gpt-4o``).
    """

    # Class-level defaults, so subclasses can override them declaratively.
    simple_model = "gpt-4o-mini"
    complex_model = "gpt-4o"
    complex_keywords = ("code", "analyze", "compare")
    short_prompt_chars = 100

    def __init__(self, client=None, simple_model=None, complex_model=None):
        # Only assign what was actually provided, so a `client` defined on a
        # subclass (or set on the instance afterwards) is not shadowed.
        if client is not None:
            self.client = client
        if simple_model is not None:
            self.simple_model = simple_model
        if complex_model is not None:
            self.complex_model = complex_model

    def classify_complexity(self, prompt):
        """Return ``"simple"`` or ``"complex"`` for ``prompt``."""
        if len(prompt) < self.short_prompt_chars:
            return "simple"
        lowered = prompt.lower()  # lower-case once, not once per keyword
        if any(kw in lowered for kw in self.complex_keywords):
            return "complex"
        return "simple"

    def generate(self, prompt):
        """Send ``prompt`` to the model chosen for its complexity.

        Returns:
            The content of the first choice of the completion.
        """
        complexity = self.classify_complexity(prompt)
        # The smaller model is the cheaper one, so it handles the simple
        # traffic. NOTE(review): the original note put the price gap at about
        # 10x -- check current pricing.
        model = self.simple_model if complexity == "simple" else self.complex_model
        response = self.client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

监控
from prometheus_client import Histogram, Counter
# Request duration histogram, labelled by model.
LATENCY = Histogram(
    "llm_request_duration_seconds",
    "LLM 延迟",
    labelnames=["model"],
)

# Token counter, labelled by model and by token type.
TOKENS = Counter(
    "llm_tokens_total",
    "总 token 数",
    labelnames=["model", "type"],
)
def monitored_generate(client, messages, model):
    """Run one chat completion and record its latency and token usage.

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        messages: Chat messages forwarded to the client.
        model: Model name; also used as the ``model`` label on both metrics.

    Returns:
        The content of the first choice of the completion.

    Raises:
        Whatever ``client.chat.completions.create`` raises; the latency of a
        failed call is still recorded before the exception propagates.
    """
    import time

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    try:
        response = client.chat.completions.create(model=model, messages=messages)
    finally:
        LATENCY.labels(model=model).observe(time.perf_counter() - start)

    # Skip token accounting when the response carries no usage block instead
    # of failing a request that otherwise succeeded.
    usage = getattr(response, "usage", None)
    if usage is not None:
        TOKENS.labels(model=model, type="prompt").inc(usage.prompt_tokens)
        TOKENS.labels(model=model, type="completion").inc(usage.completion_tokens)
    return response.choices[0].message.content
总结
生产环境中的 LLM 部署需要:
- vLLM/TGI(Text Generation Inference):专为最大化吞吐量设计的推理服务框架
- 缓存:对重复查询进行精确匹配缓存(如上文基于哈希键的 Redis 缓存);对措辞不同但语义相近的查询,可进一步引入基于向量相似度的语义缓存
- 批处理:将请求分组以提高 GPU 利用率
- 路由:将简单任务发送到更便宜的模型
- 量化:AWQ/GGUF 等 4-bit 量化可将 VRAM 需求降低约 3.5–4 倍(例如 7B 模型从 14GB 降至约 4GB),质量损失极小
- 监控:跟踪每个模型的延迟和 token 数,并根据 token 用量估算成本