
2026 年构建 LLM 应用意味着要在一个成熟但快速发展的生态系统中工作。RAG 减少幻觉;函数调用实现结构化输出;流式响应改善用户体验。
RAG(检索增强生成)
import json
from typing import List

import numpy as np
from openai import AsyncOpenAI
from openai import OpenAI
# Module-level OpenAI client shared by the functions in this file that call the API.
client = OpenAI()
def chunk_document(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Document to split; it is tokenized with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size`` so the window always moves forward.

    Returns:
        The chunks in document order, each re-joined with single spaces.
        Empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks: List[str] = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would hold
        # only words already covered by this one, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks
def embed(text: str) -> List[float]:
    """Return the embedding vector for ``text``.

    Sends ``text`` as the single input to the ``text-embedding-3-small``
    model through the module-level ``client`` and returns the embedding of
    the first item in the response.
    """
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    first_item = response.data[0]
    return first_item.embedding
def cosine_similarity(a, b) -> float:
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: First vector, as a sequence or array of numbers.
        b: Second vector, with the same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms, as a Python ``float``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned in that case
        instead of NaN so that callers can still sort by the result.
    """
    a, b = np.asarray(a), np.asarray(b)
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)
def retrieve(query: str, index: list, top_k: int = 5) -> list:
    """Return the ``top_k`` entries of ``index`` most similar to ``query``.

    The query is embedded once with ``embed``; every entry is then ranked by
    the cosine similarity between that vector and the entry's ``'embedding'``
    value, highest first. ``index`` itself is left unmodified.
    """
    query_vector = embed(query)

    def similarity_to_query(entry):
        return cosine_similarity(query_vector, entry['embedding'])

    ranked = list(index)
    ranked.sort(key=similarity_to_query, reverse=True)
    return ranked[:top_k]
def answer_with_rag(query: str, index: list) -> str:
    """Answer ``query`` from the documents retrieved out of ``index``.

    The documents returned by ``retrieve`` are rendered as "Source: <title>"
    sections separated by blank lines, and sent to ``gpt-4o`` together with
    a system prompt that restricts the answer to that context. Returns the
    content of the first choice's message.
    """
    retrieved = retrieve(query, index)
    sections = [f"Source: {doc['title']}\n{doc['content']}" for doc in retrieved]
    context = "\n\n".join(sections)

    system_message = {
        "role": "system",
        "content": "Answer ONLY using the provided context. Cite your sources.",
    }
    user_message = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {query}",
    }
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        temperature=0.1,
    )
    return completion.choices[0].message.content

函数调用实现结构化输出
# Tool definitions passed to the chat completions API. There is a single
# function tool, ``search_products``, whose arguments are described by a
# JSON Schema object.
tools = [
    {
        "type": "function",
        "function": {
            "name": "search_products",
            "description": "Search the product catalog",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string"},
                    # Restricted to the three listed categories.
                    "category": {
                        "type": "string",
                        "enum": ["electronics", "clothing", "books"],
                    },
                    "max_price": {"type": "number"},
                },
                # Only the search text is mandatory.
                "required": ["query"],
            },
        },
    },
]
def process_with_tools(user_message: str, history: list) -> str:
    """Answer ``user_message``, letting the model call ``search_products``.

    Args:
        user_message: The new user turn.
        history: Earlier chat messages. The list is not modified; a new
            message list is built from it.

    Returns:
        The content of the model's reply. If the first reply requested tool
        calls, each call is executed, its result is appended as a ``tool``
        message, and the content of a second completion is returned instead.
    """
    messages = history + [{"role": "user", "content": user_message}]
    resp = client.chat.completions.create(model="gpt-4o", messages=messages, tools=tools)
    msg = resp.choices[0].message
    if not msg.tool_calls:
        return msg.content

    # Keep the assistant message that requested the calls ahead of the results.
    messages.append(msg)
    for call in msg.tool_calls:
        if call.function.name != "search_products":
            # Unknown tool names get an empty result.
            result = {}
        else:
            # The arguments are model-generated text, so they may not be
            # valid JSON or may not decode to an object. Report that back
            # to the model rather than crashing the whole request.
            try:
                args = json.loads(call.function.arguments)
            except json.JSONDecodeError as exc:
                result = {"error": f"Invalid JSON arguments: {exc}"}
            else:
                if isinstance(args, dict):
                    result = search_products(**args)
                else:
                    result = {"error": "Arguments must be a JSON object"}
        messages.append({"role": "tool", "tool_call_id": call.id, "content": json.dumps(result)})

    final = client.chat.completions.create(model="gpt-4o", messages=messages)
    return final.choices[0].message.content

流式响应
from fastapi.responses import StreamingResponse
# The module-level ``client`` is a synchronous ``OpenAI`` instance, so its
# ``create`` call cannot be awaited or iterated with ``async for``. The
# streaming endpoint uses a dedicated asynchronous client instead.
_async_client = AsyncOpenAI()


@app.post("/chat/stream")
async def chat_stream(request: ChatRequest):
    """Stream a chat completion for ``request.message`` as server-sent events.

    Each non-empty content delta is emitted as a ``data:`` line holding a
    JSON object with a ``text`` key; a final ``data: [DONE]`` event marks
    the end of the stream.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream``.
    """
    async def generate():
        stream = await _async_client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": request.message}],
            stream=True,
        )
        async for chunk in stream:
            # A chunk may carry no choices; skip it instead of letting
            # ``choices[0]`` raise IndexError mid-stream.
            if not chunk.choices:
                continue
            text = chunk.choices[0].delta.content
            if text:
                yield f"data: {json.dumps({'text': text})}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")

成本优化
# Cache embeddings (deterministic)
from functools import lru_cache
@lru_cache(maxsize=10000)
def cached_embed(text: str) -> tuple:
    """Return the embedding of ``text`` as a tuple, memoized per distinct text.

    ``lru_cache`` keeps up to 10000 results, so repeated texts do not call
    ``embed`` again while they remain in the cache.
    """
    vector = embed(text)
    return tuple(vector)
# Route to cheaper model for simple queries
def select_model(query: str) -> str:
    """Choose the chat model for ``query`` based on how it starts.

    Args:
        query: The user's question.

    Returns:
        ``"gpt-4o-mini"`` when the query, ignoring case and leading
        whitespace, starts with one of the simple-question prefixes;
        ``"gpt-4o"`` otherwise.
    """
    simple_patterns = ('what is', 'define ', 'list the', 'when was')
    # Leading whitespace would otherwise hide the prefix and send a simple
    # question to the more expensive model.
    normalized = query.lstrip().lower()
    return "gpt-4o-mini" if normalized.startswith(simple_patterns) else "gpt-4o"
# Compress context to stay within token budget
def compress_context(docs: list, max_tokens: int = 2000, chars_per_token: int = 4) -> str:
    """Join document contents, in order, until a character budget is used up.

    Token counts are approximated from text length: the budget is
    ``max_tokens * chars_per_token`` characters.

    Args:
        docs: Documents in priority order; each must have a ``'content'`` key.
        max_tokens: Approximate token budget for the combined context.
        chars_per_token: Assumed average number of characters per token.
            Defaults to 4, the value this function previously hard-coded.

    Returns:
        The selected contents joined by blank lines. Selection stops at the
        first document that would exceed the budget, so later documents are
        dropped even if they would fit. The separators are not counted
        against the budget.
    """
    budget = max_tokens * chars_per_token
    selected, used = [], 0
    for doc in docs:
        content = doc['content']
        if used + len(content) > budget:
            break
        selected.append(content)
        used += len(content)
    return "\n\n".join(selected)
评估
def evaluate_rag(cases: list, rag_fn) -> dict:
    """Run ``rag_fn`` over evaluation cases and average the resulting scores.

    Args:
        cases: Evaluation cases; each must provide ``'question'``,
            ``'expected'`` and ``'context'`` keys.
        rag_fn: Callable that takes a question and returns an answer.

    Returns:
        A dict with ``'avg_relevance'``, ``'avg_faithfulness'`` and ``'n'``
        (the number of cases evaluated). With no cases, both averages are
        ``0.0`` and ``'n'`` is ``0`` rather than raising ZeroDivisionError.
    """
    results = []
    for case in cases:
        answer = rag_fn(case['question'])
        results.append({
            'question': case['question'],
            'answer': answer,
            'relevance': check_relevance(answer, case['expected']),
            'faithfulness': check_faithfulness(answer, case['context']),
        })

    count = len(results)
    if count == 0:
        # Nothing to average; ``n == 0`` tells the caller the scores are placeholders.
        return {'avg_relevance': 0.0, 'avg_faithfulness': 0.0, 'n': 0}

    avg_relevance = sum(r['relevance'] for r in results) / count
    avg_faithfulness = sum(r['faithfulness'] for r in results) / count
    return {'avg_relevance': avg_relevance, 'avg_faithfulness': avg_faithfulness, 'n': count}
LLM 应用需要不同的思维方式:概率系统需要从一开始就进行评估,而不是事后才考虑。
→ 使用 JSON Viewer 工具格式化你的提示模板。