使用 RAG、函数调用和生产模式构建 LLM 应用

构建生产级 LLM 应用：用 RAG 实现知识接地，用函数调用获得结构化输出，并涵盖流式响应、成本优化、提示工程和评估。

2026 年构建 LLM 应用意味着要在一个成熟但快速发展的生态系统中工作。RAG 减少幻觉；函数调用实现结构化输出；流式响应改善用户体验。

RAG（检索增强生成）

import json
from typing import List

import numpy as np
from openai import OpenAI

# Module-level client instance; the request helpers in this file all go through it.
client = OpenAI()

def chunk_document(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one.

    Args:
        text: Document text; it is tokenized with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared by neighbouring chunks. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each joined with single spaces. An
        empty or whitespace-only *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A non-positive stride would either make range() fail with an opaque
    # message or silently produce no chunks at all.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a chunk reaches the end of the document, any further window
        # would only repeat words already covered by this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

def embed(text: str) -> List[float]:
    """Return the embedding of *text* from the ``text-embedding-3-small`` model.

    Sends one embeddings request through the module-level ``client`` and
    returns the ``embedding`` field of the first item in the response data.
    """
    response = client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    first_item = response.data[0]
    return first_item.embedding

def cosine_similarity(a, b) -> float:
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        a: First vector (any sequence NumPy can convert to a 1-D array).
        b: Second vector, same length as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a Python float. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        NaN so that callers sorting by this score get a well-defined order.
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)
    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # Guard the 0/0 case: it would yield NaN plus a RuntimeWarning.
    if denominator == 0.0:
        return 0.0
    return float(np.dot(vec_a, vec_b) / denominator)

def retrieve(query: str, index: list, top_k: int = 5) -> list:
    """Return the ``top_k`` entries of *index* most similar to *query*.

    The query is embedded once; every entry is scored by cosine similarity
    between that embedding and the entry's ``'embedding'`` value. The input
    list is left untouched; a sorted copy is sliced and returned.
    """
    query_vector = embed(query)

    def score(doc):
        return cosine_similarity(query_vector, doc['embedding'])

    ranked = list(index)
    ranked.sort(key=score, reverse=True)
    return ranked[:top_k]

def answer_with_rag(query: str, index: list) -> str:
    """Answer *query* with ``gpt-4o`` using documents retrieved from *index*.

    The retrieved documents are rendered as ``Source: <title>`` blocks
    separated by blank lines and sent together with the question; the system
    message instructs the model to answer only from that context. Returns the
    content of the first completion choice.
    """
    sections = []
    for doc in retrieve(query, index):
        sections.append(f"Source: {doc['title']}\n{doc['content']}")
    context = "\n\n".join(sections)

    system_message = {
        "role": "system",
        "content": "Answer ONLY using the provided context. Cite your sources.",
    }
    user_message = {
        "role": "user",
        "content": f"Context:\n{context}\n\nQuestion: {query}",
    }
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        temperature=0.1,
    )
    return completion.choices[0].message.content

（插图：使用 RAG、函数调用和生产模式构建 LLM 应用）

函数调用实现结构化输出

# Tool definitions passed as `tools=` to the first chat completion call in
# process_with_tools. A single function tool, "search_products", is declared.
tools = [{
    "type": "function",
    "function": {
        "name": "search_products",
        "description": "Search the product catalog",
        # Schema of the arguments the tool accepts.
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string"},
                # Limited to the three listed category values.
                "category": {"type": "string", "enum": ["electronics", "clothing", "books"]},
                "max_price": {"type": "number"}
            },
            # Only "query" is listed as required; the other two are optional.
            "required": ["query"]
        }
    }
}]

def process_with_tools(user_message: str, history: list) -> str:
    """Run one chat turn, executing any tool calls the model requests.

    Args:
        user_message: The new user message for this turn.
        history: Prior messages. The list is not mutated; a new list is built
            from it for the request.

    Returns:
        The assistant's reply text. When the first response contains tool
        calls, each call is executed, its JSON-encoded result is appended as
        a ``tool`` message, and the reply of a second completion is returned.
        Otherwise the first response's content is returned directly.
    """
    messages = history + [{"role": "user", "content": user_message}]
    resp = client.chat.completions.create(model="gpt-4o", messages=messages, tools=tools)
    msg = resp.choices[0].message

    if not msg.tool_calls:
        return msg.content

    messages.append(msg)
    for call in msg.tool_calls:
        fn_name = call.function.name
        # The arguments string is model-generated and is not guaranteed to be
        # valid JSON or a JSON object. Report the problem back to the model as
        # the tool result instead of crashing the whole request.
        try:
            args = json.loads(call.function.arguments)
        except json.JSONDecodeError as exc:
            result = {"error": f"invalid JSON arguments: {exc}"}
        else:
            if not isinstance(args, dict):
                result = {"error": "tool arguments must be a JSON object"}
            elif fn_name == "search_products":
                result = search_products(**args)
            else:
                # An empty result would read as "nothing found"; name the
                # actual problem so the model can recover.
                result = {"error": f"unknown tool: {fn_name}"}
        # Every tool call must be answered with a matching tool message.
        messages.append({"role": "tool", "tool_call_id": call.id, "content": json.dumps(result)})

    final = client.chat.completions.create(model="gpt-4o", messages=messages)
    return final.choices[0].message.content

（插图：使用 RAG、函数调用和生产模式构建 LLM 应用）

流式响应

from fastapi.responses import StreamingResponse

@app.post("/chat/stream")
async def chat_stream(request: ChatRequest):
    """Stream a ``gpt-4o`` reply to ``request.message`` as server-sent events.

    Each non-empty content delta is emitted as ``data: {"text": ...}``,
    followed by a final ``data: [DONE]`` event.
    """
    def generate():
        # `client` in this file is the synchronous OpenAI client, whose
        # streaming response is a plain iterator: it can be neither awaited
        # nor consumed with `async for`. A regular generator is used instead.
        # NOTE(review): StreamingResponse is expected to iterate a sync
        # generator off the event loop — confirm against the Starlette docs.
        stream = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": request.message}],
            stream=True
        )
        for chunk in stream:
            # A chunk may arrive with an empty `choices` list; skip it rather
            # than fail on the index.
            if not chunk.choices:
                continue
            text = chunk.choices[0].delta.content
            if text:
                yield f"data: {json.dumps({'text': text})}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")

（插图：使用 RAG、函数调用和生产模式构建 LLM 应用）

成本优化

# Cache embeddings (deterministic)
from functools import lru_cache

@lru_cache(maxsize=10000)
def cached_embed(text: str) -> tuple:
    """Return the embedding of *text* as a tuple, memoized per distinct text.

    Up to 10000 results are kept by the LRU cache; the list returned by
    ``embed`` is converted to a tuple before it is stored and returned.
    """
    vector = embed(text)
    return tuple(vector)

# Route to cheaper model for simple queries
def select_model(query: str) -> str:
    """Pick the model name for *query* using a simple prefix heuristic.

    Queries that begin (case-insensitively, ignoring leading whitespace) with
    one of a few lookup-style phrases are routed to ``"gpt-4o-mini"``; every
    other query, including the empty string, goes to ``"gpt-4o"``.
    """
    simple_prefixes = ('what is', 'define ', 'list the', 'when was')
    # Strip leading whitespace first so an indented or newline-prefixed
    # question still matches its prefix.
    normalized = query.lstrip().lower()
    return "gpt-4o-mini" if normalized.startswith(simple_prefixes) else "gpt-4o"

# Compress context to stay within token budget
def compress_context(docs: list, max_tokens: int = 2000, chars_per_token: int = 4) -> str:
    """Join document contents in order until an approximate token budget is hit.

    The budget is estimated in characters as ``max_tokens * chars_per_token``.
    Documents are taken in the given order; the first one that would push the
    running total over the budget stops the scan, and neither it nor any
    later document is included. The ``"\\n\\n"`` separators are not counted.

    Args:
        docs: Dicts with a ``'content'`` string.
        max_tokens: Approximate token budget for the combined context.
        chars_per_token: Characters assumed per token for the estimate.
            Defaults to 4.

    Returns:
        The kept contents joined by blank lines; ``""`` when nothing fits.
    """
    char_budget = max_tokens * chars_per_token
    kept, used = [], 0
    for doc in docs:
        text = doc['content']
        if used + len(text) > char_budget:
            break
        kept.append(text)
        used += len(text)
    return "\n\n".join(kept)

评估

def evaluate_rag(cases: list, rag_fn) -> dict:
    """Run *rag_fn* over evaluation cases and average the two quality scores.

    Args:
        cases: Dicts with ``'question'``, ``'expected'`` and ``'context'``.
        rag_fn: Callable that takes a question and returns an answer.

    Returns:
        A dict with ``'avg_relevance'``, ``'avg_faithfulness'`` and ``'n'``
        (the number of cases evaluated). With no cases both averages are
        reported as ``0.0`` and ``'n'`` as ``0``.
    """
    results = []
    for case in cases:
        answer = rag_fn(case['question'])
        results.append({
            'question': case['question'],
            'answer': answer,
            'relevance': check_relevance(answer, case['expected']),
            'faithfulness': check_faithfulness(answer, case['context']),
        })

    n = len(results)
    # An empty case list would otherwise divide by zero below.
    if n == 0:
        return {'avg_relevance': 0.0, 'avg_faithfulness': 0.0, 'n': 0}

    avg_relevance = sum(r['relevance'] for r in results) / n
    avg_faithfulness = sum(r['faithfulness'] for r in results) / n
    return {'avg_relevance': avg_relevance, 'avg_faithfulness': avg_faithfulness, 'n': n}

LLM 应用需要不同的思维方式：概率系统需要从一开始就进行评估，而不是事后才考虑。

→ 使用 JSON Viewer 工具格式化你的提示模板。

页面加载失败

使用 RAG、函数调用和生产模式构建 LLM 应用

RAG（检索增强生成）

函数调用实现结构化输出

流式响应

成本优化

评估