
向量嵌入与语义搜索
生成嵌入
from sentence_transformers import SentenceTransformer
import numpy as np
# Multilingual embedding model; per the original note it produces
# 1024-dimensional vectors.
model = SentenceTransformer("BAAI/bge-m3")

sentences = [
    "The quick brown fox jumps over the lazy dog",
    "A fast orange fox leaped above a sleepy canine",
    "Python is a programming language",
]

# normalize_embeddings=True requests unit-length vectors, so the dot product
# of two rows can be read as their cosine similarity.
embeddings = model.encode(sentences, normalize_embeddings=True)

fox_vec, paraphrase_vec, python_vec = embeddings
# Expected values below are taken from the original note, not re-measured.
print(np.dot(fox_vec, paraphrase_vec))  # ~0.92 - semantically similar
print(np.dot(fox_vec, python_vec))  # ~0.15 - different topic

Qdrant 向量数据库
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import uuid
# Connect to a Qdrant server listening on localhost:6333.
client = QdrantClient(host="localhost", port=6333)

# create_collection fails when the collection already exists, so only create
# "docs" when it is missing; this keeps the script re-runnable.
# The vector size (1024) must match the embedding model's output dimension.
if not client.collection_exists("docs"):
    client.create_collection(
        "docs",
        vectors_config=VectorParams(size=1024, distance=Distance.COSINE),
    )
def index(documents):
    """Embed documents and upload them to the "docs" collection.

    Args:
        documents: Iterable of dicts, each with a "text" key. The whole dict
            is stored as the point payload.

    Every point receives a fresh random UUID, so indexing the same document
    twice creates two separate points.
    """
    # Materialise once: the input is traversed twice (texts, then zip), which
    # would silently index nothing if a generator were passed in.
    docs = list(documents)
    if not docs:
        return  # nothing to embed or upload

    texts = [doc["text"] for doc in docs]
    vectors = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
    points = [
        PointStruct(id=str(uuid.uuid4()), vector=vec.tolist(), payload=doc)
        for doc, vec in zip(docs, vectors)
    ]
    client.upload_points("docs", points=points, batch_size=100)
def search(query, limit=10, score_threshold=0.7):
    """Return documents from the "docs" collection that are similar to ``query``.

    Args:
        query: Query text; it is embedded with the module-level ``model``.
        limit: Maximum number of hits to return.
        score_threshold: Minimum score a hit must reach. Defaults to 0.7,
            the value that was previously hard-coded; pass ``None`` to
            disable the cut-off.

    Returns:
        A list of ``{"text": ..., "score": ...}`` dicts, in the order the
        hits are returned by Qdrant.
    """
    query_vector = model.encode([query], normalize_embeddings=True)[0]
    # query_points supersedes the deprecated client.search and is the same
    # entry point the hybrid-search example in this file uses.
    response = client.query_points(
        "docs",
        query=query_vector.tolist(),
        limit=limit,
        score_threshold=score_threshold,
    )
    return [{"text": hit.payload["text"], "score": hit.score} for hit in response.points]

混合搜索(稠密 + 稀疏,通过 RRF 融合)
# Hybrid retrieval: a dense and a sparse search run as prefetch stages, then
# Qdrant merges the two candidate lists with RRF fusion.
# NOTE(review): `models`, `dense_emb` and `sparse_vec` are not defined in this
# snippet - presumably `from qdrant_client import models` plus precomputed
# query vectors; confirm against the surrounding code.
dense_stage = models.Prefetch(query=dense_emb.tolist(), using="dense", limit=20)
sparse_stage = models.Prefetch(query=sparse_vec, using="sparse", limit=20)
fusion = models.FusionQuery(fusion=models.Fusion.RRF)
results = client.query_points(
    "hybrid_docs",
    prefetch=[dense_stage, sparse_stage],
    query=fusion,
    limit=10,
)

FAISS ANN 索引
import faiss
class FAISSIndex:
    """Approximate nearest-neighbour index built on a FAISS IVF-Flat index.

    Vectors are compared by inner product; for unit-length embeddings this is
    the cosine similarity.

    Args:
        dims: Dimensionality of the indexed vectors.
        nlist: Number of inverted lists (clusters) the index is trained with.
            Defaults to 1024, the value that was previously hard-coded.
        nprobe: Number of lists visited per query. Defaults to 32, the value
            that was previously hard-coded.
    """

    def __init__(self, dims=1024, nlist=1024, nprobe=32):
        quantizer = faiss.IndexFlatIP(dims)
        # IndexIVFFlat uses L2 unless told otherwise; pass the metric
        # explicitly so the index agrees with the inner-product quantizer.
        self.index = faiss.IndexIVFFlat(
            quantizer, dims, nlist, faiss.METRIC_INNER_PRODUCT
        )
        self.index.nprobe = nprobe

    def build(self, embeddings):
        """Train the index on ``embeddings`` and then add them to it.

        Args:
            embeddings: 2-D array-like of vectors, one row per item.
        """
        # FAISS expects C-contiguous float32 input; this is a no-op copy-wise
        # when the array already has that layout.
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
        self.index.train(vectors)
        self.index.add(vectors)

    def search(self, query, k=10):
        """Look up the ``k`` nearest stored vectors for a single query vector.

        Args:
            query: 1-D array-like query vector.
            k: Number of neighbours to request.

        Returns:
            Whatever ``self.index.search`` returns for a one-row query batch
            (for FAISS, a pair of score and id arrays).
        """
        batch = np.ascontiguousarray(query, dtype=np.float32).reshape(1, -1)
        return self.index.search(batch, k)
基于嵌入的推荐
from sklearn.metrics.pairwise import cosine_similarity
class RecommendationEngine:
    """Content-based recommender that ranks items by embedding similarity."""

    def __init__(self):
        # Filled in by fit(); starting empty lets recommend() return [] on an
        # unfitted engine instead of failing on a missing attribute.
        self.embeddings = None
        self.ids = []

    def fit(self, items):
        """Embed every item's title and description and remember its id.

        Args:
            items: Iterable of dicts with "title", "description" and "id" keys.
        """
        # Materialise once: the items are traversed twice below.
        catalog = list(items)
        texts = [f"{item['title']} {item['description']}" for item in catalog]
        self.embeddings = model.encode(texts, normalize_embeddings=True)
        self.ids = [item["id"] for item in catalog]

    def recommend(self, user_history, n=10):
        """Recommend up to ``n`` items similar to the user's history.

        Args:
            user_history: Iterable of item ids the user interacted with.
                Ids unknown to the engine are ignored; repeated ids weigh
                proportionally more in the user profile.
            n: Maximum number of recommendations.

        Returns:
            Item ids ordered from most to least similar, excluding everything
            in ``user_history``. Returns an empty list when no history item
            is known (cold start) or the averaged profile is a zero vector.
        """
        history = list(user_history)

        # Map each id to its first position, matching list.index semantics
        # without a linear scan per history entry.
        first_position = {}
        for position, item_id in enumerate(self.ids):
            first_position.setdefault(item_id, position)

        known = [first_position[item_id] for item_id in history if item_id in first_position]
        if not known:
            # Averaging zero rows would yield NaN; there is no signal to rank by.
            return []

        user_emb = self.embeddings[known].mean(axis=0)
        norm = np.linalg.norm(user_emb)
        if norm == 0:
            # History vectors cancelled out; normalising would divide by zero.
            return []
        user_emb = user_emb / norm

        # fit() stores unit-length rows and user_emb is unit-length, so the
        # dot product is the cosine similarity.
        scores = np.dot(self.embeddings, user_emb)
        ranked = np.argsort(scores)[::-1]
        seen = set(history)
        return [self.ids[i] for i in ranked if self.ids[i] not in seen][:n]
生产实践建议
| 问题 | 解决方案 |
| --- | --- |
| 索引新鲜度 | 基于队列的实时更新 |
| 冷启动 | 基于内容的回退 |
| 延迟 | ANN + L1 缓存 |
| 质量 | A/B 测试嵌入模型 |