
数据库中的向量
pgvector 为 PostgreSQL 添加了向量操作——无需单独的向量数据库。将嵌入向量与关系数据一起存储。

设置
-- 启用 pgvector 扩展
CREATE EXTENSION vector;
-- 为 products 表添加嵌入向量列
ALTER TABLE products ADD COLUMN embedding vector(1536); -- OpenAI ada-002 维度
-- 或者创建新表
CREATE TABLE documents (
id BIGSERIAL PRIMARY KEY,
content TEXT NOT NULL,
metadata JSONB,
embedding vector(1536),
created_at TIMESTAMPTZ DEFAULT NOW()
);
存储嵌入向量
import { OpenAI } from 'openai'
import { Pool } from 'pg'
const openai = new OpenAI()
const pool = new Pool({ connectionString: process.env.DATABASE_URL })
async function embedAndStore(text: string, metadata: Record<string, any>) {
// 从 OpenAI 获取嵌入向量
const response = await openai.embeddings.create({
model: 'text-embedding-ada-002',
input: text,
})
const embedding = response.data[0].embedding // 1536 维数组
// 存储到 PostgreSQL
const result = await pool.query(
'INSERT INTO documents (content, metadata, embedding) VALUES ($1, $2, $3) RETURNING id',
[text, JSON.stringify(metadata), JSON.stringify(embedding)]
)
return result.rows[0].id
}

相似性搜索
-- L2 距离(欧几里得)——值越小越相似
SELECT id, content, embedding <-> $1 AS distance
FROM documents
ORDER BY embedding <-> $1
LIMIT 10;
-- 余弦相似度——值越大越相似
SELECT id, content, 1 - (embedding <=> $1) AS similarity
FROM documents
ORDER BY embedding <=> $1
LIMIT 10;
-- 内积
SELECT id, content, (embedding <#> $1) * -1 AS score
FROM documents
ORDER BY embedding <#> $1
LIMIT 10;
async function semanticSearch(query: string, topK = 10) {
// 对查询进行嵌入
const response = await openai.embeddings.create({
model: 'text-embedding-ada-002',
input: query,
})
const queryEmbedding = response.data[0].embedding
// 搜索
const result = await pool.query(
`SELECT id, content, metadata, 1 - (embedding <=> $1) AS similarity
FROM documents
WHERE 1 - (embedding <=> $1) > 0.7 -- 最小相似度阈值
ORDER BY embedding <=> $1
LIMIT $2`,
[JSON.stringify(queryEmbedding), topK]
)
return result.rows
}
HNSW 索引(快速近似搜索)
-- IVFFlat(适用于大多数情况)
CREATE INDEX ON documents USING ivfflat (embedding vector_cosine_ops)
WITH (lists = 100); -- lists = sqrt(rows)
-- HNSW(查询更快,构建更慢,内存更多)
CREATE INDEX ON documents USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- m: 每个节点的连接数(越大越准确,内存越多)
-- ef_construction: 构建质量(越大越准确,构建越慢)
-- 设置搜索深度(越大越准确,但更慢)
SET hnsw.ef_search = 100;

混合搜索(向量 + 关键词)
-- 使用 RRF 结合语义搜索和全文搜索
WITH semantic AS (
SELECT id, ROW_NUMBER() OVER (ORDER BY embedding <=> $1) AS rank
FROM documents
LIMIT 50
),
keyword AS (
SELECT id, ROW_NUMBER() OVER (ORDER BY ts_rank(search_vector, query) DESC) AS rank
FROM documents, to_tsquery('english', $2) query
WHERE search_vector @@ query
LIMIT 50
),
rrf AS (
SELECT
COALESCE(s.id, k.id) AS id,
COALESCE(1.0 / (60 + s.rank), 0) + COALESCE(1.0 / (60 + k.rank), 0) AS score
FROM semantic s
FULL OUTER JOIN keyword k USING (id)
)
SELECT d.id, d.content, r.score
FROM rrf r
JOIN documents d ON d.id = r.id
ORDER BY r.score DESC
LIMIT 10;
大规模性能
对于 100 万+ 向量:
- 使用 HNSW,参数为
m=16, ef_construction=128 - 在索引构建期间设置
maintenance_work_mem = 4GB - 使用
pgvector.ivfflat.probes = 10以获得更好的召回率 - 考虑使用 pgvectorscale 实现大规模下 28 倍更快的查询
- 或者对于 1 亿+ 向量,使用专用向量数据库(Pinecone、Weaviate、Qdrant)
文档分块策略
// 将长文档分割成块以获得更好的搜索效果
function chunkText(text: string, maxTokens = 512, overlap = 50): string[] {
const words = text.split(' ')
const chunks: string[] = []
for (let i = 0; i < words.length; i += maxTokens - overlap) {
chunks.push(words.slice(i, i + maxTokens).join(' '))
}
return chunks
}