
Prometheus + Grafana 栈
Prometheus 通过拉取(pull)模型采集指标,Grafana 负责将指标可视化,Alertmanager 负责告警的路由与分发。

Prometheus 搭建(docker-compose)
# Monitoring stack: Prometheus (metrics), Grafana (dashboards),
# Alertmanager (alert routing) and node-exporter (host metrics).
services:
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alerts.yml:/etc/prometheus/alerts.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'
      # Enables the HTTP reload/shutdown endpoints (e.g. POST /-/reload).
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:latest
    environment:
      # Defaults to "admin" for local use; export GF_SECURITY_ADMIN_PASSWORD
      # (or set it in .env) to override without editing this file.
      GF_SECURITY_ADMIN_PASSWORD: "${GF_SECURITY_ADMIN_PASSWORD:-admin}"
      GF_USERS_ALLOW_SIGN_UP: "false"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    ports:
      - "3000:3000"

  alertmanager:
    image: prom/alertmanager:latest
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"

  node-exporter:
    image: prom/node-exporter:latest
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the exporter at the host root mounted above so filesystem
      # metrics describe the host rather than the container.
      - '--path.rootfs=/rootfs'
    ports:
      - "9100:9100"

# Named volumes referenced by the services above must be declared here,
# otherwise Compose rejects the file with an "undefined volume" error.
volumes:
  prometheus_data: {}
  grafana_data: {}

Prometheus 配置
# prometheus.yml
# Global defaults, Alertmanager wiring, rule files and scrape targets.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Relative paths are resolved against the directory of this config file.
rule_files:
  - "alerts.yml"

scrape_configs:
  - job_name: 'api'
    static_configs:
      - targets: ['api:3000']
    metrics_path: /metrics
    scrape_interval: 15s

  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']

  # Kubernetes service discovery
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods annotated with prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        # Quoted so YAML treats it as the string regex "true", not a boolean.
        regex: 'true'

Node.js 自定义指标
import { collectDefaultMetrics, Counter, Histogram, Gauge, Registry } from 'prom-client'
// Dedicated registry that every metric in this section registers itself on.
const registry = new Registry()
// Collect prom-client's default metrics into the same registry.
collectDefaultMetrics({ register: registry })

// Counter: HTTP requests, labelled by method, route and status.
const httpRequestsTotal = new Counter({
  registers: [registry],
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'route', 'status'],
})

// Histogram: request duration in seconds, with bucket bounds from 5 ms to 10 s.
const httpRequestDuration = new Histogram({
  registers: [registry],
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [
    0.005, 0.01, 0.025, 0.05, 0.1,
    0.25, 0.5, 1, 2.5, 5, 10,
  ],
})

// Gauge: number of currently active connections.
const activeConnections = new Gauge({
  registers: [registry],
  name: 'active_connections',
  help: 'Number of active connections',
})

// Custom business metric: processed orders, labelled by status.
const ordersProcessed = new Counter({
  registers: [registry],
  name: 'orders_processed_total',
  help: 'Total orders processed',
  labelNames: ['status'],
})
// Middleware: starts a duration timer for every request and, once the
// response has finished, records the request count and the elapsed time.
app.use((req, res, next) => {
  const stopTimer = httpRequestDuration.startTimer()
  res.on('finish', () => {
    // Label by the matched route pattern only. Falling back to the raw
    // req.path for unmatched requests would create a new time series for
    // every distinct URL and make label cardinality unbounded.
    const route = req.route?.path ?? 'unmatched'
    const labels = {
      method: req.method,
      route,
      status: res.statusCode.toString(),
    }
    httpRequestsTotal.inc(labels)
    stopTimer(labels)
  })
  next()
})
// Metrics endpoint: serves everything registered on `registry` in the
// Prometheus exposition format.
app.get('/metrics', async (req, res) => {
  try {
    // Serialize first so a failure cannot leave a half-prepared response.
    const body = await registry.metrics()
    res.set('Content-Type', registry.contentType)
    res.end(body)
  } catch (err) {
    // Without this, a rejected promise in an async handler is never answered
    // and the scrape hangs until it times out.
    res.status(500).end(err instanceof Error ? err.message : String(err))
  }
})
告警规则
# alerts.yml
# Alerting rules for the API and for Kubernetes pods.
groups:
  - name: api_alerts
    rules:
      - alert: HighErrorRate
        # Both sides are aggregated per job before dividing. Without the
        # aggregation, one-to-one vector matching divides each 5xx series by
        # itself, so the ratio is always 1 and the alert fires on any 5xx.
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value | humanizePercentage }}"

      - alert: SlowResponses
        # No aggregation here: the quantile is computed separately for every
        # label combination present on the bucket series.
        expr: |
          histogram_quantile(0.95,
            rate(http_request_duration_seconds_bucket[5m])
          ) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency above 1s"

      - alert: PodCrashLooping
        # Fires when a container's restart counter kept increasing over the
        # last 15 minutes.
        expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting repeatedly"
有用的 PromQL 查询
# Request rate (per second, 5-minute window), one result per series
rate(http_requests_total[5m])
# Error rate percentage. Both sides are summed first; dividing the raw
# vectors would match each 5xx series with itself and always yield 100.
100 * sum(rate(http_requests_total{status=~"5.."}[5m]))
  / sum(rate(http_requests_total[5m]))
# P99 latency across all series (buckets are summed per `le` before the quantile)
histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))
# CPU usage per pod (cores)
sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="production"}[5m]))
# Memory usage (MiB)
container_memory_working_set_bytes{namespace="production"} / 1024 / 1024
# Database connection pool utilization (0-1)
pg_pool_active_connections / pg_pool_max_connections