正在加载,请稍候…

Prometheus + Grafana 监控栈:从搭建到生产告警

构建完整的监控栈——Prometheus 抓取、PromQL 查询、Grafana 仪表盘、告警规则、AlertManager、Node.js 自定义指标和 Kubernetes 服务发现。

Prometheus + Grafana 监控栈:从搭建到生产告警

Prometheus + Grafana 栈

Prometheus 通过拉取模型收集指标。Grafana 将其可视化。AlertManager 路由告警。

Prometheus + Grafana 监控栈:从搭建到生产告警 插图

Prometheus 搭建(docker-compose)

# docker-compose.yml: Prometheus, Grafana, Alertmanager and node-exporter.
services:
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alerts.yml:/etc/prometheus/alerts.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'
      # Enables the HTTP lifecycle endpoints (e.g. POST /-/reload) so config
      # changes can be applied without restarting the container.
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"

  grafana:
    image: grafana/grafana:latest
    environment:
      # Defaults to "admin" for local use; set GF_SECURITY_ADMIN_PASSWORD in
      # the shell or an .env file for any shared or production deployment.
      GF_SECURITY_ADMIN_PASSWORD: ${GF_SECURITY_ADMIN_PASSWORD:-admin}
      GF_USERS_ALLOW_SIGN_UP: "false"
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
      - ./grafana/datasources:/etc/grafana/provisioning/datasources
    ports:
      - "3000:3000"

  alertmanager:
    image: prom/alertmanager:latest
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"

  node-exporter:
    image: prom/node-exporter:latest
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the filesystem collector at the host root mounted above;
      # without this flag it reports the container's own mounts.
      - '--path.rootfs=/rootfs'
    ports:
      - "9100:9100"

# Named volumes referenced by the services above must be declared here,
# otherwise Compose rejects the file with "refers to undefined volume".
volumes:
  prometheus_data:
  grafana_data:

Prometheus + Grafana 监控栈:从搭建到生产告警 插图

Prometheus 配置

# prometheus.yml: main Prometheus server configuration.
global:
  # Default interval between scrapes of every target.
  scrape_interval: 15s
  # Interval at which alerting/recording rules are evaluated.
  evaluation_interval: 15s

# Where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Rule files to load, resolved relative to this config file.
rule_files:
  - "alerts.yml"

scrape_configs:
  # Application metrics endpoint.
  - job_name: 'api'
    static_configs:
      - targets: ['api:3000']
    metrics_path: /metrics
    scrape_interval: 15s
  
  # Host-level metrics from node-exporter.
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
  
  # Kubernetes service discovery
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods whose prometheus.io/scrape annotation is "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true

Prometheus + Grafana 监控栈:从搭建到生产告警 插图

Node.js 自定义指标

import { collectDefaultMetrics, Counter, Histogram, Gauge, Registry } from 'prom-client'

// Registry that holds every metric defined below, plus the prom-client
// default metrics collected into it.
const registry = new Registry()
collectDefaultMetrics({ register: registry })

// Label names shared by the two HTTP metrics.
const httpLabelNames = ['method', 'route', 'status']

// Counter: HTTP requests, labelled by method, route and status.
const httpRequestsTotal = new Counter({
  registers: [registry],
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: [...httpLabelNames],
})

// Histogram: HTTP request duration in seconds, with the same labels.
const httpRequestDuration = new Histogram({
  registers: [registry],
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration in seconds',
  labelNames: [...httpLabelNames],
  buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
})

// Gauge: number of active connections.
const activeConnections = new Gauge({
  registers: [registry],
  name: 'active_connections',
  help: 'Number of active connections',
})

// Custom business metric: orders processed, labelled by status.
const ordersProcessed = new Counter({
  registers: [registry],
  name: 'orders_processed_total',
  help: 'Total orders processed',
  labelNames: ['status'],
})

// Middleware: counts every request and records its duration once the
// response has finished.
app.use((req, res, next) => {
  const stopTimer = httpRequestDuration.startTimer()

  res.on('finish', () => {
    // Label by the matched route pattern (e.g. '/users/:id'), never by the
    // raw URL path. Requests that matched no route (404s, scanners) share a
    // single 'unmatched' value; falling back to req.path would create a new
    // time series per distinct URL and make label cardinality unbounded.
    const labels = {
      method: req.method,
      route: req.route?.path ?? 'unmatched',
      status: res.statusCode.toString(),
    }
    httpRequestsTotal.inc(labels)
    stopTimer(labels)
  })

  next()
})

// Metrics endpoint: serves the registry contents in the Prometheus
// exposition format for scraping.
app.get('/metrics', async (req, res, next) => {
  try {
    res.set('Content-Type', registry.contentType)
    res.end(await registry.metrics())
  } catch (err) {
    // A rejection inside an async handler is not caught by Express 4 and
    // would leave the scrape hanging; hand it to the error middleware.
    next(err)
  }
})

告警规则

# alerts.yml: alerting rules loaded by Prometheus via rule_files.
groups:
  - name: api_alerts
    rules:
      # More than 5% of a job's requests returned 5xx, sustained for 2 minutes.
      - alert: HighErrorRate
        # Both sides are aggregated by job before dividing. Without the sums,
        # one-to-one label matching pairs each 5xx series only with itself
        # (same method/route/status), so the ratio would always be 1.
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum by (job) (rate(http_requests_total[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value | humanizePercentage }}"

      # 95th-percentile latency per job above 1 second, sustained for 5 minutes.
      - alert: SlowResponses
        # Buckets are summed by (le, job) so the quantile covers all of a
        # job's traffic rather than each method/route/status combination.
        expr: |
          histogram_quantile(0.95,
            sum by (le, job) (rate(http_request_duration_seconds_bucket[5m]))
          ) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency above 1s"

      # A container kept restarting over the last 15 minutes.
      - alert: PodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total[15m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting repeatedly"

有用的 PromQL 查询

# Request rate (per second, 5-minute window), one result per series
rate(http_requests_total[5m])

# Error rate as a percentage of all requests.
# Both sides are summed first: dividing the raw vectors would match each
# 5xx series only against itself and always yield 100.
100 * sum(rate(http_requests_total{status=~"5.."}[5m]))
    / sum(rate(http_requests_total[5m]))

# P99 latency across all series (buckets must be aggregated by le)
histogram_quantile(0.99, sum by (le) (rate(http_request_duration_seconds_bucket[5m])))

# CPU usage per pod (cores), summed over each pod's containers
sum by (pod) (rate(container_cpu_usage_seconds_total{namespace="production"}[5m]))

# Memory working set in MiB
container_memory_working_set_bytes{namespace="production"} / 1024 / 1024

# Database connection pool utilization (0-1)
pg_pool_active_connections / pg_pool_max_connections