
Kubernetes 部署策略
选择合适的部署策略需要在风险、速度和复杂性之间取得平衡。

滚动更新(默认)
# Deployment "api": 6 replicas, replaced gradually with the RollingUpdate strategy.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api
spec:
  replicas: 6
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 2  # max Pods allowed above the desired replica count during an update
      maxUnavailable: 1  # max Pods allowed to be unavailable during an update
  selector:
    matchLabels:
      app: api
  template:
    metadata:
      labels:
        app: api
    spec:
      containers:
        - name: api
          image: myapp/api:v2
          # Readiness gates traffic: polled at /health/ready on port 3000.
          readinessProbe:
            httpGet:
              path: /health/ready
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
            failureThreshold: 3
          # Liveness is polled at /health/live on port 3000.
          livenessProbe:
            httpGet:
              path: /health/live
              port: 3000
            initialDelaySeconds: 15
            periodSeconds: 20
      # Pod-level field (sibling of containers): lets in-flight requests finish.
      terminationGracePeriodSeconds: 30
# Perform the rolling update by setting container "api" to the new image.
kubectl set image deployment/api api=myapp/api:v2
# Monitor the rollout status.
kubectl rollout status deployment/api
# Roll back if needed. The two commands below are alternatives:
# the first names no revision, the second targets revision 2 explicitly.
kubectl rollout undo deployment/api
kubectl rollout undo deployment/api --to-revision=2

蓝绿部署
# Blue (current production environment)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-blue
spec:
  replicas: 5
  selector:
    matchLabels:
      app: api
      version: blue
  # apps/v1 Deployments require a pod template whose labels match the selector.
  template:
    metadata:
      labels:
        app: api
        version: blue
    spec:
      containers:
        - name: api
          image: myapp/api:v1  # assumed tag of the current release -- TODO confirm
          ports:
            - containerPort: 3000  # same port the "api" Service targets
---
# Green (new version)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-green
spec:
  replicas: 5
  selector:
    matchLabels:
      app: api
      version: green
  # apps/v1 Deployments require a pod template whose labels match the selector.
  template:
    metadata:
      labels:
        app: api
        version: green
    spec:
      containers:
        - name: api
          image: myapp/api:v2  # assumed tag of the new release -- TODO confirm
          ports:
            - containerPort: 3000  # same port the "api" Service targets
---
# Service "api": initially selects the blue Pods.
apiVersion: v1
kind: Service
metadata:
  name: api
spec:
  selector:
    app: api
    version: blue  # change to "green" to switch traffic
  ports:
    - port: 80
      targetPort: 3000
# Switch all traffic to green by patching the Service selector.
kubectl patch service api -p '{"spec":{"selector":{"version":"green"}}}'
# Verify that green is working.
# NOTE(review): this check runs after the switch; consider also checking green before patching.
kubectl get pods -l version=green
# Once green is confirmed stable, scale blue down to zero replicas
kubectl scale deployment api-blue --replicas=0
# or, alternatively, delete blue entirely.
kubectl delete deployment api-blue

使用 Argo Rollouts 进行金丝雀发布
# Argo Rollouts canary: the weight is raised step by step, with pauses and one analysis gate.
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: api
spec:
  replicas: 10
  strategy:
    canary:
      steps:
        - setWeight: 10  # 10% of traffic goes to the canary
        - pause: { duration: 5m }
        - setWeight: 30
        - pause: { duration: 10m }
        # Analysis gate: references the "success-rate" AnalysisTemplate by name.
        - analysis:
            templates:
              - templateName: success-rate
        - setWeight: 60
        - pause: { duration: 10m }
        - setWeight: 100
      # Extra labels for canary and stable Pods respectively.
      canaryMetadata:
        labels:
          version: canary
      stableMetadata:
        labels:
          version: stable
  selector:
    matchLabels:
      app: api
  template:
    metadata:
      labels:
        app: api
    spec:
      containers:
        - name: api
          image: myapp/api:v2
---
# Analysis template used for automated promotion.
apiVersion: argoproj.io/v1alpha1
kind: AnalysisTemplate
metadata:
  name: success-rate
spec:
  metrics:
    - name: success-rate
      # NOTE(review): `interval` is set without `count` -- confirm the run
      # terminates when this template is used as an inline canary step.
      interval: 1m
      successCondition: result[0] >= 0.95
      failureLimit: 3
      provider:
        prometheus:
          address: http://prometheus:9090
          # Ratio of non-5xx request rate to total request rate for job "api" over 5m.
          query: |
            sum(rate(http_requests_total{job="api",status!~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="api"}[5m]))
PodDisruptionBudget
# Ensures a minimum level of availability during voluntary disruptions.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: api-pdb
spec:
  minAvailable: 4  # always keep 4 Pods running
  selector:
    matchLabels:
      app: api
健康检查端点(Liveness / Readiness / Startup)
// Liveness vs Readiness vs Startup probes
// Liveness: is the process alive? (a failing probe means restart)
app.get('/health/live', (_req, res) => {
  const body = { status: 'alive' }
  res.json(body)
})
// Readiness: can this instance serve traffic? (a failing probe means removal from load balancing)
app.get('/health/ready', async (_req, response) => {
  try {
    // Check the dependencies one after another: database first, then Redis.
    await db.query('SELECT 1')
    await redis.ping()
    response.json({ status: 'ready' })
  } catch (probeError) {
    // Set the 503 status first, then build and send the failure payload.
    const unavailable = response.status(503)
    unavailable.json({ status: 'not ready', reason: probeError.message })
  }
})
// Startup: has initialization finished? (only relevant while starting up)
app.get('/health/startup', (_req, res) => {
  // Guard clause: answer 503 until the appStarted flag is truthy.
  if (!appStarted) {
    res.status(503).json({ status: 'starting' })
    return
  }
  res.json({ status: 'started' })
})