指标采集
Spring Boot 通过 Micrometer 提供统一的指标采集门面,屏蔽底层监控系统差异(Prometheus、Datadog、InfluxDB 等),只需切换依赖即可对接不同后端。
架构概览
应用代码
│ 使用 Micrometer API(MeterRegistry)
▼
MeterRegistry(抽象层)
│ 适配不同后端
├── PrometheusMeterRegistry → Prometheus 拉取
├── DatadogMeterRegistry → Datadog 推送
├── InfluxMeterRegistry → InfluxDB 推送
└── SimpleMeterRegistry → 内存(开发调试)
快速开始
<!-- Actuator(暴露 /actuator/metrics 端点) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- Prometheus 适配器(生产推荐) -->
<dependency>
<groupId>io.micrometer</groupId>
<artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
management:
  endpoints:
    web:
      exposure:
        include: health, info, metrics, prometheus
  metrics:
    tags:
      application: ${spring.application.name} # 全局 Tag,所有指标都带上
      env: ${spring.profiles.active:default}
  prometheus:
    metrics:
      export:
        enabled: true
访问 /actuator/prometheus 即可获得 Prometheus 格式的指标文本。
四种指标类型
Counter(计数器)— 只增不减
/**
 * Order service showing two ways to use a Micrometer {@code Counter}.
 *
 * <p>The class has exactly one constructor, written by hand. Lombok's
 * {@code @RequiredArgsConstructor} is deliberately not used here: with two
 * {@code final} fields it would generate an additional
 * {@code (MeterRegistry, Counter)} constructor, leaving the bean with two
 * unannotated constructors to choose from.
 */
@Service
public class OrderService {

    private final MeterRegistry registry;

    /** Approach 2: counter bound once up front, so no registry lookup per call. */
    private final Counter errorCounter;

    /**
     * Stores the registry and registers the {@code order.error} counter.
     *
     * @param registry registry used for both the pre-bound and the on-demand counters
     */
    public OrderService(MeterRegistry registry) {
        this.registry = registry;
        this.errorCounter = Counter.builder("order.error")
                .description("订单处理错误次数")
                .tag("type", "business")
                .register(registry);
    }

    /**
     * Approach 1: resolve the counter through the registry on every call.
     *
     * <p>The counter is incremented only after {@code processOrder} returns
     * normally; {@code processOrder} itself is not shown in this snippet.
     *
     * @param order order being created; its channel becomes the {@code channel} tag
     */
    public void createOrder(Order order) {
        processOrder(order);
        // +1 per created order, tagged with business dimensions.
        registry.counter("order.created",
                "status", "success",
                "channel", order.getChannel()
        ).increment();
    }

    /** Increments the pre-bound {@code order.error} counter by one. */
    public void handleError() {
        errorCounter.increment();
    }
}
Gauge(仪表盘)— 实时值,可升可降
/**
 * Registers two gauges: the current size of an in-memory task queue and the
 * current value of an {@code AtomicInteger} connection counter.
 */
@Component
public class QueueMetrics {

    private final BlockingQueue<Task> taskQueue = new LinkedBlockingQueue<>();

    /** A gauge can also follow an {@code AtomicInteger}. */
    private final AtomicInteger activeConnections = new AtomicInteger(0);

    /**
     * Registers {@code task.queue.size}; the gauge reads the queue size through
     * the supplied function, so no manual updates are needed.
     *
     * @param registry registry the gauge is registered with
     */
    @Autowired
    public QueueMetrics(MeterRegistry registry) {
        Gauge.builder("task.queue.size", taskQueue, queue -> queue.size())
                .description("任务队列当前大小")
                .tag("queue", "main")
                .register(registry);
    }

    /**
     * Registers {@code db.connections.active}, backed by {@code activeConnections}.
     *
     * @param registry registry the gauge is registered with
     */
    @Autowired
    public void registerConnectionGauge(MeterRegistry registry) {
        Gauge.builder("db.connections.active", activeConnections, counter -> counter.get())
                .description("当前活跃数据库连接数")
                .register(registry);
    }
}
Timer(计时器)— 耗时分布 + 调用次数
/**
 * Payment service showing two ways to time work with a Micrometer {@code Timer}.
 *
 * <p>{@code doPayment} and {@code processOrder} are referenced but not shown in
 * this snippet.
 */
@Service
@RequiredArgsConstructor
public class PaymentService {

    private final MeterRegistry registry;

    /**
     * Approach 1: time a call explicitly.
     *
     * @param request payment request; its method becomes the {@code method} tag
     * @return whatever {@code doPayment} returns
     */
    public PaymentResult pay(PaymentRequest request) {
        Timer paymentTimer = Timer.builder("payment.duration")
                .description("支付接口耗时")
                .tag("method", request.getMethod())
                .register(registry);
        // record(Supplier) rather than recordCallable(Callable): the Callable
        // variant declares `throws Exception`, which this method does not.
        return paymentTimer.record(() -> doPayment(request));
    }

    /**
     * Approach 2: {@code Timer.Sample}, suited to timing that spans methods.
     *
     * <p>The sample is stopped exactly once, in {@code finally}, against a timer
     * tagged {@code result=success} or {@code result=error}. Any throwable from
     * {@code processOrder} propagates unchanged after being recorded as an error.
     *
     * @param order order to process
     */
    public void processWithSample(Order order) {
        Timer.Sample sample = Timer.start(registry);
        String result = "error";
        try {
            processOrder(order);
            result = "success";
        } finally {
            sample.stop(registry.timer("order.process.duration", "result", result));
        }
    }
}
DistributionSummary(分布摘要)— 数值分布(如请求体大小)
/**
 * Records HTTP request body sizes into a {@code DistributionSummary} named
 * {@code http.request.size}.
 */
@Component
public class RequestMetrics {

    private final DistributionSummary requestSizeSummary;

    /**
     * Builds and registers the summary with a base unit of bytes and the
     * 0.5 / 0.90 / 0.95 / 0.99 percentiles published.
     *
     * @param registry registry the summary is registered with
     */
    public RequestMetrics(MeterRegistry registry) {
        DistributionSummary.Builder summaryBuilder = DistributionSummary.builder("http.request.size")
                .description("HTTP 请求体大小(字节)")
                .baseUnit("bytes")
                .percentilePrecision(2)
                .publishPercentiles(0.5, 0.90, 0.95, 0.99); // percentiles to compute
        this.requestSizeSummary = summaryBuilder.register(registry);
    }

    /**
     * Records one observation.
     *
     * @param bytes size of the request body in bytes
     */
    public void record(long bytes) {
        requestSizeSummary.record(bytes);
    }
}
@Timed 注解(推荐简洁写法)
<!-- 需要 AOP 支持 -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-aop</artifactId>
</dependency>
@Configuration
// Configuration class that exposes the aspect backing the @Timed annotation.
public class TimedConfig {

    /**
     * Creates the {@code TimedAspect} bean that enables {@code @Timed}.
     *
     * @param registry registry handed to the aspect
     * @return the aspect instance
     */
    @Bean
    public TimedAspect timedAspect(MeterRegistry registry) {
        TimedAspect aspect = new TimedAspect(registry);
        return aspect;
    }
}
/**
 * Example of timing a method declaratively with {@code @Timed}.
 */
@Service
public class UserService {

    // NOTE(review): userRepository is not declared anywhere in this snippet;
    // presumably an injected repository field — confirm before copying.

    /**
     * Looks a user up by id. Per the original note, the annotation records the
     * method's duration, call count and exceptions without further code.
     *
     * @param id user id
     * @return the user; {@code orElseThrow()} throws when no user is found
     */
    @Timed(
        value = "user.getById.duration",
        description = "根据 ID 查询用户耗时",
        percentiles = {0.5, 0.95, 0.99},
        extraTags = {"layer", "service"}
    )
    public User getById(Long id) {
        var lookup = userRepository.findById(id);
        return lookup.orElseThrow();
    }
}
内置自动指标
Spring Boot 自动注册以下指标(无需任何代码;部分指标仅在对应组件存在时才会注册,例如 HikariCP、Spring Data、Spring Cache、Logback):
| 指标前缀 | 说明 |
|---|---|
| jvm.memory.* | 堆/非堆内存使用量 |
| jvm.gc.* | GC 次数、耗时 |
| jvm.threads.* | 线程数(活跃、守护、峰值) |
| jvm.classes.* | 类加载数 |
| process.cpu.usage | 进程 CPU 使用率 |
| system.cpu.usage | 系统 CPU 使用率 |
| http.server.requests | HTTP 请求数、耗时(按 URI、method、status 分组) |
| hikaricp.* | 连接池状态(活跃连接、等待、超时) |
| spring.data.repository.* | Spring Data 方法耗时 |
| executor.* | 线程池指标(队列大小、活跃线程) |
| cache.* | Spring Cache 命中率 |
| logback.events | 各级别日志计数 |
自定义指标:业务 SLA 监控
/**
 * Business SLA metrics for order creation: one latency timer and two outcome
 * counters that share the name {@code business.order.total}.
 *
 * <p>The single explicit constructor is the injection point. Lombok's
 * {@code @RequiredArgsConstructor} is not used because every {@code final}
 * field is assigned in that constructor; a generated all-fields constructor
 * would only add a second, unusable candidate.
 */
@Component
@Slf4j
public class BusinessMetrics {

    private final MeterRegistry registry;

    // Client-side percentiles are computed over a time window. The original
    // note said "last 5 minutes", but no window is configured in this builder.
    // NOTE(review): set distributionStatisticExpiry explicitly if 5 minutes is required.
    private final Timer orderTimer;

    private final Counter orderSuccessCounter;
    private final Counter orderFailCounter;

    /**
     * Registers the timer and both counters.
     *
     * @param registry registry all three meters are registered with
     */
    @Autowired
    public BusinessMetrics(MeterRegistry registry) {
        this.registry = registry;
        this.orderTimer = Timer.builder("business.order.duration")
                .description("订单创建端到端耗时")
                .publishPercentiles(0.50, 0.90, 0.95, 0.99)
                // Publish histogram buckets so Prometheus can run histogram_quantile.
                .publishPercentileHistogram()
                // serviceLevelObjectives replaces the deprecated sla(...) builder method.
                .serviceLevelObjectives(
                        Duration.ofMillis(100), Duration.ofMillis(500), Duration.ofSeconds(1))
                .minimumExpectedValue(Duration.ofMillis(1))
                .maximumExpectedValue(Duration.ofSeconds(10))
                .register(registry);
        this.orderSuccessCounter = registry.counter("business.order.total", "result", "success");
        this.orderFailCounter = registry.counter("business.order.total", "result", "fail");
    }

    /**
     * Records one order: its duration on the timer, and its outcome on the
     * matching counter.
     *
     * @param duration end-to-end duration of the order
     * @param success  {@code true} increments the success counter, {@code false} the fail counter
     */
    public void recordOrder(Duration duration, boolean success) {
        orderTimer.record(duration);
        if (success) {
            orderSuccessCounter.increment();
        } else {
            orderFailCounter.increment();
        }
    }
}
Prometheus 集成
Prometheus 配置(prometheus.yml)
scrape_configs:
  - job_name: 'spring-boot-app'
    metrics_path: '/actuator/prometheus'
    scrape_interval: 15s
    static_configs:
      - targets: ['app-host:8080']
    # 多实例(配合服务发现)
    # kubernetes_sd_configs:
    #   - role: pod
常用 PromQL 查询
# Request rate (QPS), one series per label combination
rate(http_server_requests_seconds_count{application="myapp"}[1m])
# P99 request latency
histogram_quantile(0.99,
  rate(http_server_requests_seconds_bucket{application="myapp"}[5m]))
# Error ratio. Both sides are aggregated first: dividing the raw vectors matches
# series one-to-one on identical labels, so each 5xx series would be divided by
# itself and the result would always be 1.
sum(rate(http_server_requests_seconds_count{status=~"5.."}[1m]))
  / sum(rate(http_server_requests_seconds_count[1m]))
# JVM heap usage ratio per instance (summed over heap memory pools)
sum by (instance) (jvm_memory_used_bytes{area="heap"})
  / sum by (instance) (jvm_memory_max_bytes{area="heap"})
# HikariCP pending count (threads waiting for a connection; a count, not a duration)
hikaricp_connections_pending{pool="HikariPool-1"}
# Active threads in the executor
executor_active_threads{name="taskExecutor"}
Grafana 仪表盘
Grafana 社区(grafana.com)提供了现成的 Spring Boot / JVM Dashboard(非 Spring Boot 官方维护,导入 ID 即可使用):
| Dashboard | Grafana ID | 说明 |
|---|---|---|
| JVM Overview | 4701 | 内存、GC、线程、类加载 |
| Spring Boot Statistics | 6756 | HTTP 请求、JVM、数据库 |
| HikariCP | 14094 | 连接池详细指标 |
| Spring Boot 3.x | 19004 | Spring Boot 3 适配版 |
在 Grafana 中 Import → Dashboard ID 直接导入。
自定义 MeterRegistry 过滤与重命名
/**
 * {@code MeterFilter} beans that deny, rename and normalise meters, and add
 * common tags.
 */
@Configuration
public class MetricsConfig {

    /**
     * Drops meters whose name starts with {@code jvm.gc.pause}; the original
     * note gives reducing Prometheus storage pressure as the reason.
     */
    @Bean
    public MeterFilter denyHighCardinalityFilter() {
        return MeterFilter.denyNameStartsWith("jvm.gc.pause");
    }

    /**
     * Renames the {@code uri} tag to {@code endpoint} on meters whose name
     * starts with {@code http.server.requests}. This renames a tag key, not
     * the metric itself.
     */
    @Bean
    public MeterFilter renameFilter() {
        return MeterFilter.renameTag("http.server.requests", "uri", "endpoint");
    }

    /**
     * Collapses every actuator path on {@code http.server.requests} meters
     * into the single tag value {@code /actuator/**} to limit tag cardinality.
     *
     * <p>The path is looked up under {@code uri} and, failing that, under
     * {@code endpoint}, because {@link #renameFilter()} may already have
     * renamed the key. Only that one tag is overwritten; all other tags on
     * the id are kept.
     */
    @Bean
    public MeterFilter uriTagFilter() {
        return new MeterFilter() {
            @Override
            public Meter.Id map(Meter.Id id) {
                if (!id.getName().startsWith("http.server.requests")) {
                    return id;
                }
                String pathKey = id.getTag("uri") != null ? "uri" : "endpoint";
                String path = id.getTag(pathKey);
                if (path != null && path.contains("/actuator")) {
                    // withTags overwrites the matching key and keeps the rest;
                    // replaceTags would discard every other tag on the id.
                    return id.withTags(Tags.of(pathKey, "/actuator/**"));
                }
                return id;
            }
        };
    }

    /**
     * Adds {@code region} (from the {@code REGION} environment variable,
     * default {@code "unknown"}) and {@code instance} (local host name) to
     * every meter.
     */
    @Bean
    public MeterFilter commonTagsFilter() {
        return MeterFilter.commonTags(Tags.of(
                "region", System.getenv().getOrDefault("REGION", "unknown"),
                "instance", resolveHostName()
        ));
    }

    /** Returns the local host name, or {@code "unknown"} when it cannot be resolved. */
    private static String resolveHostName() {
        try {
            return InetAddress.getLocalHost().getHostName();
        } catch (java.net.UnknownHostException ignored) {
            // A missing host name must not stop the application from starting;
            // fall back to the same placeholder used for the region tag.
            return "unknown";
        }
    }
}
告警规则示例(Prometheus Alertmanager)
# alert-rules.yml
groups:
  - name: spring-boot
    rules:
      # Error ratio above 5%. Both sides are aggregated per application first;
      # dividing the raw vectors would match each 5xx series with itself and
      # always evaluate to 1.
      - alert: HighErrorRate
        expr: |
          sum by (application) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
            / sum by (application) (rate(http_server_requests_seconds_count[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "接口错误率过高: {{ $labels.application }}"
          description: "错误率 {{ $value | humanizePercentage }}"
      # P99 latency above 1 second
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            rate(http_server_requests_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P99 延迟过高: {{ $labels.uri }}"
      # JVM heap usage above 85%, summed over heap memory pools per instance
      - alert: JvmHeapHighUsage
        expr: |
          sum by (application, instance) (jvm_memory_used_bytes{area="heap"})
            / sum by (application, instance) (jvm_memory_max_bytes{area="heap"}) > 0.85
        for: 5m
        labels:
          severity: warning