指标采集

Spring Boot 通过 Micrometer 提供统一的指标采集门面,屏蔽底层监控系统差异(Prometheus、Datadog、InfluxDB 等),只需切换依赖即可对接不同后端。


架构概览

应用代码
  │  使用 Micrometer API(MeterRegistry)
  ▼
MeterRegistry(抽象层)
  │  适配不同后端
  ├── PrometheusMeterRegistry  → Prometheus 拉取
  ├── DatadogMeterRegistry     → Datadog 推送
  ├── InfluxMeterRegistry      → InfluxDB 推送
  └── SimpleMeterRegistry      → 内存(开发调试)

快速开始

<!-- Actuator (exposes the /actuator/metrics endpoint) -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
 
<!-- Prometheus registry adapter (recommended for production) -->
<dependency>
    <groupId>io.micrometer</groupId>
    <artifactId>micrometer-registry-prometheus</artifactId>
</dependency>
# Actuator / Micrometer settings for the application.
management:
  endpoints:
    web:
      exposure:
        # Actuator endpoints exposed over HTTP; "prometheus" is the scrape endpoint
        include: health, info, metrics, prometheus
  metrics:
    tags:
      application: ${spring.application.name}   # global tag, attached to every metric
      env: ${spring.profiles.active:default}
  prometheus:
    metrics:
      export:
        # NOTE(review): this key layout looks like the Spring Boot 3.x form; Boot 2.x
        # presumably needs management.metrics.export.prometheus.enabled - confirm
        # against the Boot version in use.
        enabled: true

访问 /actuator/prometheus 即可获得 Prometheus 格式的指标文本。


四种指标类型

Counter(计数器)— 只增不减

/**
 * Order service showing two ways to use a Micrometer {@code Counter}.
 *
 * <p>The class deliberately has a single hand-written constructor and no Lombok
 * {@code @RequiredArgsConstructor}: Lombok would generate a second constructor
 * taking {@code (MeterRegistry, Counter)}, leaving Spring with two un-annotated
 * constructors to choose from and no {@code Counter} bean to inject.
 */
@Service
public class OrderService {

    /** Registry used for on-demand counter lookup in {@link #createOrder(Order)}. */
    private final MeterRegistry registry;

    /** Counter registered once at construction time, so no lookup is needed per call. */
    private final Counter errorCounter;

    /**
     * Creates the service and registers the {@code order.error} counter.
     *
     * @param registry the meter registry all meters of this service are registered with
     */
    public OrderService(MeterRegistry registry) {
        this.registry = registry;
        this.errorCounter = Counter.builder("order.error")
            .description("订单处理错误次数")
            .tag("type", "business")
            .register(registry);
    }

    /**
     * Style 1: look the counter up on every call. Processes the order, then
     * increments {@code order.created} tagged with the order's channel.
     *
     * @param order the order to process
     */
    public void createOrder(Order order) {
        processOrder(order);

        // A null tag value is rejected by the tag factory; fall back to a fixed
        // value so that a missing channel cannot fail an already-processed order.
        String channel = order.getChannel();
        registry.counter("order.created",
            "status", "success",
            "channel", channel != null ? channel : "unknown"
        ).increment();
    }

    /** Style 2: increment the counter that was bound in the constructor. */
    public void handleError() {
        errorCounter.increment();
    }
}

Gauge(仪表盘)— 实时值,可升可降

/**
 * Registers two gauges: the size of the task queue and the value of the
 * active-connection counter. Both gauges read their source object when sampled,
 * so no explicit update call appears in this class.
 */
@Component
public class QueueMetrics {

    /** Queue whose size is reported as {@code task.queue.size}. */
    private final BlockingQueue<Task> taskQueue = new LinkedBlockingQueue<>();

    /** Counter whose value is reported as {@code db.connections.active}. */
    private final AtomicInteger activeConnections = new AtomicInteger(0);

    /**
     * Registers the queue-size gauge.
     *
     * @param registry the registry the gauge is registered with
     */
    @Autowired
    public QueueMetrics(MeterRegistry registry) {
        Gauge.builder("task.queue.size", taskQueue, queue -> queue.size())
            .tag("queue", "main")
            .description("任务队列当前大小")
            .register(registry);
    }

    /**
     * Registers the active-connection gauge, backed by an {@link AtomicInteger}.
     *
     * @param registry the registry the gauge is registered with
     */
    @Autowired
    public void registerConnectionGauge(MeterRegistry registry) {
        Gauge.builder("db.connections.active", activeConnections, counter -> counter.get())
            .description("当前活跃数据库连接数")
            .register(registry);
    }
}

Timer(计时器)— 耗时分布 + 调用次数

/**
 * Payment service showing two ways to time code with a Micrometer {@code Timer}.
 */
@Service
@RequiredArgsConstructor
public class PaymentService {

    private final MeterRegistry registry;

    /**
     * Style 1: wrap the call in a timer.
     *
     * <p>Uses {@code Timer.record(Supplier)} rather than {@code recordCallable}:
     * the latter declares {@code throws Exception}, which this method neither
     * catches nor declares.
     *
     * @param request the payment request; its method becomes the {@code method} tag
     * @return whatever {@code doPayment} returns
     */
    public PaymentResult pay(PaymentRequest request) {
        // NOTE(review): the "method" tag comes straight from the request - confirm
        // it is a small, fixed set of values so tag cardinality stays bounded.
        Timer timer = Timer.builder("payment.duration")
            .description("支付接口耗时")
            .tag("method", request.getMethod())
            .register(registry);
        return timer.record(() -> doPayment(request));
    }

    /**
     * Style 2: {@code Timer.Sample}, useful when start and stop are not in the
     * same lexical scope. The sample is stopped in {@code finally}, so every
     * outcome is recorded, including failures that are not {@code Exception}s.
     *
     * @param order the order to process
     */
    public void processWithSample(Order order) {
        Timer.Sample sample = Timer.start(registry);
        // Assume failure until processing completes normally.
        String result = "error";
        try {
            processOrder(order);
            result = "success";
        } finally {
            sample.stop(registry.timer("order.process.duration", "result", result));
        }
    }
}

DistributionSummary(分布摘要)— 数值分布(如请求体大小)

@Component
public class RequestMetrics {
 
    private final DistributionSummary requestSizeSummary;
 
    public RequestMetrics(MeterRegistry registry) {
        this.requestSizeSummary = DistributionSummary.builder("http.request.size")
            .description("HTTP 请求体大小(字节)")
            .baseUnit("bytes")
            .percentilePrecision(2)
            .publishPercentiles(0.5, 0.90, 0.95, 0.99)  // 计算分位数
            .register(registry);
    }
 
    public void record(long bytes) {
        requestSizeSummary.record(bytes);
    }
}

@Timed 注解(推荐简洁写法)

<!-- AOP support, required for the @Timed aspect -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-aop</artifactId>
</dependency>
/**
 * Configuration that enables the {@code @Timed} annotation by exposing a
 * {@code TimedAspect} bean.
 */
@Configuration
public class TimedConfig {

    /**
     * Creates the aspect that handles {@code @Timed} methods.
     *
     * @param registry the registry the aspect is constructed with
     * @return the aspect bean
     */
    @Bean
    public TimedAspect timedAspect(MeterRegistry registry) {
        TimedAspect aspect = new TimedAspect(registry);
        return aspect;
    }
}
 
/**
 * Example of declarative timing with {@code @Timed}.
 *
 * <p>NOTE(review): {@code userRepository} is not declared anywhere in this
 * snippet - presumably an injected repository field; confirm before copying.
 */
@Service
public class UserService {

    /**
     * Looks a user up by id. The annotation asks for a timer named
     * {@code user.getById.duration} with the 50th/95th/99th percentiles and a
     * fixed {@code layer=service} tag.
     *
     * @param id the user id
     * @return the user found by the repository; {@code orElseThrow()} throws when none is present
     */
    @Timed(
        value = "user.getById.duration",
        extraTags = {"layer", "service"},
        percentiles = {0.5, 0.95, 0.99},
        description = "根据 ID 查询用户耗时"
    )
    public User getById(Long id) {
        return userRepository
            .findById(id)
            .orElseThrow();
    }
}

内置自动指标

Spring Boot 自动注册以下指标(无需任何代码;部分指标仅在引入并启用对应组件后才会出现,例如 HikariCP、Spring Cache、Logback):

| 指标前缀 | 说明 |
| --- | --- |
| jvm.memory.* | 堆/非堆内存使用量 |
| jvm.gc.* | GC 次数、耗时 |
| jvm.threads.* | 线程数(活跃、守护、峰值) |
| jvm.classes.* | 类加载数 |
| process.cpu.usage | 进程 CPU 使用率 |
| system.cpu.usage | 系统 CPU 使用率 |
| http.server.requests | HTTP 请求数、耗时(按 URI、method、status 分组) |
| hikaricp.* | 连接池状态(活跃连接、等待、超时) |
| spring.data.repository.* | Spring Data 方法耗时 |
| executor.* | 线程池指标(队列大小、活跃线程) |
| cache.* | Spring Cache 命中率 |
| logback.events | 各级别日志计数 |

自定义指标:业务 SLA 监控

/**
 * Business SLA metrics for order creation: one latency timer plus success and
 * failure counters that share the metric name {@code business.order.total}
 * and differ only in the {@code result} tag.
 *
 * <p>The class has one explicit constructor and no Lombok
 * {@code @RequiredArgsConstructor}; the Lombok annotation would generate an
 * additional constructor taking every final field, which nothing can satisfy.
 */
@Component
@Slf4j
public class BusinessMetrics {

    private final MeterRegistry registry;

    /** End-to-end order latency; client-side percentiles use a 5-minute window. */
    private final Timer orderTimer;
    private final Counter orderSuccessCounter;
    private final Counter orderFailCounter;

    /**
     * Registers the timer and both counters.
     *
     * @param registry the registry all meters are registered with
     */
    @Autowired
    public BusinessMetrics(MeterRegistry registry) {
        this.registry = registry;

        this.orderTimer = Timer.builder("business.order.duration")
            .description("订单创建端到端耗时")
            .publishPercentiles(0.50, 0.90, 0.95, 0.99)
            // Make the percentile window explicit so it matches the stated 5 minutes.
            .distributionStatisticExpiry(Duration.ofMinutes(5))
            // Histogram buckets, so the backend can run histogram_quantile.
            .publishPercentileHistogram()
            // SLO bucket boundaries; replaces the deprecated sla(...) builder method.
            .serviceLevelObjectives(
                Duration.ofMillis(100), Duration.ofMillis(500), Duration.ofSeconds(1))
            .minimumExpectedValue(Duration.ofMillis(1))
            .maximumExpectedValue(Duration.ofSeconds(10))
            .register(registry);

        this.orderSuccessCounter = registry.counter("business.order.total", "result", "success");
        this.orderFailCounter = registry.counter("business.order.total", "result", "fail");
    }

    /**
     * Records one order: its duration, and a success or failure count.
     *
     * @param duration how long the order took end to end
     * @param success  {@code true} to count a success, {@code false} to count a failure
     */
    public void recordOrder(Duration duration, boolean success) {
        orderTimer.record(duration);
        if (success) {
            orderSuccessCounter.increment();
        } else {
            orderFailCounter.increment();
        }
    }
}

Prometheus 集成

Prometheus 配置(prometheus.yml)

# Scrape job for the application's Actuator Prometheus endpoint.
scrape_configs:
  - job_name: 'spring-boot-app'
    metrics_path: '/actuator/prometheus'
    scrape_interval: 15s
    static_configs:
      - targets: ['app-host:8080']
    # Multiple instances (combine with service discovery)
    # kubernetes_sd_configs:
    #   - role: pod

常用 PromQL 查询

# Per-endpoint QPS (requests per second), summed over status/outcome series
sum by (uri) (rate(http_server_requests_seconds_count{application="myapp"}[1m]))

# Per-endpoint P99 latency. Bucket rates must be aggregated by "le" (plus the
# labels to keep) before histogram_quantile; needs histogram buckets exported.
histogram_quantile(0.99,
  sum by (le, uri) (rate(http_server_requests_seconds_bucket{application="myapp"}[5m])))

# Error ratio. Both sides are summed first: without aggregation the division only
# matches series with identical labels, so each 5xx series is divided by itself.
sum(rate(http_server_requests_seconds_count{status=~"5.."}[1m]))
  / sum(rate(http_server_requests_seconds_count[1m]))

# JVM heap usage ratio per instance, summed over all heap memory pools
sum by (instance) (jvm_memory_used_bytes{area="heap"})
  / sum by (instance) (jvm_memory_max_bytes{area="heap"})

# Threads waiting for a HikariCP connection (a pending count, not a wait time)
hikaricp_connections_pending{pool="HikariPool-1"}

# Active threads in the thread pool
executor_active_threads{name="taskExecutor"}

Grafana 仪表盘

社区提供了现成的 Spring Boot Grafana Dashboard(导入 ID 即可使用):

| Dashboard | Grafana ID | 说明 |
| --- | --- | --- |
| JVM Overview | 4701 | 内存、GC、线程、类加载 |
| Spring Boot Statistics | 6756 | HTTP 请求、JVM、数据库 |
| HikariCP | 14094 | 连接池详细指标 |
| Spring Boot 3.x | 19004 | Spring Boot 3 适配版 |

在 Grafana 中 Import → Dashboard ID 直接导入。


自定义 MeterRegistry 过滤与重命名

/**
 * Meter filters that deny, rename, normalise and enrich meters before they are
 * registered.
 */
@Configuration
public class MetricsConfig {

    /**
     * Drops every meter whose name starts with {@code jvm.gc.pause}, reducing the
     * number of series sent to the backend.
     *
     * @return the deny filter
     */
    @Bean
    public MeterFilter denyHighCardinalityFilter() {
        return MeterFilter.denyNameStartsWith("jvm.gc.pause");
    }

    /**
     * Renames the {@code uri} tag to {@code endpoint} on meters whose name starts
     * with {@code http.server.requests}. This renames a tag, not the meter.
     *
     * @return the tag-rename filter
     */
    @Bean
    public MeterFilter renameFilter() {
        return MeterFilter.renameTag("http.server.requests", "uri", "endpoint");
    }

    /**
     * Collapses every Actuator URI on {@code http.server.requests} meters into the
     * single tag value {@code /actuator/**} to bound tag cardinality.
     *
     * <p>Only the {@code uri} tag is overwritten; all other tags on the id are
     * kept. Replacing the whole tag set would merge series that differ only in
     * those other tags.
     *
     * <p>NOTE(review): if {@link #renameFilter()} is applied first, the tag is
     * already named {@code endpoint} and this filter finds no {@code uri} tag -
     * confirm the intended filter order.
     *
     * @return the URI-normalising filter
     */
    @Bean
    public MeterFilter uriTagFilter() {
        return new MeterFilter() {
            @Override
            public Meter.Id map(Meter.Id id) {
                if (!id.getName().startsWith("http.server.requests")) {
                    return id;
                }
                String uri = id.getTag("uri");
                if (uri == null || !uri.contains("/actuator")) {
                    return id;
                }
                // withTags overwrites the value of an existing key and keeps the rest.
                return id.withTags(Tags.of("uri", "/actuator/**"));
            }
        };
    }

    /**
     * Adds {@code region} and {@code instance} tags to every meter.
     *
     * <p>NOTE(review): Prometheus attaches its own {@code instance} label to
     * scraped targets; confirm that a tag with the same name is intended.
     *
     * @return the common-tags filter
     */
    @Bean
    public MeterFilter commonTagsFilter() {
        return MeterFilter.commonTags(Tags.of(
            "region", System.getenv().getOrDefault("REGION", "unknown"),
            "instance", resolveHostName()
        ));
    }

    /** Returns the local host name, or {@code "unknown"} when it cannot be resolved. */
    private static String resolveHostName() {
        try {
            return InetAddress.getLocalHost().getHostName();
        } catch (java.net.UnknownHostException ignored) {
            // An unresolvable host name must not prevent the application from
            // starting; fall back to the same placeholder used for the region.
            return "unknown";
        }
    }
}

告警规则示例(Prometheus Alertmanager)

# alert-rules.yml
groups:
  - name: spring-boot
    rules:
      # Error ratio above 5%. Both sides are aggregated per application: without
      # aggregation each 5xx series is divided by itself, so the ratio is always 1.
      - alert: HighErrorRate
        expr: |
          sum by (application) (rate(http_server_requests_seconds_count{status=~"5.."}[5m]))
          / sum by (application) (rate(http_server_requests_seconds_count[5m])) > 0.05
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "接口错误率过高: {{ $labels.application }}"
          description: "错误率 {{ $value | humanizePercentage }}"

      # P99 latency above 1 second, per application and URI. Bucket rates are
      # aggregated by "le" plus the labels used in the annotation.
      - alert: HighLatencyP99
        expr: |
          histogram_quantile(0.99,
            sum by (le, application, uri) (rate(http_server_requests_seconds_bucket[5m]))) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "P99 延迟过高: {{ $labels.uri }}"

      # JVM heap usage above 85%, summed over all heap pools of one instance
      - alert: JvmHeapHighUsage
        expr: |
          sum by (application, instance) (jvm_memory_used_bytes{area="heap"})
          / sum by (application, instance) (jvm_memory_max_bytes{area="heap"}) > 0.85
        for: 5m
        labels:
          severity: warning

相关链接