1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
|
/**
 * Derives a health summary for a fixed set of services from meters registered in the
 * injected {@link MeterRegistry}.
 *
 * <p>For each monitored service it reads the {@code http.server.requests} timers and the
 * {@code http.requests.success} / {@code http.requests.error} counters tagged with
 * {@code service=<name>}, flags services whose average response time or error rate exceeds
 * a threshold, raises a "snowball" flag when several services are slow at once, and names
 * the most severely affected abnormal service as the likely root cause.
 */
@Component
public class ServiceCallChainAnalyzer {

    /** Average response time, in milliseconds, above which a service is flagged as abnormal. */
    private static final double SLOW_RESPONSE_THRESHOLD_MS = 5000;

    /** Error rate (fraction of requests, 0..1) above which a service is flagged as abnormal. */
    private static final double ERROR_RATE_THRESHOLD = 0.1;

    /** Number of simultaneously slow services at which the snowball flag is raised. */
    private static final long SNOWBALL_SERVICE_COUNT = 3;

    /** Value reported as root cause when no abnormal service can be named. */
    private static final String UNKNOWN_ROOT_CAUSE = "unknown";

    /** Services whose meters are inspected on every analysis run. */
    private static final String[] MONITORED_SERVICES = {
        "user-service", "order-service", "inventory-service", "payment-service"
    };

    @Autowired
    private MeterRegistry meterRegistry;

    /**
     * Collects metrics for every monitored service and evaluates the overall call chain.
     *
     * @return a populated {@link CallChainHealth}; a service that is both slow and
     *         error-prone carries both reasons in {@code abnormalServices}
     */
    public CallChainHealth analyzeCallChainHealth() {
        CallChainHealth health = new CallChainHealth();
        Map<String, ServiceMetrics> serviceMetrics = new HashMap<>();

        for (String serviceName : MONITORED_SERVICES) {
            ServiceMetrics metrics = analyzeServiceMetrics(serviceName);
            serviceMetrics.put(serviceName, metrics);

            if (metrics.getAvgResponseTime() > SLOW_RESPONSE_THRESHOLD_MS) {
                health.addAbnormalService(serviceName,
                        "响应时间过长: " + metrics.getAvgResponseTime() + "ms");
            }
            if (metrics.getErrorRate() > ERROR_RATE_THRESHOLD) {
                health.addAbnormalService(serviceName,
                        "错误率过高: " + String.format("%.2f%%", metrics.getErrorRate() * 100));
            }
        }

        health.setServiceMetrics(serviceMetrics);
        analyzeFailurePropagation(health);
        return health;
    }

    /**
     * Reads response-time and error-rate figures for one service from the meter registry.
     *
     * <p>All meters matching the name and {@code service} tag are aggregated, because a
     * meter name is usually split into several series by additional tags; looking up a
     * single timer or counter would ignore the other series.
     *
     * @param serviceName value of the {@code service} tag to search for
     * @return metrics for the service; every field stays at zero when no meter matches
     */
    private ServiceMetrics analyzeServiceMetrics(String serviceName) {
        ServiceMetrics metrics = new ServiceMetrics();

        double totalTimeMs = 0;
        long timedCalls = 0;
        double maxMs = 0;
        for (Timer timer : meterRegistry.find("http.server.requests")
                .tag("service", serviceName)
                .timers()) {
            totalTimeMs += timer.totalTime(TimeUnit.MILLISECONDS);
            timedCalls += timer.count();
            maxMs = Math.max(maxMs, timer.max(TimeUnit.MILLISECONDS));
        }
        if (timedCalls > 0) {
            // Weighted mean over all series; guards against division by zero.
            metrics.setAvgResponseTime(totalTimeMs / timedCalls);
        }
        metrics.setMaxResponseTime(maxMs);

        // A missing counter contributes 0, so a service that has only ever recorded
        // failures (no success counter registered) still gets its real error rate.
        double successCount = sumCounters("http.requests.success", serviceName);
        double errorCount = sumCounters("http.requests.error", serviceName);
        double totalRequests = successCount + errorCount;
        if (totalRequests > 0) {
            metrics.setErrorRate(errorCount / totalRequests);
        }
        metrics.setRequestCount(Math.round(totalRequests));

        return metrics;
    }

    /**
     * Sums every counter with the given name that carries the {@code service} tag.
     *
     * @param meterName   counter name to search for
     * @param serviceName value of the {@code service} tag
     * @return the summed count, or 0 when no counter matches
     */
    private double sumCounters(String meterName, String serviceName) {
        double total = 0;
        for (Counter counter : meterRegistry.find(meterName)
                .tag("service", serviceName)
                .counters()) {
            total += counter.count();
        }
        return total;
    }

    /**
     * Sets the snowball flag, the matching alert and the root-cause service on {@code health}.
     *
     * @param health summary whose service metrics and abnormal-service map are already filled
     */
    private void analyzeFailurePropagation(CallChainHealth health) {
        Map<String, ServiceMetrics> metrics = health.getServiceMetrics();

        // Only response time counts towards the snowball check; error rate does not.
        long slowServices = metrics.values().stream()
                .filter(m -> m.getAvgResponseTime() > SLOW_RESPONSE_THRESHOLD_MS)
                .count();

        if (slowServices >= SNOWBALL_SERVICE_COUNT) {
            health.setSnowballEffect(true);
            health.addAlert("检测到雪崩效应:" + slowServices + "个服务响应异常");
        }

        String rootCauseService = identifyRootCause(metrics, health.getAbnormalServices());
        health.setRootCauseService(rootCauseService);

        // NOTE(review): `log` is not declared anywhere in this class as shown; presumably it
        // comes from Lombok's @Slf4j or a logger field outside this excerpt. Confirm.
        log.warn("调用链分析完成,故障源头: {}, 雪崩效应: {}",
                rootCauseService, health.isSnowballEffect());
    }

    /**
     * Picks the abnormal service with the highest severity score.
     *
     * <p>Only services already flagged as abnormal are candidates, so a fully healthy call
     * chain reports {@code "unknown"} instead of naming an arbitrary healthy service.
     *
     * @param metrics          metrics per service name
     * @param abnormalServices services flagged as abnormal, keyed by service name
     * @return the name of the highest-scoring abnormal service, or {@code "unknown"}
     */
    private String identifyRootCause(Map<String, ServiceMetrics> metrics,
                                     Map<String, String> abnormalServices) {
        return metrics.entrySet().stream()
                .filter(entry -> abnormalServices.containsKey(entry.getKey()))
                .max(Comparator.comparingDouble(entry -> severityScore(entry.getValue())))
                .map(Map.Entry::getKey)
                .orElse(UNKNOWN_ROOT_CAUSE);
    }

    /** Severity score: average response time scaled up by the error rate. */
    private static double severityScore(ServiceMetrics metrics) {
        return metrics.getAvgResponseTime() * (1 + metrics.getErrorRate());
    }

    /** Result of one call-chain analysis run. */
    @Data
    public static class CallChainHealth {
        /** Metrics per service name. */
        private Map<String, ServiceMetrics> serviceMetrics;
        /** Abnormal services mapped to their reason(s); multiple reasons are joined by "; ". */
        private Map<String, String> abnormalServices = new HashMap<>();
        /** Alerts raised during the analysis. */
        private List<String> alerts = new ArrayList<>();
        /** True when enough services are slow at the same time. */
        private boolean snowballEffect;
        /** Name of the service judged to be the failure source, or "unknown". */
        private String rootCauseService;

        /**
         * Records why a service is considered abnormal.
         *
         * <p>A second reason for the same service is appended rather than replacing the
         * first, so no finding is lost when a service breaches more than one threshold.
         *
         * @param serviceName name of the abnormal service
         * @param reason      description of the breach
         */
        public void addAbnormalService(String serviceName, String reason) {
            String existing = abnormalServices.get(serviceName);
            abnormalServices.put(serviceName, existing == null ? reason : existing + "; " + reason);
        }

        /**
         * Appends an alert message.
         *
         * @param alert alert text to record
         */
        public void addAlert(String alert) {
            alerts.add(alert);
        }
    }

    /** Metrics gathered for a single service. */
    @Data
    public static class ServiceMetrics {
        /** Mean response time in milliseconds across all matching timers. */
        private double avgResponseTime;
        /** Largest maximum reported by any matching timer, in milliseconds. */
        private double maxResponseTime;
        /** Error count divided by success + error count; 0 when there were no requests. */
        private double errorRate;
        /** Success + error counter total used as the error-rate denominator. */
        private long requestCount;
    }
}
|