ai-servers/llm-gateway/internal/provider/health.go

121 lines
2.6 KiB
Go

package provider
import (
	"sort"
	"sync"
	"time"
)
// HealthEvent captures the outcome of one request made to a provider.
type HealthEvent struct {
	// Timestamp is the moment the outcome was recorded.
	Timestamp time.Time
	// LatencyMS is the request latency in milliseconds.
	LatencyMS int64
	// IsError reports whether the request ended in an error.
	IsError bool
	// ErrorMsg carries the error text for failed requests and is empty
	// for successful ones.
	ErrorMsg string
}
// ProviderHealth is the health summary computed for one provider from the
// events that fall inside the tracker's sliding window.
type ProviderHealth struct {
	// Provider is the name the events were recorded under.
	Provider string `json:"provider"`
	// Status is one of "healthy", "degraded" or "down".
	Status string `json:"status"`
	// ErrorRate is Errors divided by Total, in the range [0, 1].
	ErrorRate float64 `json:"error_rate"`
	// AvgLatency is the mean latency, in milliseconds, of the counted events.
	AvgLatency float64 `json:"avg_latency_ms"`
	// Total is the number of events counted inside the window.
	Total int `json:"total"`
	// Errors is how many of the counted events were errors.
	Errors int `json:"errors"`
}
// HealthTracker keeps a sliding window of recent request outcomes for each
// provider and derives health summaries from them.
type HealthTracker struct {
	// mu guards windows.
	mu sync.RWMutex
	// windows holds the retained events, keyed by provider name.
	windows map[string][]HealthEvent
	// windowDu is the length of the sliding window; older events are
	// ignored and eventually pruned.
	windowDu time.Duration
}
// NewHealthTracker returns a HealthTracker that evaluates provider health
// over a sliding window of the given length.
//
// A zero or negative window falls back to the default of five minutes. A
// negative duration would otherwise put the cutoff in the future, so every
// recorded event would count as expired and no provider would ever be
// reported.
func NewHealthTracker(window time.Duration) *HealthTracker {
	if window <= 0 {
		window = 5 * time.Minute
	}
	return &HealthTracker{
		windows:  make(map[string][]HealthEvent),
		windowDu: window,
	}
}
// Record stores the outcome of one request to provider.
//
// latencyMS is the request latency in milliseconds. A non-nil err marks the
// event as an error and its message is kept alongside it. After the event is
// appended, entries for this provider that have fallen out of the window are
// pruned.
func (h *HealthTracker) Record(provider string, latencyMS int64, err error) {
	// The timestamp and error text are captured before the lock is taken.
	stamp := time.Now()
	failed := err != nil
	var msg string
	if failed {
		msg = err.Error()
	}

	h.mu.Lock()
	defer h.mu.Unlock()

	h.windows[provider] = append(h.windows[provider], HealthEvent{
		Timestamp: stamp,
		LatencyMS: latencyMS,
		IsError:   failed,
		ErrorMsg:  msg,
	})
	h.prune(provider)
}
// Status computes the current health of every provider that has at least
// one event inside the sliding window.
//
// Events older than the window are skipped even if they have not been pruned
// yet, and providers with no in-window events are omitted. A provider is
// reported as "down" when its error rate is at least 0.5, "degraded" when it
// is at least 0.1, and "healthy" otherwise.
//
// The result is sorted by provider name so repeated calls return a stable
// order (map iteration order is randomized). It is never nil, so encoding an
// empty result as JSON yields [] rather than null.
func (h *HealthTracker) Status() []ProviderHealth {
	h.mu.RLock()
	defer h.mu.RUnlock()

	cutoff := time.Now().Add(-h.windowDu)
	results := make([]ProviderHealth, 0, len(h.windows))
	for provider, events := range h.windows {
		var total, errCount int
		var totalLatency int64
		for _, e := range events {
			// Pruning only happens in Record for the recorded provider, so
			// expired events may still be present here.
			if e.Timestamp.Before(cutoff) {
				continue
			}
			total++
			totalLatency += e.LatencyMS
			if e.IsError {
				errCount++
			}
		}
		if total == 0 {
			continue
		}

		errorRate := float64(errCount) / float64(total)
		status := "healthy"
		switch {
		case errorRate >= 0.5:
			status = "down"
		case errorRate >= 0.1:
			status = "degraded"
		}

		results = append(results, ProviderHealth{
			Provider:   provider,
			Status:     status,
			ErrorRate:  errorRate,
			AvgLatency: float64(totalLatency) / float64(total),
			Total:      total,
			Errors:     errCount,
		})
	}

	sort.Slice(results, func(i, j int) bool {
		return results[i].Provider < results[j].Provider
	})
	return results
}
// prune drops the leading events of provider's window that are older than
// the sliding window. It stops at the first event that is still inside the
// window, so it relies on events being stored oldest-first.
//
// The caller must hold h.mu for writing.
func (h *HealthTracker) prune(provider string) {
	window := h.windows[provider]
	threshold := time.Now().Add(-h.windowDu)

	expired := 0
	for _, ev := range window {
		if !ev.Timestamp.Before(threshold) {
			break
		}
		expired++
	}
	if expired == 0 {
		// Nothing to drop; leave the map untouched.
		return
	}
	h.windows[provider] = window[expired:]
}