264 lines
6.3 KiB
Go
264 lines
6.3 KiB
Go
package provider
|
|
|
|
import (
	"sort"
	"sync"
	"time"

	"llm-gateway/internal/config"
)
|
|
|
|
// CircuitState enumerates the positions a provider's circuit breaker can be in.
type CircuitState int

// Circuit breaker states, numbered from zero in declaration order.
const (
	// CircuitClosed is normal operation: requests flow through.
	CircuitClosed CircuitState = iota
	// CircuitOpen means requests to the provider are being blocked.
	CircuitOpen
	// CircuitHalfOpen means the provider is being tested with a probe request.
	CircuitHalfOpen
)

// String returns the lowercase name of the state ("closed", "open" or
// "half-open"). Any value outside the declared constants yields "unknown".
func (s CircuitState) String() string {
	// Name table indexed by the state's numeric value.
	names := [...]string{
		CircuitClosed:   "closed",
		CircuitOpen:     "open",
		CircuitHalfOpen: "half-open",
	}
	if s < 0 || int(s) >= len(names) {
		return "unknown"
	}
	return names[s]
}
|
|
|
|
// ProviderCircuit tracks circuit breaker state for a single provider.
|
|
type ProviderCircuit struct {
|
|
State CircuitState
|
|
OpenedAt time.Time
|
|
LastProbe time.Time
|
|
}
|
|
|
|
// HealthEvent is one recorded request outcome for a provider.
type HealthEvent struct {
	// Timestamp is when the outcome was recorded.
	Timestamp time.Time
	// LatencyMS is the request latency in milliseconds, as reported by the caller.
	LatencyMS int64
	// IsError is true when the request ended with a non-nil error.
	IsError bool
	// ErrorMsg carries the error text for failed requests and is empty otherwise.
	ErrorMsg string
}
|
|
|
|
// ProviderHealth is the health summary computed for one provider over the
// tracker's sliding window. It serializes to JSON using the tags below.
type ProviderHealth struct {
	// Provider is the provider's name.
	Provider string `json:"provider"`
	// Status is one of "healthy", "degraded" or "down".
	Status string `json:"status"`
	// ErrorRate is Errors divided by Total.
	ErrorRate float64 `json:"error_rate"`
	// AvgLatency is the mean latency, in milliseconds, of the counted events.
	AvgLatency float64 `json:"avg_latency_ms"`
	// Total is the number of events counted inside the window.
	Total int `json:"total"`
	// Errors is how many of those events were failures.
	Errors int `json:"errors"`
	// CircuitState is the textual circuit breaker state for the provider.
	CircuitState string `json:"circuit_state"`
}
|
|
|
|
// HealthTracker tracks per-provider health using a sliding window.
|
|
type HealthTracker struct {
|
|
mu sync.RWMutex
|
|
windows map[string][]HealthEvent
|
|
windowDu time.Duration
|
|
circuits map[string]*ProviderCircuit
|
|
cbConfig config.CircuitBreakerConfig
|
|
OnStateChange func(provider string, from, to CircuitState)
|
|
}
|
|
|
|
// NewHealthTracker creates a health tracker with the given window duration.
|
|
func NewHealthTracker(window time.Duration, cbCfg config.CircuitBreakerConfig) *HealthTracker {
|
|
if window == 0 {
|
|
window = 5 * time.Minute
|
|
}
|
|
return &HealthTracker{
|
|
windows: make(map[string][]HealthEvent),
|
|
circuits: make(map[string]*ProviderCircuit),
|
|
windowDu: window,
|
|
cbConfig: cbCfg,
|
|
}
|
|
}
|
|
|
|
// IsAvailable returns true if the provider's circuit breaker allows requests.
|
|
func (h *HealthTracker) IsAvailable(provider string) bool {
|
|
if !h.cbConfig.Enabled {
|
|
return true
|
|
}
|
|
|
|
h.mu.RLock()
|
|
defer h.mu.RUnlock()
|
|
|
|
circuit, ok := h.circuits[provider]
|
|
if !ok {
|
|
return true // no circuit = closed = available
|
|
}
|
|
|
|
switch circuit.State {
|
|
case CircuitOpen:
|
|
// Check if cooldown has elapsed -> transition to half-open
|
|
if time.Since(circuit.OpenedAt) >= h.cbConfig.CooldownDuration {
|
|
return true // will transition to half-open on next record
|
|
}
|
|
return false
|
|
case CircuitHalfOpen:
|
|
return true // allow probe
|
|
default:
|
|
return true
|
|
}
|
|
}
|
|
|
|
// Record adds a health event for a provider and evaluates circuit transitions.
|
|
func (h *HealthTracker) Record(provider string, latencyMS int64, err error) {
|
|
event := HealthEvent{
|
|
Timestamp: time.Now(),
|
|
LatencyMS: latencyMS,
|
|
IsError: err != nil,
|
|
}
|
|
if err != nil {
|
|
event.ErrorMsg = err.Error()
|
|
}
|
|
|
|
h.mu.Lock()
|
|
defer h.mu.Unlock()
|
|
|
|
h.windows[provider] = append(h.windows[provider], event)
|
|
h.prune(provider)
|
|
|
|
if h.cbConfig.Enabled {
|
|
h.evaluateCircuit(provider, err)
|
|
}
|
|
}
|
|
|
|
// evaluateCircuit transitions circuit breaker state. Must be called with lock held.
|
|
func (h *HealthTracker) evaluateCircuit(providerName string, lastErr error) {
|
|
circuit, ok := h.circuits[providerName]
|
|
if !ok {
|
|
circuit = &ProviderCircuit{State: CircuitClosed}
|
|
h.circuits[providerName] = circuit
|
|
}
|
|
|
|
prevState := circuit.State
|
|
|
|
switch circuit.State {
|
|
case CircuitClosed:
|
|
// Check if error threshold exceeded
|
|
errorRate, total := h.errorRateUnlocked(providerName)
|
|
if total >= h.cbConfig.MinRequests && errorRate >= h.cbConfig.ErrorThreshold {
|
|
circuit.State = CircuitOpen
|
|
circuit.OpenedAt = time.Now()
|
|
}
|
|
case CircuitOpen:
|
|
// Check if cooldown elapsed -> half-open
|
|
if time.Since(circuit.OpenedAt) >= h.cbConfig.CooldownDuration {
|
|
circuit.State = CircuitHalfOpen
|
|
circuit.LastProbe = time.Now()
|
|
// Evaluate the probe result immediately
|
|
if lastErr == nil {
|
|
circuit.State = CircuitClosed
|
|
} else {
|
|
circuit.State = CircuitOpen
|
|
circuit.OpenedAt = time.Now()
|
|
}
|
|
}
|
|
case CircuitHalfOpen:
|
|
if lastErr == nil {
|
|
circuit.State = CircuitClosed
|
|
} else {
|
|
circuit.State = CircuitOpen
|
|
circuit.OpenedAt = time.Now()
|
|
}
|
|
}
|
|
|
|
if circuit.State != prevState && h.OnStateChange != nil {
|
|
cb := h.OnStateChange
|
|
from, to := prevState, circuit.State
|
|
// Call outside lock to avoid deadlocks
|
|
go cb(providerName, from, to)
|
|
}
|
|
}
|
|
|
|
// errorRateUnlocked computes error rate within window. Must be called with lock held.
|
|
func (h *HealthTracker) errorRateUnlocked(provider string) (float64, int) {
|
|
cutoff := time.Now().Add(-h.windowDu)
|
|
events := h.windows[provider]
|
|
var total, errors int
|
|
for _, e := range events {
|
|
if e.Timestamp.Before(cutoff) {
|
|
continue
|
|
}
|
|
total++
|
|
if e.IsError {
|
|
errors++
|
|
}
|
|
}
|
|
if total == 0 {
|
|
return 0, 0
|
|
}
|
|
return float64(errors) / float64(total), total
|
|
}
|
|
|
|
// Status returns computed health for all tracked providers.
|
|
func (h *HealthTracker) Status() []ProviderHealth {
|
|
h.mu.RLock()
|
|
defer h.mu.RUnlock()
|
|
|
|
cutoff := time.Now().Add(-h.windowDu)
|
|
var results []ProviderHealth
|
|
|
|
for provider, events := range h.windows {
|
|
var total, errors int
|
|
var totalLatency int64
|
|
|
|
for _, e := range events {
|
|
if e.Timestamp.Before(cutoff) {
|
|
continue
|
|
}
|
|
total++
|
|
totalLatency += e.LatencyMS
|
|
if e.IsError {
|
|
errors++
|
|
}
|
|
}
|
|
|
|
if total == 0 {
|
|
continue
|
|
}
|
|
|
|
errorRate := float64(errors) / float64(total)
|
|
status := "healthy"
|
|
if errorRate >= 0.5 {
|
|
status = "down"
|
|
} else if errorRate >= 0.1 {
|
|
status = "degraded"
|
|
}
|
|
|
|
circuitState := "closed"
|
|
if circuit, ok := h.circuits[provider]; ok {
|
|
circuitState = circuit.State.String()
|
|
}
|
|
|
|
results = append(results, ProviderHealth{
|
|
Provider: provider,
|
|
Status: status,
|
|
ErrorRate: errorRate,
|
|
AvgLatency: float64(totalLatency) / float64(total),
|
|
Total: total,
|
|
Errors: errors,
|
|
CircuitState: circuitState,
|
|
})
|
|
}
|
|
|
|
return results
|
|
}
|
|
|
|
// prune removes events outside the window. Must be called with lock held.
|
|
func (h *HealthTracker) prune(provider string) {
|
|
cutoff := time.Now().Add(-h.windowDu)
|
|
events := h.windows[provider]
|
|
i := 0
|
|
for i < len(events) && events[i].Timestamp.Before(cutoff) {
|
|
i++
|
|
}
|
|
if i > 0 {
|
|
h.windows[provider] = events[i:]
|
|
}
|
|
}
|