ai-servers/llm-gateway/internal/provider/health.go
Ray Andrew 90adf6f3a8
feat(gateway): add circuit breaker, retry, and concurrency limit support
feat(gateway): add debug logging with file storage and retention

feat(gateway): add audit logging for user actions

feat(gateway): add request ID tracking and rate limit headers

feat(gateway): add model aliases and load balancing strategies

feat(gateway): add config hot-reload via SIGHUP

feat(gateway): add CORS support

feat(gateway): add data export API and dashboard endpoints

feat(gateway): add dashboard pages for audit and debug logs

feat(gateway): add concurrent request limiting per token

feat(gateway): add streaming timeout support

feat(gateway): add migration support for new schema fields
2026-02-15 04:21:40 -06:00

254 lines
6 KiB
Go

package provider
import (
	"sort"
	"sync"
	"time"

	"llm-gateway/internal/config"
)
// CircuitState enumerates the phases a provider's circuit breaker can be in.
type CircuitState int

// Breaker phases, numbered from zero in declaration order.
const (
	CircuitClosed   CircuitState = iota // traffic flows normally
	CircuitOpen                         // requests are rejected
	CircuitHalfOpen                     // a probe request is being tried
)

// String returns the lowercase label for s ("closed", "open" or
// "half-open"). Any value outside the declared constants yields "unknown".
func (s CircuitState) String() string {
	labels := [...]string{
		CircuitClosed:   "closed",
		CircuitOpen:     "open",
		CircuitHalfOpen: "half-open",
	}
	if s < 0 || int(s) >= len(labels) {
		return "unknown"
	}
	return labels[s]
}
// ProviderCircuit tracks circuit breaker state for a single provider.
// HealthTracker keeps one instance per provider name and updates it from
// evaluateCircuit while its lock is held.
type ProviderCircuit struct {
// State is the breaker's current phase.
State CircuitState
// OpenedAt is stamped each time the circuit moves to CircuitOpen; the
// cooldown is measured from this instant.
OpenedAt time.Time
// LastProbe is stamped when an open circuit's cooldown has elapsed and the
// next recorded outcome is evaluated as the probe.
LastProbe time.Time
}
// HealthEvent represents a single request outcome for a provider.
// Record creates one per call and appends it to the provider's window.
type HealthEvent struct {
// Timestamp is when the outcome was recorded (time.Now() inside Record).
Timestamp time.Time
// LatencyMS is the latency value passed to Record; presumably milliseconds,
// as the name suggests.
LatencyMS int64
// IsError is true when Record was given a non-nil error.
IsError bool
// ErrorMsg holds err.Error() for failed outcomes and is empty on success.
ErrorMsg string
}
// ProviderHealth is the computed health status for a provider.
// Values are derived by Status from the events still inside the sliding
// window at the time of the call.
type ProviderHealth struct {
// Provider is the provider name used as the tracking key.
Provider string `json:"provider"`
// Status is "down" at an error rate of 0.5 or more, "degraded" at 0.1 or
// more, and "healthy" otherwise.
Status string `json:"status"` // healthy, degraded, down
// ErrorRate is Errors divided by Total.
ErrorRate float64 `json:"error_rate"`
// AvgLatency is the mean LatencyMS over the counted events.
AvgLatency float64 `json:"avg_latency_ms"`
// Total is the number of events inside the window.
Total int `json:"total"`
// Errors is the number of those events flagged as errors.
Errors int `json:"errors"`
// CircuitState is the breaker state's string form; "closed" when the
// provider has no circuit entry yet.
CircuitState string `json:"circuit_state"`
}
// HealthTracker tracks per-provider health using a sliding window.
// It also owns one circuit breaker per provider, driven by the outcomes
// passed to Record.
type HealthTracker struct {
// mu guards windows and circuits: Record takes the write lock, IsAvailable
// and Status take the read lock.
mu sync.RWMutex
// windows maps a provider name to its recorded events in append order.
windows map[string][]HealthEvent
// windowDu is the sliding window length; older events are pruned or ignored.
windowDu time.Duration
// circuits maps a provider name to its breaker; entries are created lazily
// by evaluateCircuit.
circuits map[string]*ProviderCircuit
// cbConfig holds the circuit breaker settings read by this type (Enabled,
// MinRequests, ErrorThreshold, CooldownDuration).
cbConfig config.CircuitBreakerConfig
}
// NewHealthTracker creates a health tracker with the given window duration
// and circuit breaker settings.
//
// A non-positive window falls back to 5 minutes. A negative window would
// place the pruning cutoff in the future and discard every event as soon as
// it is recorded, so it is treated the same as an unset (zero) window.
func NewHealthTracker(window time.Duration, cbCfg config.CircuitBreakerConfig) *HealthTracker {
	if window <= 0 {
		window = 5 * time.Minute
	}
	return &HealthTracker{
		windows:  make(map[string][]HealthEvent),
		circuits: make(map[string]*ProviderCircuit),
		windowDu: window,
		cbConfig: cbCfg,
	}
}
// IsAvailable reports whether the circuit breaker currently lets requests
// through to provider.
//
// It returns true when the breaker is disabled, when the provider has no
// circuit yet, and for every state other than CircuitOpen. An open circuit
// is available only once CooldownDuration has passed since it opened. The
// method never changes state; transitions happen when an outcome is recorded.
func (h *HealthTracker) IsAvailable(provider string) bool {
	if !h.cbConfig.Enabled {
		return true
	}

	h.mu.RLock()
	defer h.mu.RUnlock()

	breaker, tracked := h.circuits[provider]
	if !tracked || breaker.State != CircuitOpen {
		// Unknown providers count as closed; closed and half-open both admit traffic.
		return true
	}
	// Open: admit the probe only after the cooldown has run out.
	return time.Since(breaker.OpenedAt) >= h.cbConfig.CooldownDuration
}
// Record stores one request outcome for provider and lets the circuit
// breaker react to it.
//
// latencyMS is stored as given; a non-nil err marks the outcome as failed
// and its message is kept. Under the write lock the event is appended to the
// provider's window, expired events are pruned, and, when the breaker is
// enabled, the circuit state is re-evaluated with err.
func (h *HealthTracker) Record(provider string, latencyMS int64, err error) {
	failed := err != nil
	outcome := HealthEvent{Timestamp: time.Now(), LatencyMS: latencyMS, IsError: failed}
	if failed {
		outcome.ErrorMsg = err.Error()
	}

	h.mu.Lock()
	defer h.mu.Unlock()

	h.windows[provider] = append(h.windows[provider], outcome)
	h.prune(provider)
	if !h.cbConfig.Enabled {
		return
	}
	h.evaluateCircuit(provider, err)
}
// evaluateCircuit advances the circuit breaker for providerName after a
// request outcome has been appended to its window. lastErr is that outcome's
// error (nil on success). Must be called with lock held.
//
// Transitions:
//   - closed: opens once at least MinRequests outcomes have been counted and
//     their error rate reaches ErrorThreshold. Only outcomes recorded at or
//     after the last resolved probe are counted, so the failures that tripped
//     the breaker cannot re-open it right after a successful probe.
//   - open: outcomes are ignored until CooldownDuration has passed since
//     OpenedAt; the first outcome after that is treated as the probe.
//   - half-open: the outcome is the probe result.
//
// A successful probe closes the circuit; a failed one re-opens it and
// restarts the cooldown.
func (h *HealthTracker) evaluateCircuit(providerName string, lastErr error) {
	circuit, ok := h.circuits[providerName]
	if !ok {
		circuit = &ProviderCircuit{State: CircuitClosed}
		h.circuits[providerName] = circuit
	}

	now := time.Now()
	switch circuit.State {
	case CircuitClosed:
		// LastProbe is the zero time until a probe has been resolved, in
		// which case the whole window is counted.
		errorRate, total := h.errorRateSince(providerName, circuit.LastProbe)
		if total >= h.cbConfig.MinRequests && errorRate >= h.cbConfig.ErrorThreshold {
			circuit.State = CircuitOpen
			circuit.OpenedAt = now
		}
	case CircuitOpen:
		if now.Sub(circuit.OpenedAt) >= h.cbConfig.CooldownDuration {
			circuit.settleProbe(lastErr, now)
		}
	case CircuitHalfOpen:
		circuit.settleProbe(lastErr, now)
	}
}

// settleProbe applies a probe outcome observed at now: success closes the
// circuit, failure re-opens it with a fresh cooldown. LastProbe is stamped in
// both cases.
func (c *ProviderCircuit) settleProbe(probeErr error, now time.Time) {
	c.LastProbe = now
	if probeErr == nil {
		c.State = CircuitClosed
		return
	}
	c.State = CircuitOpen
	c.OpenedAt = now
}

// errorRateSince returns the error fraction and the number of events for
// provider that lie inside the sliding window and are not older than floor.
// A zero floor adds no restriction. It returns (0, 0) when no event
// qualifies. Must be called with lock held.
func (h *HealthTracker) errorRateSince(provider string, floor time.Time) (float64, int) {
	cutoff := time.Now().Add(-h.windowDu)
	if floor.After(cutoff) {
		cutoff = floor
	}
	var total, errors int
	for _, e := range h.windows[provider] {
		if e.Timestamp.Before(cutoff) {
			continue
		}
		total++
		if e.IsError {
			errors++
		}
	}
	if total == 0 {
		return 0, 0
	}
	return float64(errors) / float64(total), total
}

// errorRateUnlocked computes the error rate and event count over the whole
// sliding window. Must be called with lock held.
func (h *HealthTracker) errorRateUnlocked(provider string) (float64, int) {
	return h.errorRateSince(provider, time.Time{})
}
// Status returns computed health for all tracked providers.
//
// Only events inside the sliding window are counted; a provider with no such
// events is omitted. Status is "down" at an error rate of 0.5 or more,
// "degraded" at 0.1 or more and "healthy" otherwise. CircuitState is the
// breaker state's string form, or "closed" when the provider has no circuit.
//
// The result is never nil, so it JSON-encodes as [] rather than null when
// nothing is tracked. It is sorted by provider name so repeated calls give a
// stable order despite random map iteration.
func (h *HealthTracker) Status() []ProviderHealth {
	h.mu.RLock()
	defer h.mu.RUnlock()

	cutoff := time.Now().Add(-h.windowDu)
	results := make([]ProviderHealth, 0, len(h.windows))
	for provider, events := range h.windows {
		var total, errors int
		var totalLatency int64
		for _, e := range events {
			if e.Timestamp.Before(cutoff) {
				continue
			}
			total++
			totalLatency += e.LatencyMS
			if e.IsError {
				errors++
			}
		}
		if total == 0 {
			continue
		}

		errorRate := float64(errors) / float64(total)
		status := "healthy"
		switch {
		case errorRate >= 0.5:
			status = "down"
		case errorRate >= 0.1:
			status = "degraded"
		}

		circuitState := "closed"
		if circuit, ok := h.circuits[provider]; ok {
			circuitState = circuit.State.String()
		}

		results = append(results, ProviderHealth{
			Provider:     provider,
			Status:       status,
			ErrorRate:    errorRate,
			AvgLatency:   float64(totalLatency) / float64(total),
			Total:        total,
			Errors:       errors,
			CircuitState: circuitState,
		})
	}

	sort.Slice(results, func(i, j int) bool {
		return results[i].Provider < results[j].Provider
	})
	return results
}
// prune drops the leading run of provider's events that are older than the
// sliding window. Scanning stops at the first event that is still inside the
// window, so only a prefix is ever removed. The stored slice is left
// untouched when nothing at the front has expired. Must be called with lock
// held.
func (h *HealthTracker) prune(provider string) {
	recorded := h.windows[provider]
	oldest := time.Now().Add(-h.windowDu)

	keepFrom := len(recorded)
	for idx := range recorded {
		if !recorded[idx].Timestamp.Before(oldest) {
			keepFrom = idx
			break
		}
	}
	if keepFrom == 0 {
		return
	}
	h.windows[provider] = recorded[keepFrom:]
}