feat: replace LiteLLM with new-api for LLM proxy and add monitoring stack

Ray Andrew 2026-02-14 12:54:58 -06:00
parent 718cf928f3
commit 964f9e43cd
Signed by: rayandrew
SSH key fingerprint: SHA256:EUCV+qCSqkap8rR+p+zGjxHfKI06G0GJKgo1DIOniQY
6 changed files with 447 additions and 41 deletions

.env.example

@@ -4,9 +4,9 @@
 # cp .env.example .env
 # ============================================
 
-# --- LiteLLM ---
-LITELLM_MASTER_KEY=sk-change-me-to-a-random-string
-LITELLM_DB_PASSWORD=change-me-to-a-random-string
+# --- new-api (LLM proxy) ---
+# Admin access token for new-api management API (also used by init-channels.sh)
+NEW_API_ACCESS_TOKEN=change-me-to-a-random-string
 OPENROUTER_API_KEY=sk-or-...
 SILICONFLOW_API_KEY=sk-...
 DEEPINFRA_API_KEY=...
@@ -14,9 +14,12 @@ GROQ_API_KEY=gsk_...
 CEREBRAS_API_KEY=...
 
 # --- Open WebUI ---
-# Virtual key from LiteLLM (create in LiteLLM UI → Virtual Keys)
+# API token created in new-api (or via init-channels.sh)
 OPENWEBUI_API_KEY=sk-...
 
+# --- Grafana ---
+GRAFANA_ADMIN_PASSWORD=change-me-to-a-secure-password
+
 # --- Cloudflare Tunnel ---
 # Create a tunnel in Cloudflare Zero Trust dashboard → Networks → Tunnels
 # Copy the token from the tunnel install command

docker-compose.yml

@@ -38,48 +38,68 @@ services:
       - ANONYMIZED_TELEMETRY=FALSE
     restart: unless-stopped
 
-  # ── Database for LiteLLM ──
-  litellm-db:
-    image: postgres:16-alpine
-    volumes:
-      - litellm-db-data:/var/lib/postgresql/data
-    environment:
-      - POSTGRES_DB=litellm
-      - POSTGRES_USER=litellm
-      - POSTGRES_PASSWORD=${LITELLM_DB_PASSWORD}
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD-SHELL", "pg_isready -U litellm"]
-      interval: 10s
-      timeout: 3s
-      retries: 3
+  # # ── Database for LiteLLM (DEPRECATED — kept for rollback) ──
+  # litellm-db:
+  #   image: postgres:16-alpine
+  #   volumes:
+  #     - litellm-db-data:/var/lib/postgresql/data
+  #   environment:
+  #     - POSTGRES_DB=litellm
+  #     - POSTGRES_USER=litellm
+  #     - POSTGRES_PASSWORD=${LITELLM_DB_PASSWORD}
+  #   restart: unless-stopped
+  #   healthcheck:
+  #     test: ["CMD-SHELL", "pg_isready -U litellm"]
+  #     interval: 10s
+  #     timeout: 3s
+  #     retries: 3
 
-  # ── LLM API proxy ──
-  litellm:
-    image: ghcr.io/berriai/litellm:main-latest
-    command: ["--config", "/app/config.yaml", "--port", "4000"]
-    volumes:
-      - ./litellm/config.yaml:/app/config.yaml:ro
+  # # ── LLM API proxy (DEPRECATED — replaced by new-api) ──
+  # litellm:
+  #   image: ghcr.io/berriai/litellm:main-latest
+  #   command: ["--config", "/app/config.yaml", "--port", "4000"]
+  #   volumes:
+  #     - ./litellm/config.yaml:/app/config.yaml:ro
+  #   ports:
+  #     - "0.0.0.0:4000:4000"
+  #   environment:
+  #     - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY}
+  #     - DATABASE_URL=postgresql://litellm:${LITELLM_DB_PASSWORD}@litellm-db:5432/litellm
+  #     - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
+  #     - SILICONFLOW_API_KEY=${SILICONFLOW_API_KEY}
+  #     - DEEPINFRA_API_KEY=${DEEPINFRA_API_KEY}
+  #     - GROQ_API_KEY=${GROQ_API_KEY}
+  #     - CEREBRAS_API_KEY=${CEREBRAS_API_KEY}
+  #   depends_on:
+  #     litellm-db:
+  #       condition: service_healthy
+  #   restart: unless-stopped
+  #   healthcheck:
+  #     test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:4000/health/liveliness')"]
+  #     interval: 15s
+  #     timeout: 5s
+  #     retries: 5
+  #     start_period: 30s
+
+  # ── LLM API proxy (new-api) ──
+  new-api:
+    image: calciumion/new-api:latest
     ports:
-      - "0.0.0.0:4000:4000"
+      - "0.0.0.0:4000:3000"
+    volumes:
+      - new-api-data:/data
     environment:
-      - LITELLM_MASTER_KEY=${LITELLM_MASTER_KEY}
-      - DATABASE_URL=postgresql://litellm:${LITELLM_DB_PASSWORD}@litellm-db:5432/litellm
-      - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
-      - SILICONFLOW_API_KEY=${SILICONFLOW_API_KEY}
-      - DEEPINFRA_API_KEY=${DEEPINFRA_API_KEY}
-      - GROQ_API_KEY=${GROQ_API_KEY}
-      - CEREBRAS_API_KEY=${CEREBRAS_API_KEY}
-    depends_on:
-      litellm-db:
-        condition: service_healthy
+      - SQL_DSN=
+      - TZ=UTC
+      - ENABLE_METRIC=true
+      - INITIAL_ROOT_ACCESS_TOKEN=${NEW_API_ACCESS_TOKEN}
     restart: unless-stopped
     healthcheck:
-      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:4000/health/liveliness')"]
+      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/status"]
       interval: 15s
       timeout: 5s
      retries: 5
-      start_period: 30s
+      start_period: 10s
 
   # ── Chat UI ──
   open-webui:
@@ -90,7 +110,7 @@ services:
       - "0.0.0.0:3000:8080"
     environment:
       - OLLAMA_BASE_URL=
-      - OPENAI_API_BASE_URL=http://litellm:4000/v1
+      - OPENAI_API_BASE_URL=http://new-api:3000/v1
       - OPENAI_API_KEY=${OPENWEBUI_API_KEY}
       - ENABLE_RAG_WEB_SEARCH=true
       - RAG_WEB_SEARCH_ENGINE=searxng
@@ -99,7 +119,7 @@ services:
       - CHROMA_HTTP_PORT=8000
       - WEBUI_AUTH=true
     depends_on:
-      litellm:
+      new-api:
         condition: service_healthy
     restart: unless-stopped
@@ -128,9 +148,69 @@ services:
     restart: unless-stopped
     network_mode: host
 
+  # ═══════════════════════════════════════════════
+  # Monitoring stack
+  # ═══════════════════════════════════════════════
+
+  # ── Metrics store (Prometheus-compatible) ──
+  victoriametrics:
+    image: victoriametrics/victoria-metrics:latest
+    volumes:
+      - victoriametrics-data:/victoria-metrics-data
+      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+    command:
+      - "-promscrape.config=/etc/prometheus/prometheus.yml"
+      - "-retentionPeriod=90d"
+      - "-storageDataPath=/victoria-metrics-data"
+    ports:
+      - "127.0.0.1:8428:8428"
+    restart: unless-stopped
+
+  # ── Dashboards ──
+  grafana:
+    image: grafana/grafana:latest
+    volumes:
+      - grafana-data:/var/lib/grafana
+    ports:
+      - "0.0.0.0:3001:3000"
+    environment:
+      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
+      - GF_USERS_ALLOW_SIGN_UP=false
+    depends_on:
+      - victoriametrics
+    restart: unless-stopped
+
+  # ── Host system metrics ──
+  node-exporter:
+    image: prom/node-exporter:latest
+    pid: host
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - "--path.procfs=/host/proc"
+      - "--path.sysfs=/host/sys"
+      - "--path.rootfs=/rootfs"
+      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
+    restart: unless-stopped
+
+  # ── Valkey/Redis metrics ──
+  redis-exporter:
+    image: oliver006/redis_exporter:latest
+    environment:
+      - REDIS_ADDR=redis://valkey:6379
+    depends_on:
+      valkey:
+        condition: service_healthy
+    restart: unless-stopped
+
 volumes:
   valkey-data:
   chromadb-data:
   litellm-db-data:
+  new-api-data:
   open-webui-data:
   tailscale-state:
+  victoriametrics-data:
+  grafana-data:

monitoring/prometheus.yml Normal file

@@ -0,0 +1,15 @@
global:
  scrape_interval: 30s

scrape_configs:
  - job_name: 'new-api'
    static_configs:
      - targets: ['new-api:3000']
  - job_name: 'node'
    static_configs:
      - targets: ['node-exporter:9100']
  - job_name: 'valkey'
    static_configs:
      - targets: ['redis-exporter:9121']
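
Once the stack is up, a quick way to confirm all three jobs are being scraped (a sketch: run it on the host itself, since the compose file binds VictoriaMetrics to loopback only, and it assumes `jq` is installed):

```bash
# Each scrape target exports an `up` metric: "1" = reachable, "0" = down.
curl -s 'http://127.0.0.1:8428/api/v1/query?query=up' \
  | jq '.data.result[] | {job: .metric.job, up: .value[1]}'
```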

new-api/CHANNELS.md Normal file

@@ -0,0 +1,147 @@
# new-api Channel Configuration
After first start, access the new-api web UI at `http://<server>:4000` to configure channels.

Default admin credentials are `root` / `123456`; **change them immediately**.
## API Token for Open WebUI
Create an API token in new-api's token management. Use this token as `OPENWEBUI_API_KEY` in `.env`.
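
If you prefer the command line, `init-channels.sh` creates this token through the admin API; the equivalent manual call is sketched below (same endpoint and payload as the script, with `<server>` standing in for your host):

```bash
curl -s "http://<server>:4000/api/token/" \
  -H "Authorization: Bearer ${NEW_API_ACCESS_TOKEN}" \
  -H "Content-Type: application/json" \
  -d '{"name": "open-webui", "remain_quota": 0, "unlimited_quota": true}'
```

Copy the `key` field from the response into `OPENWEBUI_API_KEY`.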
## Channels to Create
Configure each channel via **Channels > Add Channel** in the web UI, or create them against the admin API as sketched below (this is what `init-channels.sh` automates).
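
For reference, a single channel-creation call looks like this (a sketch mirroring the payload `init-channels.sh` builds; note that `model_mapping` is a JSON object serialized into a string, and type `1` is the OpenAI-compatible channel type used for every channel here):

```bash
curl -s "http://<server>:4000/api/channel/" \
  -H "Authorization: Bearer ${NEW_API_ACCESS_TOKEN}" \
  -H "Content-Type: application/json" \
  -d '{
        "type": 1,
        "name": "Groq",
        "key": "'"${GROQ_API_KEY}"'",
        "base_url": "https://api.groq.com/openai/v1",
        "models": "llama-3.3-70b",
        "model_mapping": "{\"llama-3.3-70b\":\"llama-3.3-70b-versatile\"}",
        "priority": 1, "status": 1, "group": "default", "weight": 1, "auto_ban": 1
      }'
```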
### 1. DeepInfra (Priority 1)
| Field | Value |
|---|---|
| Name | DeepInfra |
| Type | OpenAI |
| Base URL | `https://api.deepinfra.com/v1/openai` |
| Key | `$DEEPINFRA_API_KEY` |
| Priority | 1 |
| Models | See model mapping below |
### 2. SiliconFlow (Priority 2)
| Field | Value |
|---|---|
| Name | SiliconFlow |
| Type | OpenAI |
| Base URL | `https://api.siliconflow.com/v1` |
| Key | `$SILICONFLOW_API_KEY` |
| Priority | 2 |
| Models | See model mapping below |
### 3. OpenRouter (Priority 3)
| Field | Value |
|---|---|
| Name | OpenRouter |
| Type | OpenAI |
| Base URL | `https://openrouter.ai/api/v1` |
| Key | `$OPENROUTER_API_KEY` |
| Priority | 3 |
| Models | See model mapping below |
### 4. Groq (Priority 1)
| Field | Value |
|---|---|
| Name | Groq |
| Type | OpenAI |
| Base URL | `https://api.groq.com/openai/v1` |
| Key | `$GROQ_API_KEY` |
| Priority | 1 |
| Models | `llama-3.3-70b` |
### 5. Cerebras (Priority 1)
| Field | Value |
|---|---|
| Name | Cerebras |
| Type | OpenAI |
| Base URL | `https://api.cerebras.ai/v1` |
| Key | `$CEREBRAS_API_KEY` |
| Priority | 1 |
| Models | `llama-3.3-70b-cerebras` |
## Model Mapping per Channel
new-api uses model aliasing: the "model name" is what clients see; the "actual model" is what is sent to the provider.
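
For example, the DeepInfra channel's `model_mapping` (as submitted by `init-channels.sh`) contains entries such as:

```json
{
  "deepseek-v3.2": "deepseek-ai/DeepSeek-V3.2",
  "gpt-oss": "openai/gpt-oss-120b",
  "kimi-k2": "moonshotai/Kimi-K2-Instruct-0905"
}
```

A request for `gpt-oss` is rewritten to `openai/gpt-oss-120b` before being forwarded to DeepInfra.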
### DeepInfra Models
| Client Model Name | Actual Provider Model |
|---|---|
| `deepseek-v3.2` | `deepseek-ai/DeepSeek-V3.2` |
| `deepseek-r1` | `deepseek-ai/DeepSeek-R1` |
| `gpt-oss` | `openai/gpt-oss-120b` |
| `gpt-oss-20b` | `openai/gpt-oss-20b` |
| `nemotron-super` | `nvidia/Llama-3.3-Nemotron-Super-49B-v1.5` |
| `nemotron-nano` | `nvidia/NVIDIA-Nemotron-Nano-9B-v2` |
| `devstral` | `mistralai/Devstral-Small-2505` |
| `glm-4.6` | `zai-org/GLM-4.6` |
| `glm-4.7` | `zai-org/GLM-4.7` |
| `glm-5` | `zai-org/GLM-5` |
| `kimi-k2` | `moonshotai/Kimi-K2-Instruct-0905` |
| `kimi-k2.5` | `moonshotai/Kimi-K2.5` |
| `deepseek-v3-free` | `deepseek-ai/DeepSeek-V3` |
### SiliconFlow Models
| Client Model Name | Actual Provider Model |
|---|---|
| `deepseek-v3.2` | `deepseek-ai/DeepSeek-V3.2` |
| `glm-4.7` | `THUDM/GLM-4-32B-0414` |
| `kimi-k2` | `moonshotai/Kimi-K2-Instruct-0905` |
| `qwen3-coder` | `Qwen/Qwen3-Coder-480B-A35B-Instruct` |
| `qwen3-coder-30b` | `Qwen/Qwen3-Coder-30B-A3B-Instruct` |
### OpenRouter Models
| Client Model Name | Actual Provider Model |
|---|---|
| `deepseek-v3.2` | `deepseek/deepseek-chat-v3-0324` |
| `deepseek-v3-free` | `deepseek/deepseek-chat-v3-0324:free` |
| `kimi-k2.5` | `moonshotai/kimi-k2.5` |
| `minimax-m2.5` | `minimax/minimax-m2.5` |
| `gpt-4.1-mini` | `openai/gpt-4.1-mini` |
| `gpt-4.1` | `openai/gpt-4.1` |
| `gemini-3-flash-preview` | `google/gemini-3-flash-preview` |
| `gemini-2.5-pro` | `google/gemini-2.5-pro-preview` |
| `claude-sonnet` | `anthropic/claude-sonnet-4` |
| `trinity-large-preview` | `arcee-ai/trinity-large-preview` |
### Groq Models
| Client Model Name | Actual Provider Model |
|---|---|
| `llama-3.3-70b` | `llama-3.3-70b-versatile` |
### Cerebras Models
| Client Model Name | Actual Provider Model |
|---|---|
| `llama-3.3-70b-cerebras` | `llama-3.3-70b` |
## Fallback Behavior
new-api handles fallbacks via priority levels:
- When a model exists on multiple channels, the highest priority (lowest number) channel is tried first
- If that channel fails, the request automatically falls back to the next priority level
For example, `deepseek-v3.2` exists on:
1. DeepInfra (priority 1) — tried first
2. SiliconFlow (priority 2) — fallback
3. OpenRouter (priority 3) — last resort
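
The client never names a channel, so the same request works regardless of which provider ends up serving it (this is the test command `init-channels.sh` prints in its next steps):

```bash
curl http://<server>:4000/v1/chat/completions \
  -H 'Authorization: Bearer <token>' \
  -H 'Content-Type: application/json' \
  -d '{"model":"deepseek-v3.2","messages":[{"role":"user","content":"hi"}]}'
```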
## Grafana Setup
After first start, access Grafana at `http://<server>:3001`:
1. Log in with `admin` / `$GRAFANA_ADMIN_PASSWORD`
2. Add data source: **Prometheus** with URL `http://victoriametrics:8428`
3. Import dashboards:
- Node Exporter Full: dashboard ID `1860`
- Redis: dashboard ID `763`
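
Step 2 can also be scripted (a sketch using Grafana's standard data source API with the admin credentials above; adjust the host as needed):

```bash
curl -s -u "admin:${GRAFANA_ADMIN_PASSWORD}" \
  -X POST "http://<server>:3001/api/datasources" \
  -H "Content-Type: application/json" \
  -d '{
        "name": "VictoriaMetrics",
        "type": "prometheus",
        "url": "http://victoriametrics:8428",
        "access": "proxy",
        "isDefault": true
      }'
```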

new-api/init-channels.sh Executable file

@@ -0,0 +1,161 @@
#!/usr/bin/env bash
# Configures new-api channels and token via the admin API.
# Run once after first boot: ./new-api/init-channels.sh
#
# Requires these env vars (or .env file in project root):
#   NEW_API_ACCESS_TOKEN - admin access token (set via INITIAL_ROOT_ACCESS_TOKEN)
#   DEEPINFRA_API_KEY
#   SILICONFLOW_API_KEY
#   OPENROUTER_API_KEY
#   GROQ_API_KEY
#   CEREBRAS_API_KEY
#   OPENWEBUI_API_KEY - token for Open WebUI to authenticate with new-api
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
ENV_FILE="${SCRIPT_DIR}/../.env"
# Load .env if present
if [[ -f "$ENV_FILE" ]]; then
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi
API_BASE="${NEW_API_BASE:-http://localhost:4000}"
TOKEN="${NEW_API_ACCESS_TOKEN:?Set NEW_API_ACCESS_TOKEN (from INITIAL_ROOT_ACCESS_TOKEN)}"
# ── Helper ──────────────────────────────────────────────
create_channel() {
  local name="$1" type="$2" key="$3" base_url="$4" priority="$5" models="$6" model_mapping="$7"

  echo "Creating channel: ${name} (priority ${priority})..."

  local payload
  payload=$(python3 -c "
import json, sys
print(json.dumps({
    'type': int(sys.argv[1]),
    'name': sys.argv[2],
    'key': sys.argv[3],
    'base_url': sys.argv[4],
    'models': sys.argv[5],
    'model_mapping': sys.argv[6],
    'priority': int(sys.argv[7]),
    'status': 1,
    'group': 'default',
    'weight': 1,
    'auto_ban': 1
}))
" "$type" "$name" "$key" "$base_url" "$models" "$model_mapping" "$priority")

  local resp http_code body
  resp=$(curl -s -w "\n%{http_code}" \
    "${API_BASE}/api/channel/" \
    -H "Authorization: Bearer ${TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$payload")
  http_code=$(echo "$resp" | tail -1)
  body=$(echo "$resp" | sed '$d')

  if [[ "$http_code" == "200" ]]; then
    echo "  OK"
  else
    echo "  FAILED (HTTP ${http_code})"
    echo "  ${body}" | head -c 500
    echo
  fi
}
# Wait for new-api to be ready
echo "Waiting for new-api at ${API_BASE}..."
for i in $(seq 1 30); do
  if curl -sf "${API_BASE}/api/status" > /dev/null 2>&1; then
    echo "new-api is ready."
    break
  fi
  if [[ "$i" == "30" ]]; then
    echo "ERROR: new-api did not become ready in time."
    exit 1
  fi
  sleep 2
done
# ── Channel: DeepInfra (priority 1) ────────────────────
create_channel "DeepInfra" 1 \
  "${DEEPINFRA_API_KEY:?}" \
  "https://api.deepinfra.com/v1/openai" \
  1 \
  "deepseek-v3.2,deepseek-r1,gpt-oss,gpt-oss-20b,nemotron-super,nemotron-nano,devstral,glm-4.6,glm-4.7,glm-5,kimi-k2,kimi-k2.5" \
  '{"deepseek-v3.2":"deepseek-ai/DeepSeek-V3.2","deepseek-r1":"deepseek-ai/DeepSeek-R1","gpt-oss":"openai/gpt-oss-120b","gpt-oss-20b":"openai/gpt-oss-20b","nemotron-super":"nvidia/Llama-3.3-Nemotron-Super-49B-v1.5","nemotron-nano":"nvidia/NVIDIA-Nemotron-Nano-9B-v2","devstral":"mistralai/Devstral-Small-2505","glm-4.6":"zai-org/GLM-4.6","glm-4.7":"zai-org/GLM-4.7","glm-5":"zai-org/GLM-5","kimi-k2":"moonshotai/Kimi-K2-Instruct-0905","kimi-k2.5":"moonshotai/Kimi-K2.5"}'

# ── Channel: SiliconFlow (priority 2) ──────────────────
create_channel "SiliconFlow" 1 \
  "${SILICONFLOW_API_KEY:?}" \
  "https://api.siliconflow.com/v1" \
  2 \
  "deepseek-v3.2,glm-4.7,kimi-k2,qwen3-coder,qwen3-coder-30b" \
  '{"deepseek-v3.2":"deepseek-ai/DeepSeek-V3.2","glm-4.7":"THUDM/GLM-4-32B-0414","kimi-k2":"moonshotai/Kimi-K2-Instruct-0905","qwen3-coder":"Qwen/Qwen3-Coder-480B-A35B-Instruct","qwen3-coder-30b":"Qwen/Qwen3-Coder-30B-A3B-Instruct"}'

# ── Channel: OpenRouter (priority 3) ───────────────────
create_channel "OpenRouter" 1 \
  "${OPENROUTER_API_KEY:?}" \
  "https://openrouter.ai/api/v1" \
  3 \
  "deepseek-v3.2,deepseek-v3-free,kimi-k2.5,minimax-m2.5,gpt-4.1-mini,gpt-4.1,gemini-3-flash-preview,gemini-2.5-pro,claude-sonnet,trinity-large-preview" \
  '{"deepseek-v3.2":"deepseek/deepseek-chat-v3-0324","deepseek-v3-free":"deepseek/deepseek-chat-v3-0324:free","kimi-k2.5":"moonshotai/kimi-k2.5","minimax-m2.5":"minimax/minimax-m2.5","gpt-4.1-mini":"openai/gpt-4.1-mini","gpt-4.1":"openai/gpt-4.1","gemini-3-flash-preview":"google/gemini-3-flash-preview","gemini-2.5-pro":"google/gemini-2.5-pro-preview","claude-sonnet":"anthropic/claude-sonnet-4","trinity-large-preview":"arcee-ai/trinity-large-preview"}'

# ── Channel: Groq (priority 1) ─────────────────────────
create_channel "Groq" 1 \
  "${GROQ_API_KEY:?}" \
  "https://api.groq.com/openai/v1" \
  1 \
  "llama-3.3-70b" \
  '{"llama-3.3-70b":"llama-3.3-70b-versatile"}'

# ── Channel: Cerebras (priority 1) ─────────────────────
create_channel "Cerebras" 1 \
  "${CEREBRAS_API_KEY:?}" \
  "https://api.cerebras.ai/v1" \
  1 \
  "llama-3.3-70b-cerebras" \
  '{"llama-3.3-70b-cerebras":"llama-3.3-70b"}'
# ── Create API token for Open WebUI ────────────────────
if [[ -n "${OPENWEBUI_API_KEY:-}" ]]; then
  echo ""
  echo "Creating API token for Open WebUI..."
  TOKEN_RESP=$(curl -s "${API_BASE}/api/token/" \
    -H "Authorization: Bearer ${TOKEN}" \
    -H "Content-Type: application/json" \
    -d "$(python3 -c "
import json
print(json.dumps({
    'name': 'open-webui',
    'remain_quota': 0,
    'unlimited_quota': True
}))
")")
  echo "Token response: ${TOKEN_RESP}" | head -c 500
  echo ""
  echo ""
  echo "NOTE: Use the token 'key' from the response above as OPENAI_API_KEY in Open WebUI."
  echo "      Or create a token manually in the new-api UI."
fi
echo ""
echo "══════════════════════════════════════"
echo "Channel setup complete!"
echo ""
echo "Next steps:"
echo " 1. Verify channels at ${API_BASE} (login: root / 123456 — CHANGE THIS)"
echo " 2. Test a model:"
echo " curl ${API_BASE}/v1/chat/completions \\"
echo " -H 'Authorization: Bearer <token>' \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"model\":\"deepseek-v3.2\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}'"
echo " 3. Check Open WebUI can see models"
echo "══════════════════════════════════════"


@@ -1,7 +1,7 @@
 use_default_settings: true
 
 general:
-  instance_name: "SearXNG"
+  instance_name: "SearRST"
   privacypolicy_url: false
   donation_url: false
   enable_metrics: false