---
# ai-servers/llm-gateway.yaml
# 314 lines, 8.7 KiB, YAML
# HTTP server settings for the gateway itself.
server:
  listen: "0.0.0.0:3000"
  request_timeout: "300s"
  max_request_body_mb: 10
  session_secret: "${SESSION_SECRET}"
  # NOTE(review): original indentation was lost; default_admin is assumed to be
  # nested under server — confirm against the gateway's config schema.
  default_admin:
    username: "${ADMIN_USERNAME}"
    password: "${ADMIN_PASSWORD}"
# API tokens accepted by the gateway.
# NOTE(review): assumed top-level (original indentation was lost) — confirm.
tokens:
  - name: "open-webui"
    key: "${OPENWEBUI_API_KEY}"
    rate_limit_rpm: 0  # unlimited
    daily_budget_usd: 5.0
  - name: "rayandrew"
    key: "${PERSONAL_API_KEY}"
    rate_limit_rpm: 0  # unlimited
    daily_budget_usd: 10.0
# External price-table lookup used when a route has no inline pricing.
pricing_lookup:
  # url: "https://raw.githubusercontent.com/pydantic/genai-prices/main/prices/data_slim.json"  # default
  refresh_interval: "6h"
# Local persistence (usage/accounting records, pruned after retention_days).
database:
  path: "/data/gateway.db"
  retention_days: 90
# Response cache backed by Valkey/Redis.
cache:
  enabled: true
  address: "valkey:6379"
  ttl: 3600  # presumably seconds — confirm units with the gateway docs
providers:
- name: deepinfra
base_url: "https://api.deepinfra.com/v1/openai"
api_key: "${DEEPINFRA_API_KEY}"
priority: 1
timeout: 120s
- name: siliconflow
base_url: "https://api.siliconflow.com/v1"
api_key: "${SILICONFLOW_API_KEY}"
priority: 2
timeout: 120s
- name: openrouter
base_url: "https://openrouter.ai/api/v1"
api_key: "${OPENROUTER_API_KEY}"
priority: 3
timeout: 120s
- name: groq
base_url: "https://api.groq.com/openai/v1"
api_key: "${GROQ_API_KEY}"
priority: 1
timeout: 120s
- name: cerebras
base_url: "https://api.cerebras.ai/v1"
api_key: "${CEREBRAS_API_KEY}"
priority: 1
timeout: 120s
# Model aliases exposed by the gateway; each routes to one or more providers.
# Inline pricing values are assumed to be USD per 1M tokens — confirm; routes
# without pricing presumably fall back to pricing_lookup.
models:
  # ═══ TIER 1: Free (OpenRouter free models, $0) ═══
  # NOTE: Commented out — free models are heavily rate-limited upstream.
  # Uncomment if you want best-effort free access.
  # - name: "llama-3.3-70b-free"
  #   routes:
  #     - provider: openrouter
  #       model: "meta-llama/llama-3.3-70b-instruct:free"
  # - name: "deepseek-r1-free"
  #   routes:
  #     - provider: openrouter
  #       model: "deepseek/deepseek-r1-0528:free"
  # - name: "gpt-oss-free"
  #   routes:
  #     - provider: openrouter
  #       model: "openai/gpt-oss-120b:free"
  # - name: "gpt-oss-20b-free"
  #   routes:
  #     - provider: openrouter
  #       model: "openai/gpt-oss-20b:free"
  # - name: "qwen3-coder-free"
  #   routes:
  #     - provider: openrouter
  #       model: "qwen/qwen3-coder:free"
  # - name: "qwen3-235b-free"
  #   routes:
  #     - provider: openrouter
  #       model: "qwen/qwen3-235b-a22b-thinking-2507"
  # - name: "glm-4.5-air-free"
  #   routes:
  #     - provider: openrouter
  #       model: "z-ai/glm-4.5-air:free"
  # - name: "nemotron-nano-free"
  #   routes:
  #     - provider: openrouter
  #       model: "nvidia/nemotron-nano-9b-v2:free"
  # - name: "trinity-large-free"
  #   routes:
  #     - provider: openrouter
  #       model: "arcee-ai/trinity-large-preview:free"
  # - name: "mistral-small-free"
  #   routes:
  #     - provider: openrouter
  #       model: "mistralai/mistral-small-3.1-24b-instruct:free"
  # - name: "gemma-3-27b-free"
  #   routes:
  #     - provider: openrouter
  #       model: "google/gemma-3-27b-it:free"
  # - name: "step-3.5-flash-free"
  #   routes:
  #     - provider: openrouter
  #       model: "stepfun/step-3.5-flash:free"
# ═══ TIER 2: Low cost (Groq, Cerebras — free tier with rate limits) ═══
- name: "llama-3.3-70b"
routes:
- provider: groq
model: "llama-3.3-70b-versatile"
pricing: { input: 0.59, output: 0.79 }
- provider: deepinfra
model: "meta-llama/Llama-3.3-70B-Instruct-Turbo"
pricing: { input: 0.23, output: 0.40 }
- provider: cerebras
model: "llama-3.3-70b"
pricing: { input: 0.85, output: 1.20 }
- name: "llama-3.1-8b"
routes:
- provider: groq
model: "llama-3.1-8b-instant"
pricing: { input: 0.05, output: 0.08 }
- provider: cerebras
model: "llama3.1-8b"
pricing: { input: 0.10, output: 0.10 }
- provider: deepinfra
model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
pricing: { input: 0.03, output: 0.05 }
- name: "gpt-oss"
routes:
- provider: groq
model: "openai/gpt-oss-120b"
pricing: { input: 0.15, output: 0.60 }
- provider: cerebras
model: "gpt-oss-120b"
pricing: { input: 0.35, output: 0.75 }
- provider: deepinfra
model: "openai/gpt-oss-120b"
pricing: { input: 0.05, output: 0.24 }
- name: "gpt-oss-20b"
routes:
- provider: groq
model: "openai/gpt-oss-20b"
pricing: { input: 0.075, output: 0.30 }
- provider: deepinfra
model: "openai/gpt-oss-20b"
pricing: { input: 0.04, output: 0.16 }
- name: "llama-4-scout"
routes:
- provider: groq
model: "meta-llama/llama-4-scout-17b-16e-instruct"
pricing: { input: 0.11, output: 0.34 }
- name: "llama-4-maverick"
routes:
- provider: groq
model: "meta-llama/llama-4-maverick-17b-128e-instruct"
pricing: { input: 0.20, output: 0.60 }
- name: "qwen3-32b"
routes:
- provider: groq
model: "qwen/qwen3-32b"
pricing: { input: 0.29, output: 0.59 }
- provider: cerebras
model: "qwen-3-32b"
# ═══ TIER 3: DeepSeek V3.2 (cheapest flagship) ═══
- name: "deepseek-v3.2"
routes:
- provider: deepinfra
model: "deepseek-ai/DeepSeek-V3.2"
pricing: { input: 0.26, output: 0.38 }
- provider: siliconflow
model: "deepseek-ai/DeepSeek-V3.2"
pricing: { input: 0.27, output: 0.42 }
- provider: openrouter
model: "deepseek/deepseek-chat-v3-0324"
pricing: { input: 0.30, output: 0.88 }
# ═══ TIER 4: Ultra-cheap DeepInfra ═══
- name: "nemotron-super"
routes:
- provider: deepinfra
model: "nvidia/Llama-3.3-Nemotron-Super-49B-v1.5"
pricing: { input: 0.10, output: 0.40 }
- name: "nemotron-nano"
routes:
- provider: deepinfra
model: "nvidia/NVIDIA-Nemotron-Nano-9B-v2"
pricing: { input: 0.04, output: 0.16 }
# ═══ TIER 5: DeepSeek R1 & reasoning ═══
- name: "deepseek-r1"
routes:
- provider: deepinfra
model: "deepseek-ai/DeepSeek-R1-0528"
- provider: openrouter
model: "deepseek/deepseek-r1"
- name: "deepseek-r1-distill-llama-70b"
routes:
- provider: deepinfra
model: "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"
- name: "devstral-small"
routes:
- provider: openrouter
model: "mistralai/devstral-small"
- name: "devstral-medium"
routes:
- provider: openrouter
model: "mistralai/devstral-medium"
# ═══ TIER 6: GLM ═══
- name: "glm-4.6"
routes:
- provider: deepinfra
model: "zai-org/GLM-4.6"
pricing: { input: 0.60, output: 1.90 }
- name: "glm-4.7"
routes:
- provider: deepinfra
model: "zai-org/GLM-4.7"
pricing: { input: 0.40, output: 1.75 }
- provider: cerebras
model: "zai-glm-4.7"
pricing: { input: 2.25, output: 2.75 }
- provider: siliconflow
model: "THUDM/GLM-4-32B-0414"
- name: "glm-5"
routes:
- provider: deepinfra
model: "zai-org/GLM-5"
pricing: { input: 0.80, output: 2.56 }
# ═══ TIER 7: Kimi ═══
- name: "kimi-k2"
routes:
- provider: groq
model: "moonshotai/kimi-k2-instruct-0905"
pricing: { input: 1.00, output: 3.00 }
- provider: deepinfra
model: "moonshotai/Kimi-K2-Instruct-0905"
pricing: { input: 0.50, output: 2.00 }
- provider: siliconflow
model: "moonshotai/Kimi-K2-Instruct-0905"
pricing: { input: 0.58, output: 2.29 }
- name: "kimi-k2.5"
routes:
- provider: deepinfra
model: "moonshotai/Kimi-K2.5"
pricing: { input: 0.45, output: 2.25 }
- provider: openrouter
model: "moonshotai/kimi-k2.5"
# ═══ TIER 8: SiliconFlow (Qwen) ═══
- name: "qwen3-coder"
routes:
- provider: siliconflow
model: "Qwen/Qwen3-Coder-480B-A35B-Instruct"
pricing: { input: 1.14, output: 2.28 }
- name: "qwen3-coder-30b"
routes:
- provider: siliconflow
model: "Qwen/Qwen3-Coder-30B-A3B-Instruct"
# ═══ TIER 9: OpenRouter premium (paid) ═══
- name: "minimax-m2.5"
routes:
- provider: openrouter
model: "minimax/minimax-m2.5"
- name: "gpt-4.1-mini"
routes:
- provider: openrouter
model: "openai/gpt-4.1-mini"
- name: "gpt-4.1"
routes:
- provider: openrouter
model: "openai/gpt-4.1"
- name: "gemini-3-flash-preview"
routes:
- provider: openrouter
model: "google/gemini-3-flash-preview"
- name: "gemini-2.5-pro"
routes:
- provider: openrouter
model: "google/gemini-2.5-pro-preview"
- name: "claude-sonnet"
routes:
- provider: openrouter
model: "anthropic/claude-sonnet-4"