diff --git a/litellm/config.yaml b/litellm/config.yaml
index 9cbb146..2c235e4 100644
--- a/litellm/config.yaml
+++ b/litellm/config.yaml
@@ -1,18 +1,104 @@
 model_list:
-  # --- OpenRouter models ---
+  # ═══════════════════════════════════════════════
+  # TIER 1: Free providers (try first)
+  # ═══════════════════════════════════════════════
+
+  # --- Groq (free tier, very fast) ---
+  - model_name: llama-3.3-70b
+    litellm_params:
+      model: groq/llama-3.3-70b-versatile
+      api_key: os.environ/GROQ_API_KEY
+
+  # --- Cerebras (free tier, very fast) ---
+  - model_name: llama-3.3-70b-cerebras
+    litellm_params:
+      model: cerebras/llama-3.3-70b
+      api_key: os.environ/CEREBRAS_API_KEY
+
+  # --- OpenRouter free models ---
+  - model_name: deepseek-v3-free
+    litellm_params:
+      model: openrouter/deepseek/deepseek-chat-v3-0324:free
+      api_key: os.environ/OPENROUTER_API_KEY
+
+  # ═══════════════════════════════════════════════
+  # TIER 2: SiliconFlow (cheapest paid, ~3-5x cheaper than OpenRouter)
+  # ═══════════════════════════════════════════════
+
+  # DeepSeek V3 — best value daily driver ($0.13 in / $0.28 out per M)
+  - model_name: deepseek-v3
+    litellm_params:
+      model: openai/deepseek-ai/DeepSeek-V3-0324
+      api_base: https://api.siliconflow.com/v1
+      api_key: os.environ/SILICONFLOW_API_KEY
+
+  # NOTE(review): "deepseek-v3.2" still serves DeepSeek-V3-0324 (same as deepseek-v3 above) — update model ID if a real V3.2 is intended
+  - model_name: deepseek-v3.2
+    litellm_params:
+      model: openai/deepseek-ai/DeepSeek-V3-0324
+      api_base: https://api.siliconflow.com/v1
+      api_key: os.environ/SILICONFLOW_API_KEY
+
+  # NOTE(review): "glm-4.7" alias actually serves GLM-4-32B-0414, not a GLM-4.7 model
+  - model_name: glm-4.7
+    litellm_params:
+      model: openai/THUDM/GLM-4-32B-0414
+      api_base: https://api.siliconflow.com/v1
+      api_key: os.environ/SILICONFLOW_API_KEY
+
+  # Qwen3 Coder 480B MoE via SiliconFlow ($1.14 in / $2.28 out per M)
+  - model_name: qwen3-coder
+    litellm_params:
+      model: openai/Qwen/Qwen3-Coder-480B-A35B-Instruct
+      api_base: https://api.siliconflow.com/v1
+      api_key: os.environ/SILICONFLOW_API_KEY
+
+  # Qwen3 Coder 30B — cheaper alternative for simpler tasks
+  - model_name: qwen3-coder-30b
+    litellm_params:
+      model: openai/Qwen/Qwen3-Coder-30B-A3B-Instruct
+      api_base: https://api.siliconflow.com/v1
+      api_key: os.environ/SILICONFLOW_API_KEY
+
+  # ═══════════════════════════════════════════════
+  # TIER 3: DeepInfra (good mid-range pricing)
+  # ═══════════════════════════════════════════════
+
+  # DeepSeek V3 fallback (if SiliconFlow is down)
+  - model_name: deepseek-v3
+    litellm_params:
+      model: deepinfra/deepseek-ai/DeepSeek-V3-0324
+      api_key: os.environ/DEEPINFRA_API_KEY
+
+  - model_name: deepseek-r1
+    litellm_params:
+      model: deepinfra/deepseek-ai/DeepSeek-R1
+      api_key: os.environ/DEEPINFRA_API_KEY
+
+  - model_name: devstral
+    litellm_params:
+      model: deepinfra/mistralai/Devstral-Small-2505
+      api_key: os.environ/DEEPINFRA_API_KEY
+
+  # ═══════════════════════════════════════════════
+  # TIER 4: Kimi (DeepInfra) + OpenRouter (most expensive, widest selection)
+  # ═══════════════════════════════════════════════
+
+  # Kimi K2.5 — DeepInfra is cheapest ($0.45 in / $2.25 out per M)
+  - model_name: kimi-k2.5
+    litellm_params:
+      model: deepinfra/moonshotai/Kimi-K2.5
+      api_key: os.environ/DEEPINFRA_API_KEY
+
+  # Kimi K2.5 fallback via OpenRouter
   - model_name: kimi-k2.5
     litellm_params:
       model: openrouter/moonshotai/kimi-k2.5
       api_key: os.environ/OPENROUTER_API_KEY
 
-  - model_name: devstral
+  - model_name: minimax-m2.5
     litellm_params:
-      model: openrouter/mistralai/devstral-small
-      api_key: os.environ/OPENROUTER_API_KEY
-
-  - model_name: minimax-m2
-    litellm_params:
-      model: openrouter/minimax/minimax-m1
+      model: openrouter/minimax/minimax-m2.5
       api_key: os.environ/OPENROUTER_API_KEY
 
   - model_name: gpt-oss
@@ -20,7 +106,17 @@ model_list:
       model: openrouter/openai/gpt-4.1-mini
       api_key: os.environ/OPENROUTER_API_KEY
 
-  # --- OpenRouter models (writing) ---
+  - model_name: gemini-3-flash-preview
+    litellm_params:
+      model: openrouter/google/gemini-3-flash-preview
+      api_key: os.environ/OPENROUTER_API_KEY
+
+  - model_name: trinity-large-preview
+    litellm_params:
+      model: openrouter/arcee-ai/trinity-large-preview
+      api_key: os.environ/OPENROUTER_API_KEY
+
+  # --- OpenRouter premium models ---
   - model_name: gemini-2.5-pro
     litellm_params:
       model: openrouter/google/gemini-2.5-pro-preview
@@ -36,66 +132,29 @@ model_list:
       model: openrouter/openai/gpt-4.1
       api_key: os.environ/OPENROUTER_API_KEY
 
+  # DeepSeek V3 last-resort fallback via OpenRouter
   - model_name: deepseek-v3
     litellm_params:
-      model: openrouter/deepseek/deepseek-chat-v3-0324:free
+      model: openrouter/deepseek/deepseek-chat-v3-0324
       api_key: os.environ/OPENROUTER_API_KEY
 
-  # --- SiliconFlow models ---
-  - model_name: glm-4.7
-    litellm_params:
-      model: openai/THUDM/GLM-4-32B-0414
-      api_base: https://api.siliconflow.com/v1
-      api_key: os.environ/SILICONFLOW_API_KEY
-
-  - model_name: qwen3-coder
-    litellm_params:
-      model: openrouter/qwen/qwen3-coder
-      api_key: os.environ/OPENROUTER_API_KEY
-
-  - model_name: qwen3-coder-480b-sf
-    litellm_params:
-      model: openai/Qwen/Qwen3-Coder-480B-A35B-Instruct
-      api_base: https://api.siliconflow.com/v1
-      api_key: os.environ/SILICONFLOW_API_KEY
-
-  - model_name: qwen3-coder-30b-sf
-    litellm_params:
-      model: openai/Qwen/Qwen3-Coder-30B-A3B-Instruct
-      api_base: https://api.siliconflow.com/v1
-      api_key: os.environ/SILICONFLOW_API_KEY
-
-  # --- DeepInfra models ---
-  - model_name: deepseek-v3.2
-    litellm_params:
-      model: deepinfra/deepseek-ai/DeepSeek-V3-0324
-      api_key: os.environ/DEEPINFRA_API_KEY
-
-  - model_name: devstral-deepinfra
-    litellm_params:
-      model: deepinfra/mistralai/Devstral-Small-2505
-      api_key: os.environ/DEEPINFRA_API_KEY
-
-  - model_name: deepseek-r1
-    litellm_params:
-      model: deepinfra/deepseek-ai/DeepSeek-R1
-      api_key: os.environ/DEEPINFRA_API_KEY
-
-  # --- Groq (free/fast) ---
-  - model_name: llama-3.3-70b
-    litellm_params:
-      model: groq/llama-3.3-70b-versatile
-      api_key: os.environ/GROQ_API_KEY
-
-  # --- Cerebras (free/fast) ---
-  - model_name: llama-3.3-70b-cerebras
-    litellm_params:
-      model: cerebras/llama-3.3-70b
-      api_key: os.environ/CEREBRAS_API_KEY
-
 general_settings:
   master_key: os.environ/LITELLM_MASTER_KEY
 
 litellm_settings:
   drop_params: true
   set_verbose: false
+  num_retries: 2
+  request_timeout: 600
+
+  # ── Response caching via Valkey (reuses SearXNG's instance) ──
+  cache: true
+  cache_params:
+    type: redis
+    host: valkey
+    port: 6379
+    ttl: 3600
+
+  # ── Budget limit: $3/day to prevent surprise bills ──
+  max_budget: 3.0
+  budget_duration: "1d"