fix: resolve all critical runtime errors and bugs from audit

- Add COMPLETIONS_API_KEY to config.py (env var + auto-generated fallback)
- Fix perplexity auto-search: upstream sends logprobs=true, parse_llama_stream_chunk extracts per-token logprobs, all_logprobs populated during streaming
- Fix all /api/models endpoints to target LLAMA_SERVER_BASE (port 8081) not OLLAMA_BASE
- Fix RAG embedding endpoint URL from port 11434 (Ollama) to 8081 (llama-server)
- Correct misleading error messages: 'inference server' not 'Ollama'
- Remove raw_results leak from SSE event stream in /api/search
- Fix weather query extractor: pattern-match instead of unconditional suffix append
- Escape FTS5 operator keywords (AND/OR/NOT/NEAR) in memory search
- Move auth.py BODY_LIMIT_DEFAULT_BYTES imports to module level
- Change RAG injection log level from warning to info
- Fix all 8 test files after modular refactor (rewire imports from correct modules)
- Update AGENTS.md and README.md to reflect v1.8.0 changes
2026-06-27 15:10:32 -07:00
parent 41a8708c0d
commit 193829b7ff
20 changed files with 457 additions and 896 deletions
--- a/routers/models.py
+++ b/routers/models.py
@@ -8,7 +8,7 @@ import httpx
 import psutil
 from fastapi import APIRouter, HTTPException, Request

-from config import OLLAMA_BASE
+from config import LLAMA_SERVER_BASE
 from gpu import get_gpu_stats
 from security import read_json_body, BODY_LIMIT_DEFAULT_BYTES

@@ -20,34 +20,33 @@ router = APIRouter()
 async def list_models():
    async with httpx.AsyncClient() as client:
        try:
-            resp = await client.get(f"{OLLAMA_BASE}/v1/models", timeout=10)
+            resp = await client.get(f"{LLAMA_SERVER_BASE}/v1/models", timeout=10)
            data = resp.json()
            models = [{"name": m["id"], "model": m["id"]} for m in data.get("data", [])]
            return {"models": models}
        except httpx.ConnectError:
-            raise HTTPException(status_code=502, detail="Cannot connect to llama-server.")
+            raise HTTPException(status_code=502, detail="Cannot connect to inference server.")


@router.get("/api/ps")
 async def running_models():
    async with httpx.AsyncClient() as client:
        try:
-            resp = await client.get(f"{OLLAMA_BASE}/api/ps", timeout=10)
+            resp = await client.get(f"{LLAMA_SERVER_BASE}/v1/models", timeout=10)
            return resp.json()
        except httpx.ConnectError:
-            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
+            raise HTTPException(status_code=502, detail="Cannot connect to inference server.")


@router.post("/api/show")
 async def show_model(request: Request):
-    from security import BODY_LIMIT_DEFAULT_BYTES
    body = await read_json_body(request, BODY_LIMIT_DEFAULT_BYTES)
    async with httpx.AsyncClient() as client:
        try:
-            resp = await client.post(f"{OLLAMA_BASE}/api/show", json=body, timeout=10)
+            resp = await client.post(f"{LLAMA_SERVER_BASE}/api/show", json=body, timeout=10)
            return resp.json()
        except httpx.ConnectError:
-            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
+            raise HTTPException(status_code=502, detail="Cannot connect to inference server.")


@router.get("/api/stats")