fix: resolve all critical runtime errors and bugs from audit
- Add COMPLETIONS_API_KEY to config.py (env var + auto-generated fallback)
- Fix perplexity auto-search: upstream sends logprobs=true, parse_llama_stream_chunk extracts per-token logprobs, all_logprobs populated during streaming
- Fix all /api/models endpoints to target LLAMA_SERVER_BASE (port 8081) not OLLAMA_BASE
- Fix RAG embedding endpoint URL from port 11434 (Ollama) to 8081 (llama-server)
- Correct misleading error messages: 'inference server' not 'Ollama'
- Remove raw_results leak from SSE event stream in /api/search
- Fix weather query extractor: pattern-match instead of unconditional suffix append
- Escape FTS5 operator keywords (AND/OR/NOT/NEAR) in memory search
- Move auth.py BODY_LIMIT_DEFAULT_BYTES imports to module level
- Change RAG injection log level from warning to info
- Fix all 8 test files after modular refactor (rewire imports from correct modules)
- Update AGENTS.md and README.md to reflect v1.8.0 changes
This commit is contained in:
@@ -26,7 +26,7 @@ def parse_llama_stream_chunk(line: str) -> tuple:
|
||||
if line.startswith("data: "):
|
||||
line = line[6:]
|
||||
if line.strip() == "[DONE]":
|
||||
return None, True, {}
|
||||
return None, True, {}, []
|
||||
try:
|
||||
chunk = json.loads(line)
|
||||
choices = chunk.get("choices", [])
|
||||
@@ -35,10 +35,17 @@ def parse_llama_stream_chunk(line: str) -> tuple:
|
||||
token = delta.get("content")
|
||||
finish = choices[0].get("finish_reason")
|
||||
stats = {}
|
||||
logprobs_list = []
|
||||
logprobs_info = choices[0].get("logprobs")
|
||||
if logprobs_info:
|
||||
content_logprobs = logprobs_info.get("content", [])
|
||||
for entry in content_logprobs:
|
||||
if "logprob" in entry:
|
||||
logprobs_list.append({"logprob": entry["logprob"]})
|
||||
if finish == "stop":
|
||||
usage = chunk.get("usage", {})
|
||||
stats["tokens_per_sec"] = usage.get("tokens_per_second", 0.0)
|
||||
return token, finish == "stop", stats
|
||||
return token, finish == "stop", stats, logprobs_list
|
||||
if "message" in chunk and "content" in chunk["message"]:
|
||||
token = chunk["message"]["content"]
|
||||
done = chunk.get("done", False)
|
||||
@@ -47,10 +54,10 @@ def parse_llama_stream_chunk(line: str) -> tuple:
|
||||
eval_count = chunk.get("eval_count", 0)
|
||||
eval_duration = chunk.get("eval_duration", 0)
|
||||
stats["tokens_per_sec"] = (eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0
|
||||
return token, done, stats
|
||||
return token, done, stats, []
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return None, False, {}
|
||||
return None, False, {}, []
|
||||
|
||||
|
||||
@router.post("/api/chat")
|
||||
@@ -97,7 +104,7 @@ async def chat(request: Request):
|
||||
for row in history_rows:
|
||||
messages.append({"role": row["role"], "content": row["content"]})
|
||||
|
||||
ollama_payload = {"model": model, "messages": messages, "stream": True}
|
||||
upstream_payload = {"model": model, "messages": messages, "stream": True, "logprobs": True}
|
||||
|
||||
async def stream_response():
|
||||
full_response = []
|
||||
@@ -111,12 +118,14 @@ async def chat(request: Request):
|
||||
try:
|
||||
async with client.stream(
|
||||
"POST", f"{LLAMA_SERVER_BASE}/v1/chat/completions",
|
||||
json=ollama_payload,
|
||||
json=upstream_payload,
|
||||
timeout=httpx.Timeout(300.0, connect=10.0),
|
||||
) as resp:
|
||||
async for line in resp.aiter_lines():
|
||||
if line.strip():
|
||||
token, done, stats = parse_llama_stream_chunk(line)
|
||||
token, done, stats, chunk_logprobs = parse_llama_stream_chunk(line)
|
||||
if chunk_logprobs:
|
||||
all_logprobs.extend(chunk_logprobs)
|
||||
if token:
|
||||
full_response.append(token)
|
||||
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
|
||||
@@ -153,7 +162,7 @@ async def chat(request: Request):
|
||||
) as resp2:
|
||||
async for line in resp2.aiter_lines():
|
||||
if line.strip():
|
||||
token2, done2, _ = parse_llama_stream_chunk(line)
|
||||
token2, done2, _, _ = parse_llama_stream_chunk(line)
|
||||
if token2:
|
||||
augmented_response.append(token2)
|
||||
if done2:
|
||||
@@ -194,9 +203,9 @@ async def chat(request: Request):
|
||||
except httpx.RemoteProtocolError:
|
||||
pass
|
||||
except httpx.ConnectError:
|
||||
yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n"
|
||||
yield f"data: {json.dumps({'error': 'Cannot connect to inference server. Is it running?'})}\n\n"
|
||||
except Exception as e:
|
||||
incident_key = log_incident("chat_stream", message="Ollama stream failure during chat response",
|
||||
incident_key = log_incident("chat_stream", message="Inference stream failure during chat response",
|
||||
request=request, exc=e)
|
||||
yield f"data: {json.dumps({'error': 'Chat response generation failed before completion. Use the incident key for support lookup.', 'error_key': incident_key})}\n\n"
|
||||
|
||||
|
||||
@@ -178,7 +178,7 @@ async def _stream_chat(payload: dict, model: str, conv_id: str, request: Request
|
||||
async for line in resp.aiter_lines():
|
||||
if not line.strip():
|
||||
continue
|
||||
token, done, _ = parse_llama_stream_chunk(line)
|
||||
token, done, _, _ = parse_llama_stream_chunk(line)
|
||||
if token:
|
||||
full_response.append(token)
|
||||
yield _build_openai_chunk(token, model, conv_id)
|
||||
@@ -222,7 +222,7 @@ async def _blocking_chat(payload: dict, model: str, conv_id: str, request: Reque
|
||||
async for line in resp.aiter_lines():
|
||||
if not line.strip():
|
||||
continue
|
||||
token, done, _ = parse_llama_stream_chunk(line)
|
||||
token, done, _, _ = parse_llama_stream_chunk(line)
|
||||
if token:
|
||||
full_response.append(token)
|
||||
if done:
|
||||
|
||||
@@ -8,7 +8,7 @@ import httpx
|
||||
import psutil
|
||||
from fastapi import APIRouter, HTTPException, Request
|
||||
|
||||
from config import OLLAMA_BASE
|
||||
from config import LLAMA_SERVER_BASE
|
||||
from gpu import get_gpu_stats
|
||||
from security import read_json_body, BODY_LIMIT_DEFAULT_BYTES
|
||||
|
||||
@@ -20,34 +20,33 @@ router = APIRouter()
|
||||
async def list_models():
|
||||
async with httpx.AsyncClient() as client:
|
||||
try:
|
||||
resp = await client.get(f"{OLLAMA_BASE}/v1/models", timeout=10)
|
||||
resp = await client.get(f"{LLAMA_SERVER_BASE}/v1/models", timeout=10)
|
||||
data = resp.json()
|
||||
models = [{"name": m["id"], "model": m["id"]} for m in data.get("data", [])]
|
||||
return {"models": models}
|
||||
except httpx.ConnectError:
|
||||
raise HTTPException(status_code=502, detail="Cannot connect to llama-server.")
|
||||
raise HTTPException(status_code=502, detail="Cannot connect to inference server.")
|
||||
|
||||
|
||||
@router.get("/api/ps")
|
||||
async def running_models():
|
||||
async with httpx.AsyncClient() as client:
|
||||
try:
|
||||
resp = await client.get(f"{OLLAMA_BASE}/api/ps", timeout=10)
|
||||
resp = await client.get(f"{LLAMA_SERVER_BASE}/v1/models", timeout=10)
|
||||
return resp.json()
|
||||
except httpx.ConnectError:
|
||||
raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
|
||||
raise HTTPException(status_code=502, detail="Cannot connect to inference server.")
|
||||
|
||||
|
||||
@router.post("/api/show")
|
||||
async def show_model(request: Request):
|
||||
from security import BODY_LIMIT_DEFAULT_BYTES
|
||||
body = await read_json_body(request, BODY_LIMIT_DEFAULT_BYTES)
|
||||
async with httpx.AsyncClient() as client:
|
||||
try:
|
||||
resp = await client.post(f"{OLLAMA_BASE}/api/show", json=body, timeout=10)
|
||||
resp = await client.post(f"{LLAMA_SERVER_BASE}/api/show", json=body, timeout=10)
|
||||
return resp.json()
|
||||
except httpx.ConnectError:
|
||||
raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
|
||||
raise HTTPException(status_code=502, detail="Cannot connect to inference server.")
|
||||
|
||||
|
||||
@router.get("/api/stats")
|
||||
|
||||
@@ -35,14 +35,14 @@ async def explicit_search(request: Request):
|
||||
|
||||
if not conv_id:
|
||||
conv_id = str(uuid.uuid4())
|
||||
title = f"🔍 {query[:70]}..." if len(query) > 70 else f"🔍 {query}"
|
||||
title = query[:70] + "..." if len(query) > 70 else query
|
||||
db.execute("INSERT INTO conversations (id, title, model, created_at, updated_at) VALUES (?, ?, ?, ?, ?)",
|
||||
(conv_id, title, model, now, now))
|
||||
else:
|
||||
db.execute("UPDATE conversations SET updated_at = ? WHERE id = ?", (now, conv_id))
|
||||
|
||||
db.execute("INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)",
|
||||
(conv_id, "user", f"🔍 {query}", now))
|
||||
(conv_id, "user", query, now))
|
||||
db.commit()
|
||||
db.close()
|
||||
|
||||
@@ -80,7 +80,7 @@ async def explicit_search(request: Request):
|
||||
) as resp:
|
||||
async for line in resp.aiter_lines():
|
||||
if line.strip():
|
||||
token, done, _ = parse_llama_stream_chunk(line)
|
||||
token, done, _, _ = parse_llama_stream_chunk(line)
|
||||
if token:
|
||||
full_response.append(token)
|
||||
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
|
||||
@@ -102,7 +102,6 @@ async def explicit_search(request: Request):
|
||||
db2.commit()
|
||||
db2.close()
|
||||
|
||||
yield f"data: {json.dumps({'raw_results': results, 'conversation_id': conv_id})}\n\n"
|
||||
yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'searched': True})}\n\n"
|
||||
|
||||
return StreamingResponse(stream_search(), media_type="text/event-stream")
|
||||
|
||||
Reference in New Issue
Block a user