feat: switch from Ollama to llama-server OpenAI-compat API, fix streaming parser

2026-06-14 12:37:29 -07:00
parent 71b48d940f
commit 347650b507
2 changed files with 2354 additions and 47 deletions
--- a/app.py
+++ b/app.py
@@ -57,7 +57,7 @@ log.addHandler(syslog_handler)
 # --- Configuration ---
 VERSION = "1.7.6"
-OLLAMA_BASE = "http://localhost:11434"
+OLLAMA_BASE = os.environ.get("OLLAMA_BASE", "http://localhost:11434")
 SEARXNG_BASE = "http://localhost:8888"
 BASE_DIR = Path(__file__).parent
 DB_PATH = BASE_DIR / "jarvischat.db"
@@ -1480,14 +1480,24 @@ async def index(request: Request):
    return templates.TemplateResponse(request, "index.html", {"version": VERSION})
 #@app.get("/api/models")
 #async def list_models():
 #    async with httpx.AsyncClient() as client:
 #        try:
 #            resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
 #            return resp.json()
 #        except httpx.ConnectError:
 #            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
@app.get("/api/models")
 async def list_models():
    async with httpx.AsyncClient() as client:
        try:
-            resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
+            resp = await client.get(f"{OLLAMA_BASE}/v1/models", timeout=10)
-            return resp.json()
+            data = resp.json()
            models = [{"name": m["id"], "model": m["id"]} for m in data.get("data", [])]
            return {"models": models}
        except httpx.ConnectError:
-            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
+            raise HTTPException(status_code=502, detail="Cannot connect to llama-server.")
@app.get("/api/ps")
@@ -1962,16 +1972,12 @@ async def explicit_search(request: Request):
                ) as resp:
                    async for line in resp.aiter_lines():
                        if line.strip():
-                            try:
+                            token, done, _ = parse_llama_stream_chunk(line)
-                                chunk = json.loads(line)
+                            if token:
-                                if "message" in chunk and "content" in chunk["message"]:
+                                full_response.append(token)
-                                    token = chunk["message"]["content"]
+                                yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
-                                    full_response.append(token)
+                            if done:
-                                    yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
+                                break
                                if chunk.get("done"):
                                    break
                            except json.JSONDecodeError:
                                pass
            except Exception as e:
                incident_key = log_incident(
                    "search_summarization_stream",
@@ -2041,6 +2047,42 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
    return "\n\n---\n\n".join(parts) if parts else ""
 def parse_llama_stream_chunk(line: str) -> tuple[str | None, bool, dict]:
    """Parse OpenAI-compatible SSE chunk. Returns (token, is_done, stats)."""
    if line.startswith("data: "):
        line = line[6:]
    if line.strip() == "[DONE]":
        return None, True, {}
    try:
        chunk = json.loads(line)
        # OpenAI format
        choices = chunk.get("choices", [])
        if choices:
            delta = choices[0].get("delta", {})
            token = delta.get("content")
            finish = choices[0].get("finish_reason")
            stats = {}
            if finish == "stop":
                usage = chunk.get("usage", {})
                stats["tokens_per_sec"] = usage.get("tokens_per_second", 0.0)
            return token, finish == "stop", stats
        # Ollama format fallback
        if "message" in chunk and "content" in chunk["message"]:
            token = chunk["message"]["content"]
            done = chunk.get("done", False)
            stats = {}
            if done:
                eval_count = chunk.get("eval_count", 0)
                eval_duration = chunk.get("eval_duration", 0)
                stats["tokens_per_sec"] = (
                    (eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0
                )
            return token, done, stats
    except json.JSONDecodeError:
        pass
    return None, False, {}
@app.post("/api/chat")
 async def chat(request: Request):
    body = await read_json_body(request, BODY_LIMIT_CHAT_BYTES)
@@ -2099,7 +2141,6 @@ async def chat(request: Request):
        "model": model,
        "messages": messages,
        "stream": True,
        "logprobs": True,
    }
    async def stream_response():
@@ -2120,25 +2161,12 @@ async def chat(request: Request):
                ) as resp:
                    async for line in resp.aiter_lines():
                        if line.strip():
-                            try:
+                            token, done, stats = parse_llama_stream_chunk(line)
-                                chunk = json.loads(line)
+                            if token:
-                                if "message" in chunk and "content" in chunk["message"]:
+                                full_response.append(token)
-                                    token = chunk["message"]["content"]
+                                yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
-                                    full_response.append(token)
+                            if done:
-                                    yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
+                                tokens_per_sec = stats.get("tokens_per_sec", 0.0)
                                if "logprobs" in chunk and chunk["logprobs"]:
                                    all_logprobs.extend(chunk["logprobs"])
                                if chunk.get("done"):
                                    eval_count = chunk.get("eval_count", 0)
                                    eval_duration = chunk.get("eval_duration", 0)
                                    tokens_per_sec = (
                                        (eval_count / (eval_duration / 1e9))
                                        if eval_duration > 0
                                        else 0
                                    )
                                    break
                            except json.JSONDecodeError:
                                pass
                assistant_msg = "".join(full_response)
                perplexity = calculate_perplexity(all_logprobs) if all_logprobs else 0.0
@@ -2186,19 +2214,11 @@ async def chat(request: Request):
                        ) as resp2:
                            async for line in resp2.aiter_lines():
                                if line.strip():
-                                    try:
+                                    token2, done2, _ = parse_llama_stream_chunk(line)
-                                        chunk = json.loads(line)
+                                    if token2:
-                                        if (
+                                        augmented_response.append(token2)
-                                            "message" in chunk
+                                    if done2:
-                                            and "content" in chunk["message"]
+                                        break
                                        ):
                                            augmented_response.append(
                                                chunk["message"]["content"]
                                            )
                                        if chunk.get("done"):
                                            break
                                    except json.JSONDecodeError:
                                        pass
                        raw_response = "".join(augmented_response) or assistant_msg
                        cleaned_response = clean_hedging(raw_response)
@@ -2251,6 +2271,8 @@ async def chat(request: Request):
                yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'perplexity': round(perplexity, 2), 'tokens_per_sec': round(tokens_per_sec, 1)})}\n\n"
            except httpx.RemoteProtocolError:
                pass  # llama-server closes connection after [DONE] — normal
            except httpx.ConnectError:
                yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n"
            except Exception as e:
--- a/app.py.bak
+++ b/app.py.bak