feat: RAG pipeline + OpenAI SSE streaming, llama-server cluster integration

feat: switch from Ollama to llama-server OpenAI-compat API, fix streaming parser
2026-06-14 13:57:09 -07:00 · 2026-06-14 12:37:29 -07:00
5 changed files with 2408 additions and 67 deletions
--- a/app.py
+++ b/app.py
@@ -56,8 +56,8 @@ syslog_handler.setFormatter(
 log.addHandler(syslog_handler)
 # --- Configuration ---
-VERSION = "1.7.8"
+VERSION = "1.7.6"
-OLLAMA_BASE = "http://localhost:11434"
+OLLAMA_BASE = os.environ.get("OLLAMA_BASE", "http://localhost:11434")
 SEARXNG_BASE = "http://localhost:8888"
 BASE_DIR = Path(__file__).parent
 DB_PATH = BASE_DIR / "jarvischat.db"
@@ -1480,14 +1480,24 @@ async def index(request: Request):
    return templates.TemplateResponse(request, "index.html", {"version": VERSION})
 #@app.get("/api/models")
 #async def list_models():
 #    async with httpx.AsyncClient() as client:
 #        try:
 #            resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
 #            return resp.json()
 #        except httpx.ConnectError:
 #            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
@app.get("/api/models")
 async def list_models():
    async with httpx.AsyncClient() as client:
        try:
-            resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
+            resp = await client.get(f"{OLLAMA_BASE}/v1/models", timeout=10)
-            return resp.json()
+            data = resp.json()
            models = [{"name": m["id"], "model": m["id"]} for m in data.get("data", [])]
            return {"models": models}
        except httpx.ConnectError:
-            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
+            raise HTTPException(status_code=502, detail="Cannot connect to llama-server.")
@app.get("/api/ps")
@@ -1962,16 +1972,12 @@ async def explicit_search(request: Request):
                ) as resp:
                    async for line in resp.aiter_lines():
                        if line.strip():
-                            try:
+                            token, done, _ = parse_llama_stream_chunk(line)
-                                chunk = json.loads(line)
+                            if token:
-                                if "message" in chunk and "content" in chunk["message"]:
+                                full_response.append(token)
-                                    token = chunk["message"]["content"]
+                                yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
-                                    full_response.append(token)
+                            if done:
-                                    yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
+                                break
                                if chunk.get("done"):
                                    break
                            except json.JSONDecodeError:
                                pass
            except Exception as e:
                incident_key = log_incident(
                    "search_summarization_stream",
@@ -2010,7 +2016,32 @@ async def explicit_search(request: Request):
 # =============================================================================
-def build_system_prompt(db, extra_prompt="", user_message=""):
+
 async def query_rag(
    query: str,
    limit: int = 3,
    *,
    embed_url: str = "http://192.168.50.108:11434/api/embeddings",
    search_url: str = "http://192.168.50.108:6333/collections/jarvis_rag/points/search",
    embed_model: str = "mxbai-embed-large",
    timeout: float = 10.0,
 ) -> list[dict]:
    """Query Qdrant for semantically relevant chunks.

    The query text is first embedded via ``embed_url``; the resulting vector
    is then posted to the Qdrant search endpoint at ``search_url``.

    This is a best-effort lookup: every failure (network error, non-200
    status, malformed payload) is logged and yields an empty list, so a
    broken RAG backend never breaks the caller.

    Args:
        query: Text to embed and search for.
        limit: Maximum number of hits requested from the search endpoint.
        embed_url: Embeddings endpoint; must answer with a JSON object
            holding an ``"embedding"`` vector.
        search_url: Qdrant ``points/search`` endpoint of the collection.
        embed_model: Model name sent to the embeddings endpoint.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``"result"`` list from the search response (hits including
        their payload), or ``[]`` when nothing could be retrieved.
    """
    try:
        async with httpx.AsyncClient() as client:
            embed_resp = await client.post(
                embed_url,
                json={"model": embed_model, "prompt": query},
                timeout=timeout,
            )
            if embed_resp.status_code != 200:
                log.warning(
                    "RAG embedding request failed: HTTP %s", embed_resp.status_code
                )
                return []
            vector = embed_resp.json().get("embedding")
            if not vector:
                # An error body or empty vector cannot be searched with.
                log.warning("RAG embedding response contained no vector")
                return []
            search_resp = await client.post(
                search_url,
                json={"vector": vector, "limit": limit, "with_payload": True},
                timeout=timeout,
            )
            if search_resp.status_code != 200:
                log.warning(
                    "RAG search request failed: HTTP %s", search_resp.status_code
                )
                return []
            # `or []` also covers an explicit `"result": null`.
            return search_resp.json().get("result") or []
    except Exception as e:
        # Deliberately broad: retrieval is optional and must never raise
        # into the chat request path.
        log.warning("RAG query error: %s", e)
        return []
 async def build_system_prompt(db, extra_prompt="", user_message=""):
    """Build the full system prompt: profile + memories + preset."""
    parts = []
    settings = {
@@ -2030,6 +2061,17 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
            parts.append("## Relevant Context from Memory\n" + "\n".join(memory_lines))
            log.debug(f"Injected {len(memories)} memories into context")
    if user_message:
        try:
            rag_results = await query_rag(user_message)
            if rag_results:
                rag_lines = [r["payload"]["text"] for r in rag_results if r["score"] > 0.25]
                if rag_lines:
                    parts.append("## Retrieved Context\n" + "\n\n---\n\n".join(rag_lines))
                    log.warning(f"RAG injected {len(rag_lines)} chunks into context")
        except Exception as e:
            log.warning(f"RAG injection error: {e}")
    if settings.get("skills_enabled", "true") == "true":
        active_skills = [s for s in list_skills_with_state(db) if s["enabled"]]
        if active_skills:
@@ -2041,6 +2083,42 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
    return "\n\n---\n\n".join(parts) if parts else ""
 def parse_llama_stream_chunk(line: str) -> tuple[str | None, bool, dict]:
    """Parse OpenAI-compatible SSE chunk. Returns (token, is_done, stats)."""
    if line.startswith("data: "):
        line = line[6:]
    if line.strip() == "[DONE]":
        return None, True, {}
    try:
        chunk = json.loads(line)
        # OpenAI format
        choices = chunk.get("choices", [])
        if choices:
            delta = choices[0].get("delta", {})
            token = delta.get("content")
            finish = choices[0].get("finish_reason")
            stats = {}
            if finish == "stop":
                usage = chunk.get("usage", {})
                stats["tokens_per_sec"] = usage.get("tokens_per_second", 0.0)
            return token, finish == "stop", stats
        # Ollama format fallback
        if "message" in chunk and "content" in chunk["message"]:
            token = chunk["message"]["content"]
            done = chunk.get("done", False)
            stats = {}
            if done:
                eval_count = chunk.get("eval_count", 0)
                eval_duration = chunk.get("eval_duration", 0)
                stats["tokens_per_sec"] = (
                    (eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0
                )
            return token, done, stats
    except json.JSONDecodeError:
        pass
    return None, False, {}
@app.post("/api/chat")
 async def chat(request: Request):
    body = await read_json_body(request, BODY_LIMIT_CHAT_BYTES)
@@ -2086,7 +2164,7 @@ async def chat(request: Request):
        "SELECT role, content FROM messages WHERE conversation_id = ? ORDER BY id ASC",
        (conv_id,),
    ).fetchall()
-    system_prompt = build_system_prompt(db, preset_prompt, user_message)
+    system_prompt = await build_system_prompt(db, preset_prompt, user_message)
    db.close()
    messages = []
@@ -2099,7 +2177,6 @@ async def chat(request: Request):
        "model": model,
        "messages": messages,
        "stream": True,
        "logprobs": True,
    }
    async def stream_response():
@@ -2120,25 +2197,12 @@ async def chat(request: Request):
                ) as resp:
                    async for line in resp.aiter_lines():
                        if line.strip():
-                            try:
+                            token, done, stats = parse_llama_stream_chunk(line)
-                                chunk = json.loads(line)
+                            if token:
-                                if "message" in chunk and "content" in chunk["message"]:
+                                full_response.append(token)
-                                    token = chunk["message"]["content"]
+                                yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
-                                    full_response.append(token)
+                            if done:
-                                    yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
+                                tokens_per_sec = stats.get("tokens_per_sec", 0.0)
                                if "logprobs" in chunk and chunk["logprobs"]:
                                    all_logprobs.extend(chunk["logprobs"])
                                if chunk.get("done"):
                                    eval_count = chunk.get("eval_count", 0)
                                    eval_duration = chunk.get("eval_duration", 0)
                                    tokens_per_sec = (
                                        (eval_count / (eval_duration / 1e9))
                                        if eval_duration > 0
                                        else 0
                                    )
                                    break
                            except json.JSONDecodeError:
                                pass
                assistant_msg = "".join(full_response)
                perplexity = calculate_perplexity(all_logprobs) if all_logprobs else 0.0
@@ -2186,19 +2250,11 @@ async def chat(request: Request):
                        ) as resp2:
                            async for line in resp2.aiter_lines():
                                if line.strip():
-                                    try:
+                                    token2, done2, _ = parse_llama_stream_chunk(line)
-                                        chunk = json.loads(line)
+                                    if token2:
-                                        if (
+                                        augmented_response.append(token2)
-                                            "message" in chunk
+                                    if done2:
-                                            and "content" in chunk["message"]
+                                        break
                                        ):
                                            augmented_response.append(
                                                chunk["message"]["content"]
                                            )
                                        if chunk.get("done"):
                                            break
                                    except json.JSONDecodeError:
                                        pass
                        raw_response = "".join(augmented_response) or assistant_msg
                        cleaned_response = clean_hedging(raw_response)
@@ -2251,6 +2307,8 @@ async def chat(request: Request):
                yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'perplexity': round(perplexity, 2), 'tokens_per_sec': round(tokens_per_sec, 1)})}\n\n"
            except httpx.RemoteProtocolError:
                pass  # llama-server closes connection after [DONE] — normal
            except httpx.ConnectError:
                yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n"
            except Exception as e:
--- a/app.py.bak
+++ b/app.py.bak
--- a/docs/images/screenshot.png
+++ b/docs/images/screenshot.png
--- a/docs/wiki/current-wip.md
+++ b/docs/wiki/current-wip.md
@@ -4,7 +4,7 @@ Last updated: 2026-04-27
 Owner: Gramps + Copilot
 Scope: issues, bugs, security exposures, and feature enhancements.
-Total identified items: 27
+Total identified items: 26
 ## Priority Definitions
 - P0: Critical risk or data-loss/security exposure; do first.
@@ -60,23 +60,22 @@ Total identified items: 27
 12. Add unit/integration tests for: remember/forget parsing, refusal detection, search fallback, SSE done/error shape.
 13. Add conversation title sanitization and length constraints.
 14. Ensure default preset semantics are correct (currently all seeded presets are marked default).
 15. Add preflight validation for required model/preset selection and block send with clear user guidance instead of timing out.
 ### P2 Important Features
-16. Skills system: load markdown skill files with YAML frontmatter from skills directory.
+15. Skills system: load markdown skill files with YAML frontmatter from skills directory.
-17. Skills registry API: list/enable/disable skills and expose active skills to UI.
+16. Skills registry API: list/enable/disable skills and expose active skills to UI.
-18. Inject active skill instructions into system prompt with bounded token budget.
+17. Inject active skill instructions into system prompt with bounded token budget.
-19. Tool execution guardrails: allowlist, confirmation mode, and execution logs.
+18. Tool execution guardrails: allowlist, confirmation mode, and execution logs.
-20. Heartbeat scheduler (cron/systemd timer) for daily check-ins.
+19. Heartbeat scheduler (cron/systemd timer) for daily check-ins.
-21. Heartbeat endpoint for generated briefings and anomaly summaries.
+20. Heartbeat endpoint for generated briefings and anomaly summaries.
-22. Model info UI panel (description, updated date, best-use purpose).
+21. Model info UI panel (description, updated date, best-use purpose).
-23. Default model selection improvements and persistence validation.
+22. Default model selection improvements and persistence validation.
-24. Hidden model list support (exclude models from dropdown).
+23. Hidden model list support (exclude models from dropdown).
-25. Model update action from UI (trigger controlled model pull).
+24. Model update action from UI (trigger controlled model pull).
 ### P3 Nice to Have
-26. Conversation search/filter and export tooling.
+25. Conversation search/filter and export tooling.
-27. Keyboard shortcuts, retry button, and source-link polish.
+26. Keyboard shortcuts, retry button, and source-link polish.
 ## Maintenance Rules
 - Keep this file as the single source of truth.
--- a/readme.md
+++ b/readme.md
@@ -1,4 +1,4 @@
-# ⚡ JarvisChat v1.7.8
+# ⚡ JarvisChat v1.7.6
 ![screenshot](docs/images/screenshot.png)
@@ -74,7 +74,7 @@ Canonical backlog: [docs/wiki/current-wip.md](docs/wiki/current-wip.md)
 Scope boundary: local-first (same-host Ollama), optional RFC1918 LAN endpoints, no public Internet AI endpoints by default.
-Total identified items: 27
+Total identified items: 26
 Top 10 (brief):
@@ -113,7 +113,6 @@ Implementation status: complete (guest session by default + admin unlock + admin
 16. Hide/remove model from list — exclude models from dropdown
 17. Update model function — trigger `ollama pull` for selected model from UI
 18. Add mouseover tooltip to SEND button
 19. Add preflight validation for required model/preset selection and show a clear warning before send to prevent avoidable timeout loops
 ## File Structure
Author	SHA1	Message	Date
gramps	31ba4769c8	feat: RAG pipeline + OpenAI SSE streaming, llama-server cluster integration	2026-06-14 13:57:09 -07:00
gramps	347650b507	feat: switch from Ollama to llama-server OpenAI-compat API, fix streaming parser	2026-06-14 12:37:29 -07:00