chore: bump version to v1.8.0

feat: RAG pipeline + OpenAI SSE streaming, llama-server cluster integration
feat: switch from Ollama to llama-server OpenAI-compat API, fix streaming parser
2026-06-14 21:34:24 -07:00 · 2026-06-14 21:34:24 -07:00 · 2026-06-14 21:34:05 -07:00 · 2026-04-28 09:14:54 -07:00 · 2026-04-28 09:08:36 -07:00
5 changed files with 2415 additions and 69 deletions
--- a/app.py
+++ b/app.py
@@ -56,8 +56,9 @@ syslog_handler.setFormatter(
 log.addHandler(syslog_handler)

 # --- Configuration ---
-VERSION = "1.7.6"
-OLLAMA_BASE = "http://localhost:11434"
+# Bare semantic version, as before this change ("1.7.6"): the startup log
+# formats it as f"JarvisChat v{VERSION}", so a leading "v" here would render
+# as "vv1.8.0".
+VERSION = "1.8.0"
+OLLAMA_BASE = os.environ.get("OLLAMA_BASE", "http://localhost:11434")
+LLAMA_SERVER_BASE = os.environ.get("LLAMA_SERVER_BASE", "http://192.168.50.108:8081")
 SEARXNG_BASE = "http://localhost:8888"
 BASE_DIR = Path(__file__).parent
 DB_PATH = BASE_DIR / "jarvischat.db"
@@ -1038,7 +1039,7 @@ def get_gpu_stats() -> dict:
@asynccontextmanager
 async def lifespan(app: FastAPI):
    log.info(f"JarvisChat v{VERSION} starting up")
-    log.info(f"Ollama: {OLLAMA_BASE}, SearXNG: {SEARXNG_BASE}")
+    log.info(f"Ollama: {OLLAMA_BASE}, llama-server: {LLAMA_SERVER_BASE}, SearXNG: {SEARXNG_BASE}")
    init_db()
    log.info(f"Memory system: {get_memory_count()} memories loaded")
    yield
@@ -1480,14 +1481,24 @@ async def index(request: Request):
    return templates.TemplateResponse(request, "index.html", {"version": VERSION})


+#@app.get("/api/models")
+#async def list_models():
+#    async with httpx.AsyncClient() as client:
+#        try:
+#            resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
+#            return resp.json()
+#        except httpx.ConnectError:
+#            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
@app.get("/api/models")
 async def list_models():
    async with httpx.AsyncClient() as client:
        try:
-            resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
-            return resp.json()
+            resp = await client.get(f"{OLLAMA_BASE}/v1/models", timeout=10)
+            data = resp.json()
+            models = [{"name": m["id"], "model": m["id"]} for m in data.get("data", [])]
+            return {"models": models}
        except httpx.ConnectError:
-            raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
+            raise HTTPException(status_code=502, detail="Cannot connect to llama-server.")


@app.get("/api/ps")
@@ -1956,22 +1967,18 @@ async def explicit_search(request: Request):
            try:
                async with client.stream(
                    "POST",
-                    f"{OLLAMA_BASE}/api/chat",
+                    f"{LLAMA_SERVER_BASE}/v1/chat/completions",
                    json={"model": model, "messages": messages, "stream": True},
                    timeout=httpx.Timeout(300.0, connect=10.0),
                ) as resp:
                    async for line in resp.aiter_lines():
                        if line.strip():
-                            try:
-                                chunk = json.loads(line)
-                                if "message" in chunk and "content" in chunk["message"]:
-                                    token = chunk["message"]["content"]
-                                    full_response.append(token)
-                                    yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
-                                if chunk.get("done"):
-                                    break
-                            except json.JSONDecodeError:
-                                pass
+                            token, done, _ = parse_llama_stream_chunk(line)
+                            if token:
+                                full_response.append(token)
+                                yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
+                            if done:
+                                break
            except Exception as e:
                incident_key = log_incident(
                    "search_summarization_stream",
@@ -2010,7 +2017,32 @@ async def explicit_search(request: Request):
 # =============================================================================


-def build_system_prompt(db, extra_prompt="", user_message=""):
+
+# RAG endpoints. The defaults keep the LAN hosts that were previously
+# hard-coded; override them through the environment, like OLLAMA_BASE and
+# LLAMA_SERVER_BASE.
+RAG_EMBED_URL = os.environ.get(
+    "RAG_EMBED_URL", "http://192.168.50.108:11434/api/embeddings"
+)
+RAG_SEARCH_URL = os.environ.get(
+    "RAG_SEARCH_URL",
+    "http://192.168.50.108:6333/collections/jarvis_rag/points/search",
+)
+RAG_EMBED_MODEL = os.environ.get("RAG_EMBED_MODEL", "mxbai-embed-large")
+
+
+async def query_rag(query: str, limit: int = 3) -> list[dict]:
+    """Query Qdrant for the chunks most similar to *query*.
+
+    The query is embedded via the embeddings endpoint, then the resulting
+    vector is posted to the Qdrant search endpoint with payloads enabled.
+
+    Args:
+        query: Free-text user message to retrieve context for.
+        limit: Maximum number of points requested from Qdrant.
+
+    Returns:
+        The ``result`` list of the search response, or ``[]`` when the query
+        is blank, either service answers with a non-200 status, the response
+        has an unexpected shape, or any error occurs. Never raises: RAG is
+        optional context and must not break the chat request.
+    """
+    if not query or not query.strip():
+        # Nothing to embed; skip both network round-trips.
+        return []
+    try:
+        async with httpx.AsyncClient() as client:
+            embed_resp = await client.post(
+                RAG_EMBED_URL,
+                json={"model": RAG_EMBED_MODEL, "prompt": query},
+                timeout=10.0,
+            )
+            if embed_resp.status_code != 200:
+                log.warning(f"RAG embedding request failed: HTTP {embed_resp.status_code}")
+                return []
+            vector = embed_resp.json().get("embedding")
+            if not vector:
+                log.warning("RAG embedding response contained no vector")
+                return []
+            search_resp = await client.post(
+                RAG_SEARCH_URL,
+                json={"vector": vector, "limit": limit, "with_payload": True},
+                timeout=10.0,
+            )
+            if search_resp.status_code != 200:
+                log.warning(f"RAG search request failed: HTTP {search_resp.status_code}")
+                return []
+            result = search_resp.json().get("result")
+            return result if isinstance(result, list) else []
+    except Exception as e:
+        # Deliberately broad: this is a best-effort boundary (network errors,
+        # malformed JSON, unexpected payload types all degrade to "no context").
+        log.warning(f"RAG query error: {e}")
+        return []
+
+async def build_system_prompt(db, extra_prompt="", user_message=""):
    """Build the full system prompt: profile + memories + preset."""
    parts = []
    settings = {
@@ -2030,6 +2062,17 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
            parts.append("## Relevant Context from Memory\n" + "\n".join(memory_lines))
            log.debug(f"Injected {len(memories)} memories into context")

+    if user_message:
+        try:
+            rag_results = await query_rag(user_message)
+            if rag_results:
+                rag_lines = [r["payload"]["text"] for r in rag_results if r["score"] > 0.25]
+                if rag_lines:
+                    parts.append("## Retrieved Context\n" + "\n\n---\n\n".join(rag_lines))
+                    log.warning(f"RAG injected {len(rag_lines)} chunks into context")
+        except Exception as e:
+            log.warning(f"RAG injection error: {e}")
+
    if settings.get("skills_enabled", "true") == "true":
        active_skills = [s for s in list_skills_with_state(db) if s["enabled"]]
        if active_skills:
@@ -2041,6 +2084,42 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
    return "\n\n---\n\n".join(parts) if parts else ""


+def parse_llama_stream_chunk(line: str) -> tuple[str | None, bool, dict]:
+    """Parse one line of a streamed chat response.
+
+    Understands OpenAI-compatible SSE lines (``data: {...}`` / ``data: [DONE]``)
+    and, as a fallback, Ollama's newline-delimited JSON chunks.
+
+    Args:
+        line: A single raw line from the response stream.
+
+    Returns:
+        ``(token, is_done, stats)``. ``token`` is the text delta or ``None``.
+        ``is_done`` is true for ``[DONE]``, for any non-null ``finish_reason``
+        and for an Ollama chunk with ``done`` set. ``stats`` carries
+        ``tokens_per_sec`` on the finishing chunk and is empty otherwise.
+        Lines that are not a JSON object (SSE comments, keep-alives, other
+        SSE fields) yield ``(None, False, {})``.
+    """
+    payload = line.strip()
+    if payload.startswith("data:"):
+        # SSE allows "data:" with or without one following space.
+        payload = payload[5:].strip()
+    if payload == "[DONE]":
+        return None, True, {}
+    try:
+        chunk = json.loads(payload)
+    except json.JSONDecodeError:
+        return None, False, {}
+    if not isinstance(chunk, dict):
+        # Valid JSON but not an object (e.g. a bare number): nothing to read.
+        return None, False, {}
+
+    # OpenAI format
+    choices = chunk.get("choices")
+    if isinstance(choices, list) and choices:
+        choice = choices[0] if isinstance(choices[0], dict) else {}
+        # "delta" and "usage" may be JSON null, so `.get(key, {})` is not enough.
+        delta = choice.get("delta") or {}
+        token = delta.get("content")
+        # Any finish reason ("stop", "length", ...) ends the generation.
+        done = choice.get("finish_reason") is not None
+        stats = {}
+        if done:
+            usage = chunk.get("usage") or {}
+            # NOTE(review): llama-server appears to report throughput under
+            # "timings.predicted_per_second" rather than in "usage" - confirm
+            # against the deployed server version.
+            timings = chunk.get("timings") or {}
+            stats["tokens_per_sec"] = (
+                usage.get("tokens_per_second")
+                or timings.get("predicted_per_second")
+                or 0.0
+            )
+        return token, done, stats
+
+    # Ollama format fallback
+    message = chunk.get("message")
+    has_content = isinstance(message, dict) and "content" in message
+    done = bool(chunk.get("done", False))
+    if has_content or done:
+        token = message["content"] if has_content else None
+        stats = {}
+        if done:
+            eval_count = chunk.get("eval_count") or 0
+            eval_duration = chunk.get("eval_duration") or 0  # nanoseconds
+            stats["tokens_per_sec"] = (
+                (eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0
+            )
+        return token, done, stats
+    return None, False, {}
+
+
@app.post("/api/chat")
 async def chat(request: Request):
    body = await read_json_body(request, BODY_LIMIT_CHAT_BYTES)
@@ -2086,7 +2165,7 @@ async def chat(request: Request):
        "SELECT role, content FROM messages WHERE conversation_id = ? ORDER BY id ASC",
        (conv_id,),
    ).fetchall()
-    system_prompt = build_system_prompt(db, preset_prompt, user_message)
+    system_prompt = await build_system_prompt(db, preset_prompt, user_message)
    db.close()

    messages = []
@@ -2099,7 +2178,6 @@ async def chat(request: Request):
        "model": model,
        "messages": messages,
        "stream": True,
-        "logprobs": True,
    }

    async def stream_response():
@@ -2114,31 +2192,18 @@ async def chat(request: Request):
            try:
                async with client.stream(
                    "POST",
-                    f"{OLLAMA_BASE}/api/chat",
+                    f"{LLAMA_SERVER_BASE}/v1/chat/completions",
                    json=ollama_payload,
                    timeout=httpx.Timeout(300.0, connect=10.0),
                ) as resp:
                    async for line in resp.aiter_lines():
                        if line.strip():
-                            try:
-                                chunk = json.loads(line)
-                                if "message" in chunk and "content" in chunk["message"]:
-                                    token = chunk["message"]["content"]
-                                    full_response.append(token)
-                                    yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
-                                if "logprobs" in chunk and chunk["logprobs"]:
-                                    all_logprobs.extend(chunk["logprobs"])
-                                if chunk.get("done"):
-                                    eval_count = chunk.get("eval_count", 0)
-                                    eval_duration = chunk.get("eval_duration", 0)
-                                    tokens_per_sec = (
-                                        (eval_count / (eval_duration / 1e9))
-                                        if eval_duration > 0
-                                        else 0
-                                    )
-                                    break
-                            except json.JSONDecodeError:
-                                pass
+                            token, done, stats = parse_llama_stream_chunk(line)
+                            if token:
+                                full_response.append(token)
+                                yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
+                            if done:
+                                tokens_per_sec = stats.get("tokens_per_sec", 0.0)

                assistant_msg = "".join(full_response)
                perplexity = calculate_perplexity(all_logprobs) if all_logprobs else 0.0
@@ -2176,7 +2241,7 @@ async def chat(request: Request):
                        augmented_response = []
                        async with client.stream(
                            "POST",
-                            f"{OLLAMA_BASE}/api/chat",
+                            f"{LLAMA_SERVER_BASE}/v1/chat/completions",
                            json={
                                "model": model,
                                "messages": augmented_messages,
@@ -2186,19 +2251,11 @@ async def chat(request: Request):
                        ) as resp2:
                            async for line in resp2.aiter_lines():
                                if line.strip():
-                                    try:
-                                        chunk = json.loads(line)
-                                        if (
-                                            "message" in chunk
-                                            and "content" in chunk["message"]
-                                        ):
-                                            augmented_response.append(
-                                                chunk["message"]["content"]
-                                            )
-                                        if chunk.get("done"):
-                                            break
-                                    except json.JSONDecodeError:
-                                        pass
+                                    token2, done2, _ = parse_llama_stream_chunk(line)
+                                    if token2:
+                                        augmented_response.append(token2)
+                                    if done2:
+                                        break

                        raw_response = "".join(augmented_response) or assistant_msg
                        cleaned_response = clean_hedging(raw_response)
@@ -2251,6 +2308,8 @@ async def chat(request: Request):

                yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'perplexity': round(perplexity, 2), 'tokens_per_sec': round(tokens_per_sec, 1)})}\n\n"

+            except httpx.RemoteProtocolError:
+                pass  # llama-server closes connection after [DONE] — normal
            except httpx.ConnectError:
                yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n"
            except Exception as e:
--- a/app.py.bak
+++ b/app.py.bak
--- a/docs/images/screenshot.png
+++ b/docs/images/screenshot.png
--- a/docs/wiki/current-wip.md
+++ b/docs/wiki/current-wip.md
@@ -4,7 +4,7 @@ Last updated: 2026-04-27
 Owner: Gramps + Copilot
 Scope: issues, bugs, security exposures, and feature enhancements.

-Total identified items: 26
+Total identified items: 27

 ## Priority Definitions
 - P0: Critical risk or data-loss/security exposure; do first.
@@ -60,22 +60,23 @@ Total identified items: 26
 12. Add unit/integration tests for: remember/forget parsing, refusal detection, search fallback, SSE done/error shape.
 13. Add conversation title sanitization and length constraints.
 14. Ensure default preset semantics are correct (currently all seeded presets are marked default).
+15. Add preflight validation for required model/preset selection and block send with clear user guidance instead of timing out.

 ### P2 Important Features
-15. Skills system: load markdown skill files with YAML frontmatter from skills directory.
-16. Skills registry API: list/enable/disable skills and expose active skills to UI.
-17. Inject active skill instructions into system prompt with bounded token budget.
-18. Tool execution guardrails: allowlist, confirmation mode, and execution logs.
-19. Heartbeat scheduler (cron/systemd timer) for daily check-ins.
-20. Heartbeat endpoint for generated briefings and anomaly summaries.
-21. Model info UI panel (description, updated date, best-use purpose).
-22. Default model selection improvements and persistence validation.
-23. Hidden model list support (exclude models from dropdown).
-24. Model update action from UI (trigger controlled model pull).
+16. Skills system: load markdown skill files with YAML frontmatter from skills directory.
+17. Skills registry API: list/enable/disable skills and expose active skills to UI.
+18. Inject active skill instructions into system prompt with bounded token budget.
+19. Tool execution guardrails: allowlist, confirmation mode, and execution logs.
+20. Heartbeat scheduler (cron/systemd timer) for daily check-ins.
+21. Heartbeat endpoint for generated briefings and anomaly summaries.
+22. Model info UI panel (description, updated date, best-use purpose).
+23. Default model selection improvements and persistence validation.
+24. Hidden model list support (exclude models from dropdown).
+25. Model update action from UI (trigger controlled model pull).

 ### P3 Nice to Have
-25. Conversation search/filter and export tooling.
-26. Keyboard shortcuts, retry button, and source-link polish.
+26. Conversation search/filter and export tooling.
+27. Keyboard shortcuts, retry button, and source-link polish.

 ## Maintenance Rules
 - Keep this file as the single source of truth.
--- a/readme.md
+++ b/readme.md
@@ -1,4 +1,4 @@
-# ⚡ JarvisChat v1.7.6
+# ⚡ JarvisChat v1.8.0

 ![screenshot](docs/images/screenshot.png)

@@ -74,7 +74,7 @@ Canonical backlog: [docs/wiki/current-wip.md](docs/wiki/current-wip.md)

 Scope boundary: local-first (same-host Ollama), optional RFC1918 LAN endpoints, no public Internet AI endpoints by default.

-Total identified items: 26
+Total identified items: 27

 Top 10 (brief):

@@ -113,6 +113,7 @@ Implementation status: complete (guest session by default + admin unlock + admin
 16. Hide/remove model from list — exclude models from dropdown
 17. Update model function — trigger `ollama pull` for selected model from UI
 18. Add mouseover tooltip to SEND button
+19. Add preflight validation for required model/preset selection and show a clear warning before send to prevent avoidable timeout loops

 ## File Structure
Author	SHA1	Message	Date
gramps	dd475a6f2d	chore: bump version to v1.8.0	2026-06-14 21:34:24 -07:00
gramps	6de3a1e154	feat: RAG pipeline + OpenAI SSE streaming, llama-server cluster integration	2026-06-14 21:34:24 -07:00
gramps	5a652c1b74	feat: switch from Ollama to llama-server OpenAI-compat API, fix streaming parser	2026-06-14 21:34:05 -07:00
gramps	18bca027de	docs: replace README screenshot asset (v1.7.8)	2026-04-28 09:14:54 -07:00
gramps	36bca94840	docs(todo): add model/preset preflight validation item (v1.7.7)	2026-04-28 09:08:36 -07:00