From ec2f4c0332aa80c9aa9d81101c67514daac2c61b Mon Sep 17 00:00:00 2001
From: gramps <gramps@jarvis.local>
Date: Sat, 20 Jun 2026 14:34:47 -0700
Subject: [PATCH] feat: add OpenAI-compat /v1/chat/completions endpoint (TODO
 #22)

---
 app.py                 |   3 +-
 routers/completions.py | 267 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 routers/completions.py

diff --git a/app.py b/app.py
index fadf6c0..2f75024 100644
--- a/app.py
+++ b/app.py
@@ -31,6 +31,7 @@ import routers.settings as settings
 import routers.skills as skills
 import routers.chat as chat
 import routers.search_route as search_route
+import routers.completions as completions
 
 # --- Logging ---
 log = logging.getLogger("jarvischat")
@@ -137,7 +138,7 @@ async def index(request: Request):
 for router_module in [
     auth_router, conversations.router, memories.router, models.router,
     presets.router, profile.router, settings.router, skills.router,
-    chat.router, search_route.router,
+    chat.router, search_route.router, completions.router,
 ]:
     app.include_router(router_module)
 
diff --git a/routers/completions.py b/routers/completions.py
new file mode 100644
index 0000000..c3e97ad
--- /dev/null
+++ b/routers/completions.py
@@ -0,0 +1,267 @@
+"""
+JarvisChat - /v1/chat/completions router.
+OpenAI-compatible endpoint for IDE integration (Continue.dev, etc.).
+Runs all requests through the full jC pipeline: profile + RAG + memory injection.
+FIM (fill-in-the-middle) requests are proxied directly — not persisted.
+Chat-style requests are persisted to conversation history.
+Auth: static Bearer token via COMPLETIONS_API_KEY in config.
+"""
+import json
+import logging
+import uuid
+from datetime import datetime, timezone
+
+import httpx
+from fastapi import APIRouter, HTTPException, Request
+from fastapi.responses import StreamingResponse, JSONResponse
+
+from config import DEFAULT_MODEL, LLAMA_SERVER_BASE, COMPLETIONS_API_KEY
+from db import get_db
+from rag import build_system_prompt
+from routers.chat import parse_llama_stream_chunk
+
+log = logging.getLogger("jarvischat")
+router = APIRouter()
+
+
+def _check_api_key(request: Request):
+    """Raise HTTP 401 unless the request's Bearer token equals COMPLETIONS_API_KEY."""
+    import hmac  # function-scope import; compare_digest avoids a timing side channel
+    auth = request.headers.get("Authorization", "")
+    if not auth.startswith("Bearer "):
+        raise HTTPException(status_code=401, detail="Missing Bearer token")
+    if not COMPLETIONS_API_KEY or not hmac.compare_digest(auth[7:].strip().encode(), COMPLETIONS_API_KEY.encode()):
+        raise HTTPException(status_code=401, detail="Invalid API key")
+
+def _is_fim_request(body: dict) -> bool:
+    """
+    True when *body* is a JSON object carrying 'prompt' but no 'messages' array, the
+    FIM (fill-in-the-middle) shape Continue.dev sends for inline autocomplete. Any
+    non-object body (list, string, null) is reported as not-FIM instead of raising.
+    """
+    return isinstance(body, dict) and "prompt" in body and "messages" not in body
+
+
+def _build_openai_chunk(token: str, model: str, conv_id: str) -> str:
+    """Return one SSE `data:` frame holding an OpenAI chat.completion.chunk for *token*."""
+    # Single choice at index 0; finish_reason is null on content frames.
+    choice = {"index": 0, "delta": {"content": token}, "finish_reason": None}
+    payload = json.dumps({
+        "id": f"chatcmpl-{conv_id}",
+        "object": "chat.completion.chunk",
+        "model": model,
+        "choices": [choice],
+    })
+    # A blank line terminates each server-sent event.
+    return "data: " + payload + "\n\n"
+
+
+def _build_openai_stop_chunk(model: str, conv_id: str) -> str:
+    """Return the closing SSE frame: an empty delta with finish_reason "stop"."""
+    final_choice = {"index": 0, "delta": {}, "finish_reason": "stop"}
+    frame = {
+        "id": f"chatcmpl-{conv_id}",
+        "object": "chat.completion.chunk",
+        "model": model,
+        "choices": [final_choice],
+    }
+    serialized = json.dumps(frame)
+    # A blank line terminates each server-sent event.
+    return f"data: {serialized}\n\n"
+
+
+def _build_openai_response(content: str, model: str, conv_id: str) -> dict:
+    """Wrap *content* in a non-streaming OpenAI chat.completion envelope."""
+    reply = {"role": "assistant", "content": content}
+    only_choice = {"index": 0, "message": reply, "finish_reason": "stop"}
+    # Token accounting is not computed here; usage is reported as all zeros.
+    zero_usage = dict.fromkeys(("prompt_tokens", "completion_tokens", "total_tokens"), 0)
+    return {
+        "id": f"chatcmpl-{conv_id}",
+        "object": "chat.completion",
+        "model": model,
+        "choices": [only_choice],
+        "usage": zero_usage,
+    }
+
+
+@router.post("/v1/chat/completions")
+async def chat_completions(request: Request):
+    """
+    OpenAI-compatible chat endpoint: authenticate, persist the exchange, prepend the jC
+    system prompt, then stream or buffer the reply from the inference server.
+
+    FIM-shaped bodies are handed to `_fim_passthrough`. Malformed payloads raise HTTP 400
+    (bad JSON, non-object body, missing or ill-typed messages, no user text).
+    """
+    _check_api_key(request)
+
+    try:
+        body = await request.json()
+    except Exception:
+        raise HTTPException(status_code=400, detail="Invalid JSON body")
+    if not isinstance(body, dict):
+        raise HTTPException(status_code=400, detail="Request body must be a JSON object")
+
+    # --- FIM passthrough ---
+    if _is_fim_request(body):
+        return await _fim_passthrough(body)
+
+    # --- Chat completion ---
+    messages = body.get("messages")
+    if not isinstance(messages, list) or not messages:
+        raise HTTPException(status_code=400, detail="No messages provided")
+    if not all(isinstance(msg, dict) for msg in messages):
+        raise HTTPException(status_code=400, detail="Each message must be a JSON object")
+    model = body.get("model", DEFAULT_MODEL)
+    stream = body.get("stream", True)
+
+    # The latest user message drives RAG retrieval and the conversation title
+    latest_user = next((m for m in reversed(messages) if m.get("role") == "user"), {})
+    user_message = _content_text(latest_user.get("content")).strip()
+    if not user_message:
+        raise HTTPException(status_code=400, detail="No user message found")
+
+    # --- Persist conversation ---
+    now = datetime.now(timezone.utc).isoformat()
+    conv_id = str(uuid.uuid4())
+    title = f"[IDE] {user_message[:72]}{'...' if len(user_message) > 72 else ''}"
+    db = get_db()
+    try:
+        db.execute(
+            "INSERT INTO conversations (id, title, model, created_at, updated_at) VALUES (?, ?, ?, ?, ?)",
+            (conv_id, title, model, now, now),
+        )
+        for msg in messages:
+            if msg.get("role") in ("user", "assistant"):
+                db.execute(
+                    "INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)",
+                    (conv_id, msg["role"], _content_text(msg.get("content")), now),
+                )
+        db.commit()
+
+        # --- Build system prompt through full jC pipeline ---
+        system_prompt = await build_system_prompt(db, "", user_message)
+    finally:
+        db.close()  # release the connection even when an insert or the pipeline raises
+
+    # jC owns the system prompt: drop client-supplied system messages, keep the rest as sent
+    upstream_messages = [{"role": "system", "content": system_prompt}] if system_prompt else []
+    upstream_messages += [msg for msg in messages if msg.get("role") != "system"]
+    # Always stream from upstream; the blocking path buffers for non-streaming clients
+    upstream_payload = {"model": model, "messages": upstream_messages, "stream": True}
+
+    if not stream:
+        return await _blocking_chat(upstream_payload, model, conv_id, request)
+    frames = _stream_chat(upstream_payload, model, conv_id, request)
+    return StreamingResponse(frames, media_type="text/event-stream")
+
+
+def _content_text(content) -> str:
+    """Flatten OpenAI message content (string, list of parts, or null) into plain text."""
+    if isinstance(content, list):  # multi-part form: [{"type": "text", "text": "..."}, ...]
+        content = "\n".join(p["text"] for p in content if isinstance(p, dict) and isinstance(p.get("text"), str))
+    return content if isinstance(content, str) else ""
+
+
+async def _stream_chat(payload: dict, model: str, conv_id: str, request: Request):
+    """
+    Relay an upstream completion as OpenAI SSE frames: a chunk per token, then stop + [DONE].
+    The joined reply is saved (best effort) before the closing frames go out, so a client
+    hanging up on [DONE] cannot skip the write. Failures are sent as a `data: {"error": ...}` frame.
+    """
+    full_response = []
+    url = f"{LLAMA_SERVER_BASE}/v1/chat/completions"
+    timeout = httpx.Timeout(300.0, connect=10.0)
+    try:
+        async with httpx.AsyncClient() as client:
+            async with client.stream("POST", url, json=payload, timeout=timeout) as resp:
+                resp.raise_for_status()  # an upstream 4xx/5xx must not pass as an empty reply
+                async for line in resp.aiter_lines():
+                    if not line.strip():
+                        continue
+                    token, done, _ = parse_llama_stream_chunk(line)
+                    if token:
+                        full_response.append(token)
+                        yield _build_openai_chunk(token, model, conv_id)
+                    if done:
+                        break
+        if full_response:  # persist assistant response
+            sql = "INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)"
+            row = (conv_id, "assistant", "".join(full_response), datetime.now(timezone.utc).isoformat())
+            db = get_db()
+            try:
+                db.execute(sql, row)
+                db.commit()
+            except Exception:
+                log.exception("completions stream: could not persist assistant reply")
+            finally:
+                db.close()
+        yield _build_openai_stop_chunk(model, conv_id)
+        yield "data: [DONE]\n\n"
+    except httpx.ConnectError:
+        err = {"error": {"message": "Cannot connect to inference server", "type": "connection_error"}}
+        yield f"data: {json.dumps(err)}\n\n"
+    except Exception as e:
+        log.exception("completions stream error: %s", e)
+        err = {"error": {"message": "Stream failed", "type": "server_error"}}
+        yield f"data: {json.dumps(err)}\n\n"
+
+
+async def _blocking_chat(payload: dict, model: str, conv_id: str, request: Request) -> JSONResponse:
+    """
+    Buffer the upstream token stream and return one OpenAI chat.completion JSON object.
+    A non-empty reply is stored in `messages` first. Raises HTTPException 503 when the
+    inference server is unreachable and 500 on any other upstream failure (incl. HTTP errors).
+    """
+    full_response = []
+    url = f"{LLAMA_SERVER_BASE}/v1/chat/completions"
+    timeout = httpx.Timeout(300.0, connect=10.0)
+    try:
+        async with httpx.AsyncClient() as client:
+            async with client.stream("POST", url, json=payload, timeout=timeout) as resp:
+                resp.raise_for_status()  # an upstream 4xx/5xx must not pass as an empty reply
+                async for line in resp.aiter_lines():
+                    if not line.strip():
+                        continue
+                    token, done, _ = parse_llama_stream_chunk(line)
+                    if token:
+                        full_response.append(token)
+                    if done:
+                        break
+    except httpx.ConnectError:
+        raise HTTPException(status_code=503, detail="Cannot connect to inference server")
+    except Exception as e:
+        log.exception("completions blocking error: %s", e)
+        raise HTTPException(status_code=500, detail="Inference request failed")
+
+    assistant_msg = "".join(full_response)
+    if assistant_msg:
+        sql = "INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)"
+        db = get_db()
+        try:
+            db.execute(sql, (conv_id, "assistant", assistant_msg, datetime.now(timezone.utc).isoformat()))
+            db.commit()
+        finally:
+            db.close()  # close even when the insert or commit raises
+    return JSONResponse(content=_build_openai_response(assistant_msg, model, conv_id))
+
+
+async def _fim_passthrough(body: dict) -> JSONResponse:
+    """
+    Forward a FIM body unchanged to llama-server's /v1/completions and mirror its JSON
+    reply and status code. Nothing is injected or persisted (autocomplete noise has no
+    RAG value). Raises HTTPException 503 when unreachable, 500 on any other failure.
+    NOTE(review): a streamed (SSE) upstream reply would presumably fail `.json()` — confirm.
+    """
+    target = f"{LLAMA_SERVER_BASE}/v1/completions"
+    limits = httpx.Timeout(30.0, connect=5.0)
+    async with httpx.AsyncClient() as http:
+        try:
+            upstream = await http.post(target, json=body, timeout=limits)
+            return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
+        except httpx.ConnectError:
+            raise HTTPException(status_code=503, detail="Cannot connect to inference server")
+        except Exception as exc:
+            log.error(f"FIM passthrough error: {exc}")
+            raise HTTPException(status_code=500, detail="FIM request failed")
\ No newline at end of file