From ec2f4c0332aa80c9aa9d81101c67514daac2c61b Mon Sep 17 00:00:00 2001 From: gramps Date: Sat, 20 Jun 2026 14:34:47 -0700 Subject: [PATCH] feat: add OpenAI-compat /v1/chat/completions endpoint (TODO #22) --- app.py | 3 +- routers/completions.py | 267 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 routers/completions.py diff --git a/app.py b/app.py index fadf6c0..2f75024 100644 --- a/app.py +++ b/app.py @@ -31,6 +31,7 @@ import routers.settings as settings import routers.skills as skills import routers.chat as chat import routers.search_route as search_route +import routers.completions as completions # --- Logging --- log = logging.getLogger("jarvischat") @@ -137,7 +138,7 @@ async def index(request: Request): for router_module in [ auth_router, conversations.router, memories.router, models.router, presets.router, profile.router, settings.router, skills.router, - chat.router, search_route.router, + chat.router, search_route.router, completions.router, ]: app.include_router(router_module) diff --git a/routers/completions.py b/routers/completions.py new file mode 100644 index 0000000..c3e97ad --- /dev/null +++ b/routers/completions.py @@ -0,0 +1,267 @@ +""" +JarvisChat - /v1/chat/completions router. +OpenAI-compatible endpoint for IDE integration (Continue.dev, etc.). +Runs all requests through the full jC pipeline: profile + RAG + memory injection. +FIM (fill-in-the-middle) requests are proxied directly — not persisted. +Chat-style requests are persisted to conversation history. +Auth: static Bearer token via COMPLETIONS_API_KEY in config. +""" +import json +import logging +import uuid +from datetime import datetime, timezone + +import httpx +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import StreamingResponse, JSONResponse + +from config import DEFAULT_MODEL, LLAMA_SERVER_BASE, COMPLETIONS_API_KEY +from db import get_db +from rag import build_system_prompt +from routers.chat import parse_llama_stream_chunk + +log = logging.getLogger("jarvischat") +router = APIRouter() + + +def _check_api_key(request: Request): + auth = request.headers.get("Authorization", "") + if not auth.startswith("Bearer "): + raise HTTPException(status_code=401, detail="Missing Bearer token") + token = auth[7:].strip() + if token != COMPLETIONS_API_KEY: + raise HTTPException(status_code=401, detail="Invalid API key") + + +def _is_fim_request(body: dict) -> bool: + """ + FIM (fill-in-the-middle) requests use a 'prompt' + optional 'suffix' structure + rather than a 'messages' array. Continue.dev sends these for inline autocomplete. + We proxy them directly without pipeline injection or persistence. + """ + return "prompt" in body and "messages" not in body + + +def _build_openai_chunk(token: str, model: str, conv_id: str) -> str: + chunk = { + "id": f"chatcmpl-{conv_id}", + "object": "chat.completion.chunk", + "model": model, + "choices": [{ + "index": 0, + "delta": {"content": token}, + "finish_reason": None, + }], + } + return f"data: {json.dumps(chunk)}\n\n" + + +def _build_openai_stop_chunk(model: str, conv_id: str) -> str: + chunk = { + "id": f"chatcmpl-{conv_id}", + "object": "chat.completion.chunk", + "model": model, + "choices": [{ + "index": 0, + "delta": {}, + "finish_reason": "stop", + }], + } + return f"data: {json.dumps(chunk)}\n\n" + + +def _build_openai_response(content: str, model: str, conv_id: str) -> dict: + """Non-streaming response envelope.""" + return { + "id": f"chatcmpl-{conv_id}", + "object": "chat.completion", + "model": model, + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": content}, + "finish_reason": "stop", + }], + "usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}, + } + + +@router.post("/v1/chat/completions") +async def chat_completions(request: Request): + _check_api_key(request) + + try: + body = await request.json() + except Exception: + raise HTTPException(status_code=400, detail="Invalid JSON body") + + # --- FIM passthrough --- + if _is_fim_request(body): + return await _fim_passthrough(body) + + # --- Chat completion --- + messages = body.get("messages", []) + if not messages: + raise HTTPException(status_code=400, detail="No messages provided") + + model = body.get("model", DEFAULT_MODEL) + stream = body.get("stream", True) + + # Extract the latest user message for RAG + conversation title + user_message = "" + for msg in reversed(messages): + if msg.get("role") == "user": + user_message = msg.get("content", "").strip() + break + + if not user_message: + raise HTTPException(status_code=400, detail="No user message found") + + # --- Persist conversation --- + db = get_db() + now = datetime.now(timezone.utc).isoformat() + conv_id = str(uuid.uuid4()) + title = f"[IDE] {user_message[:72]}{'...' if len(user_message) > 72 else ''}" + db.execute( + "INSERT INTO conversations (id, title, model, created_at, updated_at) VALUES (?, ?, ?, ?, ?)", + (conv_id, title, model, now, now), + ) + for msg in messages: + role = msg.get("role") + content = msg.get("content", "") + if role in ("user", "assistant"): + db.execute( + "INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)", + (conv_id, role, content, now), + ) + db.commit() + + # --- Build system prompt through full jC pipeline --- + system_prompt = await build_system_prompt(db, "", user_message) + db.close() + + # Assemble messages for upstream: inject jC system prompt, preserve history + upstream_messages = [] + if system_prompt: + upstream_messages.append({"role": "system", "content": system_prompt}) + + # Strip any system messages from the incoming payload — jC owns the system prompt + for msg in messages: + if msg.get("role") != "system": + upstream_messages.append(msg) + + upstream_payload = { + "model": model, + "messages": upstream_messages, + "stream": True, # always stream from upstream; we buffer if client wants non-stream + } + + if stream: + return StreamingResponse( + _stream_chat(upstream_payload, model, conv_id, request), + media_type="text/event-stream", + ) + else: + return await _blocking_chat(upstream_payload, model, conv_id, request) + + +async def _stream_chat(payload: dict, model: str, conv_id: str, request: Request): + """Stream tokens to client in OpenAI SSE format, persist assistant response.""" + full_response = [] + + async with httpx.AsyncClient() as client: + try: + async with client.stream( + "POST", f"{LLAMA_SERVER_BASE}/v1/chat/completions", + json=payload, + timeout=httpx.Timeout(300.0, connect=10.0), + ) as resp: + async for line in resp.aiter_lines(): + if not line.strip(): + continue + token, done, _ = parse_llama_stream_chunk(line) + if token: + full_response.append(token) + yield _build_openai_chunk(token, model, conv_id) + if done: + break + + yield _build_openai_stop_chunk(model, conv_id) + yield "data: [DONE]\n\n" + + # Persist assistant response + assistant_msg = "".join(full_response) + if assistant_msg: + db = get_db() + db.execute( + "INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)", + (conv_id, "assistant", assistant_msg, datetime.now(timezone.utc).isoformat()), + ) + db.commit() + db.close() + + except httpx.ConnectError: + err = {"error": {"message": "Cannot connect to inference server", "type": "connection_error"}} + yield f"data: {json.dumps(err)}\n\n" + except Exception as e: + log.error(f"completions stream error: {e}") + err = {"error": {"message": "Stream failed", "type": "server_error"}} + yield f"data: {json.dumps(err)}\n\n" + + +async def _blocking_chat(payload: dict, model: str, conv_id: str, request: Request) -> JSONResponse: + """Accumulate full response, return as standard OpenAI JSON object.""" + full_response = [] + + async with httpx.AsyncClient() as client: + try: + async with client.stream( + "POST", f"{LLAMA_SERVER_BASE}/v1/chat/completions", + json=payload, + timeout=httpx.Timeout(300.0, connect=10.0), + ) as resp: + async for line in resp.aiter_lines(): + if not line.strip(): + continue + token, done, _ = parse_llama_stream_chunk(line) + if token: + full_response.append(token) + if done: + break + except httpx.ConnectError: + raise HTTPException(status_code=503, detail="Cannot connect to inference server") + except Exception as e: + log.error(f"completions blocking error: {e}") + raise HTTPException(status_code=500, detail="Inference request failed") + + assistant_msg = "".join(full_response) + + if assistant_msg: + db = get_db() + db.execute( + "INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)", + (conv_id, "assistant", assistant_msg, datetime.now(timezone.utc).isoformat()), + ) + db.commit() + db.close() + + return JSONResponse(content=_build_openai_response(assistant_msg, model, conv_id)) + + +async def _fim_passthrough(body: dict) -> JSONResponse: + """ + Proxy FIM requests directly to llama-server without pipeline injection. + Not persisted — autocomplete noise has no RAG value. + """ + async with httpx.AsyncClient() as client: + try: + resp = await client.post( + f"{LLAMA_SERVER_BASE}/v1/completions", + json=body, + timeout=httpx.Timeout(30.0, connect=5.0), + ) + return JSONResponse(content=resp.json(), status_code=resp.status_code) + except httpx.ConnectError: + raise HTTPException(status_code=503, detail="Cannot connect to inference server") + except Exception as e: + log.error(f"FIM passthrough error: {e}") + raise HTTPException(status_code=500, detail="FIM request failed") \ No newline at end of file