Files
jarvisChat/routers/completions.py
gramps 193829b7ff fix: resolve all critical runtime errors and bugs from audit
- Add COMPLETIONS_API_KEY to config.py (env var + auto-generated fallback)
- Fix perplexity auto-search: upstream sends logprobs=true, parse_llama_stream_chunk
  extracts per-token logprobs, all_logprobs populated during streaming
- Fix all /api/models endpoints to target LLAMA_SERVER_BASE (port 8081) not OLLAMA_BASE
- Fix RAG embedding endpoint URL from port 11434 (Ollama) to 8081 (llama-server)
- Correct misleading error messages: 'inference server' not 'Ollama'
- Remove raw_results leak from SSE event stream in /api/search
- Fix weather query extractor: pattern-match instead of unconditional suffix append
- Escape FTS5 operator keywords (AND/OR/NOT/NEAR) in memory search
- Move auth.py BODY_LIMIT_DEFAULT_BYTES imports to module level
- Change RAG injection log level from warning to info
- Fix all 8 test files after modular refactor (rewire imports from correct modules)
- Update AGENTS.md and README.md to reflect v1.8.0 changes
2026-06-27 15:10:32 -07:00

267 lines
9.5 KiB
Python

"""
JarvisChat - /v1/chat/completions router.
OpenAI-compatible endpoint for IDE integration (Continue.dev, etc.).
Runs all requests through the full jC pipeline: profile + RAG + memory injection.
FIM (fill-in-the-middle) requests are proxied directly — not persisted.
Chat-style requests are persisted to conversation history.
Auth: static Bearer token via COMPLETIONS_API_KEY in config.
"""
import json
import logging
import uuid
from datetime import datetime, timezone
import httpx
from fastapi import APIRouter, HTTPException, Request
from fastapi.responses import StreamingResponse, JSONResponse
from config import DEFAULT_MODEL, LLAMA_SERVER_BASE, COMPLETIONS_API_KEY
from db import get_db
from rag import build_system_prompt
from routers.chat import parse_llama_stream_chunk
log = logging.getLogger("jarvischat")
router = APIRouter()
def _check_api_key(request: Request):
auth = request.headers.get("Authorization", "")
if not auth.startswith("Bearer "):
raise HTTPException(status_code=401, detail="Missing Bearer token")
token = auth[7:].strip()
if token != COMPLETIONS_API_KEY:
raise HTTPException(status_code=401, detail="Invalid API key")
def _is_fim_request(body: dict) -> bool:
"""
FIM (fill-in-the-middle) requests use a 'prompt' + optional 'suffix' structure
rather than a 'messages' array. Continue.dev sends these for inline autocomplete.
We proxy them directly without pipeline injection or persistence.
"""
return "prompt" in body and "messages" not in body
def _build_openai_chunk(token: str, model: str, conv_id: str) -> str:
chunk = {
"id": f"chatcmpl-{conv_id}",
"object": "chat.completion.chunk",
"model": model,
"choices": [{
"index": 0,
"delta": {"content": token},
"finish_reason": None,
}],
}
return f"data: {json.dumps(chunk)}\n\n"
def _build_openai_stop_chunk(model: str, conv_id: str) -> str:
chunk = {
"id": f"chatcmpl-{conv_id}",
"object": "chat.completion.chunk",
"model": model,
"choices": [{
"index": 0,
"delta": {},
"finish_reason": "stop",
}],
}
return f"data: {json.dumps(chunk)}\n\n"
def _build_openai_response(content: str, model: str, conv_id: str) -> dict:
"""Non-streaming response envelope."""
return {
"id": f"chatcmpl-{conv_id}",
"object": "chat.completion",
"model": model,
"choices": [{
"index": 0,
"message": {"role": "assistant", "content": content},
"finish_reason": "stop",
}],
"usage": {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
}
@router.post("/v1/chat/completions")
async def chat_completions(request: Request):
_check_api_key(request)
try:
body = await request.json()
except Exception:
raise HTTPException(status_code=400, detail="Invalid JSON body")
# --- FIM passthrough ---
if _is_fim_request(body):
return await _fim_passthrough(body)
# --- Chat completion ---
messages = body.get("messages", [])
if not messages:
raise HTTPException(status_code=400, detail="No messages provided")
model = body.get("model", DEFAULT_MODEL)
stream = body.get("stream", True)
# Extract the latest user message for RAG + conversation title
user_message = ""
for msg in reversed(messages):
if msg.get("role") == "user":
user_message = msg.get("content", "").strip()
break
if not user_message:
raise HTTPException(status_code=400, detail="No user message found")
# --- Persist conversation ---
db = get_db()
now = datetime.now(timezone.utc).isoformat()
conv_id = str(uuid.uuid4())
title = f"[IDE] {user_message[:72]}{'...' if len(user_message) > 72 else ''}"
db.execute(
"INSERT INTO conversations (id, title, model, created_at, updated_at) VALUES (?, ?, ?, ?, ?)",
(conv_id, title, model, now, now),
)
for msg in messages:
role = msg.get("role")
content = msg.get("content", "")
if role in ("user", "assistant"):
db.execute(
"INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)",
(conv_id, role, content, now),
)
db.commit()
# --- Build system prompt through full jC pipeline ---
system_prompt = await build_system_prompt(db, "", user_message)
db.close()
# Assemble messages for upstream: inject jC system prompt, preserve history
upstream_messages = []
if system_prompt:
upstream_messages.append({"role": "system", "content": system_prompt})
# Strip any system messages from the incoming payload — jC owns the system prompt
for msg in messages:
if msg.get("role") != "system":
upstream_messages.append(msg)
upstream_payload = {
"model": model,
"messages": upstream_messages,
"stream": True, # always stream from upstream; we buffer if client wants non-stream
}
if stream:
return StreamingResponse(
_stream_chat(upstream_payload, model, conv_id, request),
media_type="text/event-stream",
)
else:
return await _blocking_chat(upstream_payload, model, conv_id, request)
async def _stream_chat(payload: dict, model: str, conv_id: str, request: Request):
"""Stream tokens to client in OpenAI SSE format, persist assistant response."""
full_response = []
async with httpx.AsyncClient() as client:
try:
async with client.stream(
"POST", f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json=payload,
timeout=httpx.Timeout(300.0, connect=10.0),
) as resp:
async for line in resp.aiter_lines():
if not line.strip():
continue
token, done, _, _ = parse_llama_stream_chunk(line)
if token:
full_response.append(token)
yield _build_openai_chunk(token, model, conv_id)
if done:
break
yield _build_openai_stop_chunk(model, conv_id)
yield "data: [DONE]\n\n"
# Persist assistant response
assistant_msg = "".join(full_response)
if assistant_msg:
db = get_db()
db.execute(
"INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)",
(conv_id, "assistant", assistant_msg, datetime.now(timezone.utc).isoformat()),
)
db.commit()
db.close()
except httpx.ConnectError:
err = {"error": {"message": "Cannot connect to inference server", "type": "connection_error"}}
yield f"data: {json.dumps(err)}\n\n"
except Exception as e:
log.error(f"completions stream error: {e}")
err = {"error": {"message": "Stream failed", "type": "server_error"}}
yield f"data: {json.dumps(err)}\n\n"
async def _blocking_chat(payload: dict, model: str, conv_id: str, request: Request) -> JSONResponse:
"""Accumulate full response, return as standard OpenAI JSON object."""
full_response = []
async with httpx.AsyncClient() as client:
try:
async with client.stream(
"POST", f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json=payload,
timeout=httpx.Timeout(300.0, connect=10.0),
) as resp:
async for line in resp.aiter_lines():
if not line.strip():
continue
token, done, _, _ = parse_llama_stream_chunk(line)
if token:
full_response.append(token)
if done:
break
except httpx.ConnectError:
raise HTTPException(status_code=503, detail="Cannot connect to inference server")
except Exception as e:
log.error(f"completions blocking error: {e}")
raise HTTPException(status_code=500, detail="Inference request failed")
assistant_msg = "".join(full_response)
if assistant_msg:
db = get_db()
db.execute(
"INSERT INTO messages (conversation_id, role, content, created_at) VALUES (?, ?, ?, ?)",
(conv_id, "assistant", assistant_msg, datetime.now(timezone.utc).isoformat()),
)
db.commit()
db.close()
return JSONResponse(content=_build_openai_response(assistant_msg, model, conv_id))
async def _fim_passthrough(body: dict) -> JSONResponse:
"""
Proxy FIM requests directly to llama-server without pipeline injection.
Not persisted — autocomplete noise has no RAG value.
"""
async with httpx.AsyncClient() as client:
try:
resp = await client.post(
f"{LLAMA_SERVER_BASE}/v1/completions",
json=body,
timeout=httpx.Timeout(30.0, connect=5.0),
)
return JSONResponse(content=resp.json(), status_code=resp.status_code)
except httpx.ConnectError:
raise HTTPException(status_code=503, detail="Cannot connect to inference server")
except Exception as e:
log.error(f"FIM passthrough error: {e}")
raise HTTPException(status_code=500, detail="FIM request failed")