Compare commits

...

5 Commits

5 changed files with 2415 additions and 69 deletions

155
app.py
View File

@@ -56,8 +56,9 @@ syslog_handler.setFormatter(
log.addHandler(syslog_handler)
# --- Configuration ---
VERSION = "1.7.6"
OLLAMA_BASE = "http://localhost:11434"
# Bare version number, no leading "v": the startup log formats it as
# f"JarvisChat v{VERSION}", so a prefixed value would render as "vv1.8.0".
VERSION = "1.8.0"
OLLAMA_BASE = os.environ.get("OLLAMA_BASE", "http://localhost:11434")
LLAMA_SERVER_BASE = os.environ.get("LLAMA_SERVER_BASE", "http://192.168.50.108:8081")
SEARXNG_BASE = "http://localhost:8888"
BASE_DIR = Path(__file__).parent
DB_PATH = BASE_DIR / "jarvischat.db"
@@ -1038,7 +1039,7 @@ def get_gpu_stats() -> dict:
@asynccontextmanager
async def lifespan(app: FastAPI):
log.info(f"JarvisChat v{VERSION} starting up")
log.info(f"Ollama: {OLLAMA_BASE}, SearXNG: {SEARXNG_BASE}")
log.info(f"Ollama: {OLLAMA_BASE}, llama-server: {LLAMA_SERVER_BASE}, SearXNG: {SEARXNG_BASE}")
init_db()
log.info(f"Memory system: {get_memory_count()} memories loaded")
yield
@@ -1480,14 +1481,24 @@ async def index(request: Request):
return templates.TemplateResponse(request, "index.html", {"version": VERSION})
#@app.get("/api/models")
#async def list_models():
# async with httpx.AsyncClient() as client:
# try:
# resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
# return resp.json()
# except httpx.ConnectError:
# raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
@app.get("/api/models")
async def list_models():
async with httpx.AsyncClient() as client:
try:
resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
return resp.json()
resp = await client.get(f"{OLLAMA_BASE}/v1/models", timeout=10)
data = resp.json()
models = [{"name": m["id"], "model": m["id"]} for m in data.get("data", [])]
return {"models": models}
except httpx.ConnectError:
raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
raise HTTPException(status_code=502, detail="Cannot connect to llama-server.")
@app.get("/api/ps")
@@ -1956,22 +1967,18 @@ async def explicit_search(request: Request):
try:
async with client.stream(
"POST",
f"{OLLAMA_BASE}/api/chat",
f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json={"model": model, "messages": messages, "stream": True},
timeout=httpx.Timeout(300.0, connect=10.0),
) as resp:
async for line in resp.aiter_lines():
if line.strip():
try:
chunk = json.loads(line)
if "message" in chunk and "content" in chunk["message"]:
token = chunk["message"]["content"]
token, done, _ = parse_llama_stream_chunk(line)
if token:
full_response.append(token)
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
if chunk.get("done"):
if done:
break
except json.JSONDecodeError:
pass
except Exception as e:
incident_key = log_incident(
"search_summarization_stream",
@@ -2010,7 +2017,32 @@ async def explicit_search(request: Request):
# =============================================================================
def build_system_prompt(db, extra_prompt="", user_message=""):
async def query_rag(
    query: str,
    limit: int = 3,
    *,
    embed_url: str = "http://192.168.50.108:11434/api/embeddings",
    embed_model: str = "mxbai-embed-large",
    search_url: str = "http://192.168.50.108:6333/collections/jarvis_rag/points/search",
    timeout: float = 10.0,
) -> list[dict]:
    """Query Qdrant for semantically relevant chunks.

    The query text is first embedded via ``embed_url``; the resulting vector
    is then posted to ``search_url`` and the raw hit list is returned.

    This is a best-effort enrichment step: it never raises. Any failure
    (network error, non-200 status, malformed JSON) is logged as a warning
    and reported as an empty list so the caller can proceed without RAG.

    Args:
        query: Text to search for. Blank/whitespace-only text short-circuits
            to ``[]`` without touching the network.
        limit: Maximum number of hits to request; values <= 0 return ``[]``.
        embed_url: Embeddings endpoint (keyword-only; defaults to the
            previously hard-coded address).
        embed_model: Embedding model name sent to ``embed_url``.
        search_url: Vector-search endpoint for the target collection.
        timeout: Per-request timeout in seconds.

    Returns:
        The ``result`` list from the search response, or ``[]`` on any
        failure or when there is nothing to search for.
    """
    # Nothing meaningful to embed/search: skip both round-trips.
    if not query or not query.strip() or limit <= 0:
        return []

    try:
        async with httpx.AsyncClient() as client:
            embed_resp = await client.post(
                embed_url,
                json={"model": embed_model, "prompt": query},
                timeout=timeout,
            )
            if embed_resp.status_code != 200:
                log.warning(
                    "RAG embedding request failed: HTTP %s", embed_resp.status_code
                )
                return []

            vector = embed_resp.json().get("embedding")
            if not vector:
                log.warning("RAG embedding response contained no vector")
                return []

            search_resp = await client.post(
                search_url,
                json={"vector": vector, "limit": limit, "with_payload": True},
                timeout=timeout,
            )
            if search_resp.status_code != 200:
                log.warning(
                    "RAG vector search failed: HTTP %s", search_resp.status_code
                )
                return []

            # "result" may be missing or null; always hand back a list.
            hits = search_resp.json().get("result") or []
            return hits if isinstance(hits, list) else []
    except Exception as e:
        # Deliberately broad: RAG is optional and must never break the chat
        # request that called it. The failure is logged, not swallowed silently.
        log.warning("RAG query error: %s", e)
        return []
async def build_system_prompt(db, extra_prompt="", user_message=""):
"""Build the full system prompt: profile + memories + preset."""
parts = []
settings = {
@@ -2030,6 +2062,17 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
parts.append("## Relevant Context from Memory\n" + "\n".join(memory_lines))
log.debug(f"Injected {len(memories)} memories into context")
if user_message:
try:
rag_results = await query_rag(user_message)
if rag_results:
rag_lines = [r["payload"]["text"] for r in rag_results if r["score"] > 0.25]
if rag_lines:
parts.append("## Retrieved Context\n" + "\n\n---\n\n".join(rag_lines))
log.warning(f"RAG injected {len(rag_lines)} chunks into context")
except Exception as e:
log.warning(f"RAG injection error: {e}")
if settings.get("skills_enabled", "true") == "true":
active_skills = [s for s in list_skills_with_state(db) if s["enabled"]]
if active_skills:
@@ -2041,6 +2084,42 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
return "\n\n---\n\n".join(parts) if parts else ""
def parse_llama_stream_chunk(line: str) -> tuple[str | None, bool, dict]:
"""Parse OpenAI-compatible SSE chunk. Returns (token, is_done, stats)."""
if line.startswith("data: "):
line = line[6:]
if line.strip() == "[DONE]":
return None, True, {}
try:
chunk = json.loads(line)
# OpenAI format
choices = chunk.get("choices", [])
if choices:
delta = choices[0].get("delta", {})
token = delta.get("content")
finish = choices[0].get("finish_reason")
stats = {}
if finish == "stop":
usage = chunk.get("usage", {})
stats["tokens_per_sec"] = usage.get("tokens_per_second", 0.0)
return token, finish == "stop", stats
# Ollama format fallback
if "message" in chunk and "content" in chunk["message"]:
token = chunk["message"]["content"]
done = chunk.get("done", False)
stats = {}
if done:
eval_count = chunk.get("eval_count", 0)
eval_duration = chunk.get("eval_duration", 0)
stats["tokens_per_sec"] = (
(eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0
)
return token, done, stats
except json.JSONDecodeError:
pass
return None, False, {}
@app.post("/api/chat")
async def chat(request: Request):
body = await read_json_body(request, BODY_LIMIT_CHAT_BYTES)
@@ -2086,7 +2165,7 @@ async def chat(request: Request):
"SELECT role, content FROM messages WHERE conversation_id = ? ORDER BY id ASC",
(conv_id,),
).fetchall()
system_prompt = build_system_prompt(db, preset_prompt, user_message)
system_prompt = await build_system_prompt(db, preset_prompt, user_message)
db.close()
messages = []
@@ -2099,7 +2178,6 @@ async def chat(request: Request):
"model": model,
"messages": messages,
"stream": True,
"logprobs": True,
}
async def stream_response():
@@ -2114,31 +2192,18 @@ async def chat(request: Request):
try:
async with client.stream(
"POST",
f"{OLLAMA_BASE}/api/chat",
f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json=ollama_payload,
timeout=httpx.Timeout(300.0, connect=10.0),
) as resp:
async for line in resp.aiter_lines():
if line.strip():
try:
chunk = json.loads(line)
if "message" in chunk and "content" in chunk["message"]:
token = chunk["message"]["content"]
token, done, stats = parse_llama_stream_chunk(line)
if token:
full_response.append(token)
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
if "logprobs" in chunk and chunk["logprobs"]:
all_logprobs.extend(chunk["logprobs"])
if chunk.get("done"):
eval_count = chunk.get("eval_count", 0)
eval_duration = chunk.get("eval_duration", 0)
tokens_per_sec = (
(eval_count / (eval_duration / 1e9))
if eval_duration > 0
else 0
)
break
except json.JSONDecodeError:
pass
if done:
tokens_per_sec = stats.get("tokens_per_sec", 0.0)
assistant_msg = "".join(full_response)
perplexity = calculate_perplexity(all_logprobs) if all_logprobs else 0.0
@@ -2176,7 +2241,7 @@ async def chat(request: Request):
augmented_response = []
async with client.stream(
"POST",
f"{OLLAMA_BASE}/api/chat",
f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json={
"model": model,
"messages": augmented_messages,
@@ -2186,19 +2251,11 @@ async def chat(request: Request):
) as resp2:
async for line in resp2.aiter_lines():
if line.strip():
try:
chunk = json.loads(line)
if (
"message" in chunk
and "content" in chunk["message"]
):
augmented_response.append(
chunk["message"]["content"]
)
if chunk.get("done"):
token2, done2, _ = parse_llama_stream_chunk(line)
if token2:
augmented_response.append(token2)
if done2:
break
except json.JSONDecodeError:
pass
raw_response = "".join(augmented_response) or assistant_msg
cleaned_response = clean_hedging(raw_response)
@@ -2251,6 +2308,8 @@ async def chat(request: Request):
yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'perplexity': round(perplexity, 2), 'tokens_per_sec': round(tokens_per_sec, 1)})}\n\n"
except httpx.RemoteProtocolError:
pass # llama-server closes connection after [DONE] — normal
except httpx.ConnectError:
yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n"
except Exception as e:

2285
app.py.bak Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 322 KiB

After

Width:  |  Height:  |  Size: 219 KiB

View File

@@ -4,7 +4,7 @@ Last updated: 2026-04-27
Owner: Gramps + Copilot
Scope: issues, bugs, security exposures, and feature enhancements.
Total identified items: 26
Total identified items: 27
## Priority Definitions
- P0: Critical risk or data-loss/security exposure; do first.
@@ -60,22 +60,23 @@ Total identified items: 26
12. Add unit/integration tests for: remember/forget parsing, refusal detection, search fallback, SSE done/error shape.
13. Add conversation title sanitization and length constraints.
14. Ensure default preset semantics are correct (currently all seeded presets are marked default).
15. Add preflight validation for required model/preset selection and block send with clear user guidance instead of timing out.
### P2 Important Features
15. Skills system: load markdown skill files with YAML frontmatter from skills directory.
16. Skills registry API: list/enable/disable skills and expose active skills to UI.
17. Inject active skill instructions into system prompt with bounded token budget.
18. Tool execution guardrails: allowlist, confirmation mode, and execution logs.
19. Heartbeat scheduler (cron/systemd timer) for daily check-ins.
20. Heartbeat endpoint for generated briefings and anomaly summaries.
21. Model info UI panel (description, updated date, best-use purpose).
22. Default model selection improvements and persistence validation.
23. Hidden model list support (exclude models from dropdown).
24. Model update action from UI (trigger controlled model pull).
16. Skills system: load markdown skill files with YAML frontmatter from skills directory.
17. Skills registry API: list/enable/disable skills and expose active skills to UI.
18. Inject active skill instructions into system prompt with bounded token budget.
19. Tool execution guardrails: allowlist, confirmation mode, and execution logs.
20. Heartbeat scheduler (cron/systemd timer) for daily check-ins.
21. Heartbeat endpoint for generated briefings and anomaly summaries.
22. Model info UI panel (description, updated date, best-use purpose).
23. Default model selection improvements and persistence validation.
24. Hidden model list support (exclude models from dropdown).
25. Model update action from UI (trigger controlled model pull).
### P3 Nice to Have
25. Conversation search/filter and export tooling.
26. Keyboard shortcuts, retry button, and source-link polish.
26. Conversation search/filter and export tooling.
27. Keyboard shortcuts, retry button, and source-link polish.
## Maintenance Rules
- Keep this file as the single source of truth.

View File

@@ -1,4 +1,4 @@
# ⚡ JarvisChat v1.7.6
# ⚡ JarvisChat v1.7.8
![screenshot](docs/images/screenshot.png)
@@ -74,7 +74,7 @@ Canonical backlog: [docs/wiki/current-wip.md](docs/wiki/current-wip.md)
Scope boundary: local-first (same-host Ollama), optional RFC1918 LAN endpoints, no public Internet AI endpoints by default.
Total identified items: 26
Total identified items: 27
Top 10 (brief):
@@ -113,6 +113,7 @@ Implementation status: complete (guest session by default + admin unlock + admin
16. Hide/remove model from list — exclude models from dropdown
17. Update model function — trigger `ollama pull` for selected model from UI
18. Add mouseover tooltip to SEND button
19. Add preflight validation for required model/preset selection and show a clear warning before send to prevent avoidable timeout loops
## File Structure