feat: update logo to jarvisChat-logo-1024.png, update static reference in index.html
This commit is contained in:
151
app.py.bak
151
app.py.bak
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
JarvisChat - Lightweight Ollama Coding Companion
|
||||
JarvisChat - Privacy-First Homelab Developer Platform
|
||||
A minimal replacement for Open-WebUI that actually runs on Python 3.13
|
||||
Talks to Ollama API on localhost:11434
|
||||
|
||||
@@ -56,8 +56,9 @@ syslog_handler.setFormatter(
|
||||
log.addHandler(syslog_handler)
|
||||
|
||||
# --- Configuration ---
|
||||
VERSION = "1.7.6"
|
||||
VERSION = "v1.8.0"
|
||||
OLLAMA_BASE = os.environ.get("OLLAMA_BASE", "http://localhost:11434")
|
||||
LLAMA_SERVER_BASE = os.environ.get("LLAMA_SERVER_BASE", "http://192.168.50.108:8081")
|
||||
SEARXNG_BASE = "http://localhost:8888"
|
||||
BASE_DIR = Path(__file__).parent
|
||||
DB_PATH = BASE_DIR / "jarvischat.db"
|
||||
@@ -1038,7 +1039,7 @@ def get_gpu_stats() -> dict:
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
log.info(f"JarvisChat v{VERSION} starting up")
|
||||
log.info(f"Ollama: {OLLAMA_BASE}, SearXNG: {SEARXNG_BASE}")
|
||||
log.info(f"Ollama: {OLLAMA_BASE}, llama-server: {LLAMA_SERVER_BASE}, SearXNG: {SEARXNG_BASE}")
|
||||
init_db()
|
||||
log.info(f"Memory system: {get_memory_count()} memories loaded")
|
||||
yield
|
||||
@@ -1966,22 +1967,18 @@ async def explicit_search(request: Request):
|
||||
try:
|
||||
async with client.stream(
|
||||
"POST",
|
||||
f"{OLLAMA_BASE}/api/chat",
|
||||
f"{LLAMA_SERVER_BASE}/v1/chat/completions",
|
||||
json={"model": model, "messages": messages, "stream": True},
|
||||
timeout=httpx.Timeout(300.0, connect=10.0),
|
||||
) as resp:
|
||||
async for line in resp.aiter_lines():
|
||||
if line.strip():
|
||||
try:
|
||||
chunk = json.loads(line)
|
||||
if "message" in chunk and "content" in chunk["message"]:
|
||||
token = chunk["message"]["content"]
|
||||
full_response.append(token)
|
||||
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
|
||||
if chunk.get("done"):
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
token, done, _ = parse_llama_stream_chunk(line)
|
||||
if token:
|
||||
full_response.append(token)
|
||||
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
|
||||
if done:
|
||||
break
|
||||
except Exception as e:
|
||||
incident_key = log_incident(
|
||||
"search_summarization_stream",
|
||||
@@ -2020,7 +2017,32 @@ async def explicit_search(request: Request):
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def build_system_prompt(db, extra_prompt="", user_message=""):
|
||||
|
||||
async def query_rag(query: str, limit: int = 3) -> list[dict]:
|
||||
"""Query Qdrant for semantically relevant chunks."""
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
embed_resp = await client.post(
|
||||
"http://192.168.50.108:11434/api/embeddings",
|
||||
json={"model": "mxbai-embed-large", "prompt": query},
|
||||
timeout=10.0,
|
||||
)
|
||||
if embed_resp.status_code != 200:
|
||||
return []
|
||||
vector = embed_resp.json()["embedding"]
|
||||
search_resp = await client.post(
|
||||
"http://192.168.50.108:6333/collections/jarvis_rag/points/search",
|
||||
json={"vector": vector, "limit": limit, "with_payload": True},
|
||||
timeout=10.0,
|
||||
)
|
||||
if search_resp.status_code != 200:
|
||||
return []
|
||||
return search_resp.json().get("result", [])
|
||||
except Exception as e:
|
||||
log.warning(f"RAG query error: {e}")
|
||||
return []
|
||||
|
||||
async def build_system_prompt(db, extra_prompt="", user_message=""):
|
||||
"""Build the full system prompt: profile + memories + preset."""
|
||||
parts = []
|
||||
settings = {
|
||||
@@ -2040,6 +2062,17 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
|
||||
parts.append("## Relevant Context from Memory\n" + "\n".join(memory_lines))
|
||||
log.debug(f"Injected {len(memories)} memories into context")
|
||||
|
||||
if user_message:
|
||||
try:
|
||||
rag_results = await query_rag(user_message)
|
||||
if rag_results:
|
||||
rag_lines = [r["payload"]["text"] for r in rag_results if r["score"] > 0.25]
|
||||
if rag_lines:
|
||||
parts.append("## Retrieved Context\n" + "\n\n---\n\n".join(rag_lines))
|
||||
log.warning(f"RAG injected {len(rag_lines)} chunks into context")
|
||||
except Exception as e:
|
||||
log.warning(f"RAG injection error: {e}")
|
||||
|
||||
if settings.get("skills_enabled", "true") == "true":
|
||||
active_skills = [s for s in list_skills_with_state(db) if s["enabled"]]
|
||||
if active_skills:
|
||||
@@ -2051,6 +2084,42 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
|
||||
return "\n\n---\n\n".join(parts) if parts else ""
|
||||
|
||||
|
||||
def parse_llama_stream_chunk(line: str) -> tuple[str | None, bool, dict]:
|
||||
"""Parse OpenAI-compatible SSE chunk. Returns (token, is_done, stats)."""
|
||||
if line.startswith("data: "):
|
||||
line = line[6:]
|
||||
if line.strip() == "[DONE]":
|
||||
return None, True, {}
|
||||
try:
|
||||
chunk = json.loads(line)
|
||||
# OpenAI format
|
||||
choices = chunk.get("choices", [])
|
||||
if choices:
|
||||
delta = choices[0].get("delta", {})
|
||||
token = delta.get("content")
|
||||
finish = choices[0].get("finish_reason")
|
||||
stats = {}
|
||||
if finish == "stop":
|
||||
usage = chunk.get("usage", {})
|
||||
stats["tokens_per_sec"] = usage.get("tokens_per_second", 0.0)
|
||||
return token, finish == "stop", stats
|
||||
# Ollama format fallback
|
||||
if "message" in chunk and "content" in chunk["message"]:
|
||||
token = chunk["message"]["content"]
|
||||
done = chunk.get("done", False)
|
||||
stats = {}
|
||||
if done:
|
||||
eval_count = chunk.get("eval_count", 0)
|
||||
eval_duration = chunk.get("eval_duration", 0)
|
||||
stats["tokens_per_sec"] = (
|
||||
(eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0
|
||||
)
|
||||
return token, done, stats
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return None, False, {}
|
||||
|
||||
|
||||
@app.post("/api/chat")
|
||||
async def chat(request: Request):
|
||||
body = await read_json_body(request, BODY_LIMIT_CHAT_BYTES)
|
||||
@@ -2096,7 +2165,7 @@ async def chat(request: Request):
|
||||
"SELECT role, content FROM messages WHERE conversation_id = ? ORDER BY id ASC",
|
||||
(conv_id,),
|
||||
).fetchall()
|
||||
system_prompt = build_system_prompt(db, preset_prompt, user_message)
|
||||
system_prompt = await build_system_prompt(db, preset_prompt, user_message)
|
||||
db.close()
|
||||
|
||||
messages = []
|
||||
@@ -2109,7 +2178,6 @@ async def chat(request: Request):
|
||||
"model": model,
|
||||
"messages": messages,
|
||||
"stream": True,
|
||||
"logprobs": True,
|
||||
}
|
||||
|
||||
async def stream_response():
|
||||
@@ -2124,31 +2192,18 @@ async def chat(request: Request):
|
||||
try:
|
||||
async with client.stream(
|
||||
"POST",
|
||||
f"{OLLAMA_BASE}/api/chat",
|
||||
f"{LLAMA_SERVER_BASE}/v1/chat/completions",
|
||||
json=ollama_payload,
|
||||
timeout=httpx.Timeout(300.0, connect=10.0),
|
||||
) as resp:
|
||||
async for line in resp.aiter_lines():
|
||||
if line.strip():
|
||||
try:
|
||||
chunk = json.loads(line)
|
||||
if "message" in chunk and "content" in chunk["message"]:
|
||||
token = chunk["message"]["content"]
|
||||
full_response.append(token)
|
||||
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
|
||||
if "logprobs" in chunk and chunk["logprobs"]:
|
||||
all_logprobs.extend(chunk["logprobs"])
|
||||
if chunk.get("done"):
|
||||
eval_count = chunk.get("eval_count", 0)
|
||||
eval_duration = chunk.get("eval_duration", 0)
|
||||
tokens_per_sec = (
|
||||
(eval_count / (eval_duration / 1e9))
|
||||
if eval_duration > 0
|
||||
else 0
|
||||
)
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
token, done, stats = parse_llama_stream_chunk(line)
|
||||
if token:
|
||||
full_response.append(token)
|
||||
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
|
||||
if done:
|
||||
tokens_per_sec = stats.get("tokens_per_sec", 0.0)
|
||||
|
||||
assistant_msg = "".join(full_response)
|
||||
perplexity = calculate_perplexity(all_logprobs) if all_logprobs else 0.0
|
||||
@@ -2186,7 +2241,7 @@ async def chat(request: Request):
|
||||
augmented_response = []
|
||||
async with client.stream(
|
||||
"POST",
|
||||
f"{OLLAMA_BASE}/api/chat",
|
||||
f"{LLAMA_SERVER_BASE}/v1/chat/completions",
|
||||
json={
|
||||
"model": model,
|
||||
"messages": augmented_messages,
|
||||
@@ -2196,19 +2251,11 @@ async def chat(request: Request):
|
||||
) as resp2:
|
||||
async for line in resp2.aiter_lines():
|
||||
if line.strip():
|
||||
try:
|
||||
chunk = json.loads(line)
|
||||
if (
|
||||
"message" in chunk
|
||||
and "content" in chunk["message"]
|
||||
):
|
||||
augmented_response.append(
|
||||
chunk["message"]["content"]
|
||||
)
|
||||
if chunk.get("done"):
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
token2, done2, _ = parse_llama_stream_chunk(line)
|
||||
if token2:
|
||||
augmented_response.append(token2)
|
||||
if done2:
|
||||
break
|
||||
|
||||
raw_response = "".join(augmented_response) or assistant_msg
|
||||
cleaned_response = clean_hedging(raw_response)
|
||||
@@ -2261,6 +2308,8 @@ async def chat(request: Request):
|
||||
|
||||
yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'perplexity': round(perplexity, 2), 'tokens_per_sec': round(tokens_per_sec, 1)})}\n\n"
|
||||
|
||||
except httpx.RemoteProtocolError:
|
||||
pass # llama-server closes connection after [DONE] — normal
|
||||
except httpx.ConnectError:
|
||||
yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n"
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user