Compare commits

...

3 Commits

2 changed files with 2398 additions and 54 deletions

155
app.py
View File

@@ -56,8 +56,9 @@ syslog_handler.setFormatter(
log.addHandler(syslog_handler) log.addHandler(syslog_handler)
# --- Configuration --- # --- Configuration ---
VERSION = "1.7.8" VERSION = "v1.8.0"
OLLAMA_BASE = "http://localhost:11434" OLLAMA_BASE = os.environ.get("OLLAMA_BASE", "http://localhost:11434")
LLAMA_SERVER_BASE = os.environ.get("LLAMA_SERVER_BASE", "http://192.168.50.108:8081")
SEARXNG_BASE = "http://localhost:8888" SEARXNG_BASE = "http://localhost:8888"
BASE_DIR = Path(__file__).parent BASE_DIR = Path(__file__).parent
DB_PATH = BASE_DIR / "jarvischat.db" DB_PATH = BASE_DIR / "jarvischat.db"
@@ -1038,7 +1039,7 @@ def get_gpu_stats() -> dict:
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
log.info(f"JarvisChat v{VERSION} starting up") log.info(f"JarvisChat v{VERSION} starting up")
log.info(f"Ollama: {OLLAMA_BASE}, SearXNG: {SEARXNG_BASE}") log.info(f"Ollama: {OLLAMA_BASE}, llama-server: {LLAMA_SERVER_BASE}, SearXNG: {SEARXNG_BASE}")
init_db() init_db()
log.info(f"Memory system: {get_memory_count()} memories loaded") log.info(f"Memory system: {get_memory_count()} memories loaded")
yield yield
@@ -1480,14 +1481,24 @@ async def index(request: Request):
return templates.TemplateResponse(request, "index.html", {"version": VERSION}) return templates.TemplateResponse(request, "index.html", {"version": VERSION})
#@app.get("/api/models")
#async def list_models():
# async with httpx.AsyncClient() as client:
# try:
# resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10)
# return resp.json()
# except httpx.ConnectError:
# raise HTTPException(status_code=502, detail="Cannot connect to Ollama.")
@app.get("/api/models") @app.get("/api/models")
async def list_models(): async def list_models():
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
try: try:
resp = await client.get(f"{OLLAMA_BASE}/api/tags", timeout=10) resp = await client.get(f"{OLLAMA_BASE}/v1/models", timeout=10)
return resp.json() data = resp.json()
models = [{"name": m["id"], "model": m["id"]} for m in data.get("data", [])]
return {"models": models}
except httpx.ConnectError: except httpx.ConnectError:
raise HTTPException(status_code=502, detail="Cannot connect to Ollama.") raise HTTPException(status_code=502, detail="Cannot connect to llama-server.")
@app.get("/api/ps") @app.get("/api/ps")
@@ -1956,22 +1967,18 @@ async def explicit_search(request: Request):
try: try:
async with client.stream( async with client.stream(
"POST", "POST",
f"{OLLAMA_BASE}/api/chat", f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json={"model": model, "messages": messages, "stream": True}, json={"model": model, "messages": messages, "stream": True},
timeout=httpx.Timeout(300.0, connect=10.0), timeout=httpx.Timeout(300.0, connect=10.0),
) as resp: ) as resp:
async for line in resp.aiter_lines(): async for line in resp.aiter_lines():
if line.strip(): if line.strip():
try: token, done, _ = parse_llama_stream_chunk(line)
chunk = json.loads(line) if token:
if "message" in chunk and "content" in chunk["message"]:
token = chunk["message"]["content"]
full_response.append(token) full_response.append(token)
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n" yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
if chunk.get("done"): if done:
break break
except json.JSONDecodeError:
pass
except Exception as e: except Exception as e:
incident_key = log_incident( incident_key = log_incident(
"search_summarization_stream", "search_summarization_stream",
@@ -2010,7 +2017,32 @@ async def explicit_search(request: Request):
# ============================================================================= # =============================================================================
def build_system_prompt(db, extra_prompt="", user_message=""):
async def query_rag(query: str, limit: int = 3) -> list[dict]:
"""Query Qdrant for semantically relevant chunks."""
try:
async with httpx.AsyncClient() as client:
embed_resp = await client.post(
"http://192.168.50.108:11434/api/embeddings",
json={"model": "mxbai-embed-large", "prompt": query},
timeout=10.0,
)
if embed_resp.status_code != 200:
return []
vector = embed_resp.json()["embedding"]
search_resp = await client.post(
"http://192.168.50.108:6333/collections/jarvis_rag/points/search",
json={"vector": vector, "limit": limit, "with_payload": True},
timeout=10.0,
)
if search_resp.status_code != 200:
return []
return search_resp.json().get("result", [])
except Exception as e:
log.warning(f"RAG query error: {e}")
return []
async def build_system_prompt(db, extra_prompt="", user_message=""):
"""Build the full system prompt: profile + memories + preset.""" """Build the full system prompt: profile + memories + preset."""
parts = [] parts = []
settings = { settings = {
@@ -2030,6 +2062,17 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
parts.append("## Relevant Context from Memory\n" + "\n".join(memory_lines)) parts.append("## Relevant Context from Memory\n" + "\n".join(memory_lines))
log.debug(f"Injected {len(memories)} memories into context") log.debug(f"Injected {len(memories)} memories into context")
if user_message:
try:
rag_results = await query_rag(user_message)
if rag_results:
rag_lines = [r["payload"]["text"] for r in rag_results if r["score"] > 0.25]
if rag_lines:
parts.append("## Retrieved Context\n" + "\n\n---\n\n".join(rag_lines))
log.warning(f"RAG injected {len(rag_lines)} chunks into context")
except Exception as e:
log.warning(f"RAG injection error: {e}")
if settings.get("skills_enabled", "true") == "true": if settings.get("skills_enabled", "true") == "true":
active_skills = [s for s in list_skills_with_state(db) if s["enabled"]] active_skills = [s for s in list_skills_with_state(db) if s["enabled"]]
if active_skills: if active_skills:
@@ -2041,6 +2084,42 @@ def build_system_prompt(db, extra_prompt="", user_message=""):
return "\n\n---\n\n".join(parts) if parts else "" return "\n\n---\n\n".join(parts) if parts else ""
def parse_llama_stream_chunk(line: str) -> tuple[str | None, bool, dict]:
"""Parse OpenAI-compatible SSE chunk. Returns (token, is_done, stats)."""
if line.startswith("data: "):
line = line[6:]
if line.strip() == "[DONE]":
return None, True, {}
try:
chunk = json.loads(line)
# OpenAI format
choices = chunk.get("choices", [])
if choices:
delta = choices[0].get("delta", {})
token = delta.get("content")
finish = choices[0].get("finish_reason")
stats = {}
if finish == "stop":
usage = chunk.get("usage", {})
stats["tokens_per_sec"] = usage.get("tokens_per_second", 0.0)
return token, finish == "stop", stats
# Ollama format fallback
if "message" in chunk and "content" in chunk["message"]:
token = chunk["message"]["content"]
done = chunk.get("done", False)
stats = {}
if done:
eval_count = chunk.get("eval_count", 0)
eval_duration = chunk.get("eval_duration", 0)
stats["tokens_per_sec"] = (
(eval_count / (eval_duration / 1e9)) if eval_duration > 0 else 0
)
return token, done, stats
except json.JSONDecodeError:
pass
return None, False, {}
@app.post("/api/chat") @app.post("/api/chat")
async def chat(request: Request): async def chat(request: Request):
body = await read_json_body(request, BODY_LIMIT_CHAT_BYTES) body = await read_json_body(request, BODY_LIMIT_CHAT_BYTES)
@@ -2086,7 +2165,7 @@ async def chat(request: Request):
"SELECT role, content FROM messages WHERE conversation_id = ? ORDER BY id ASC", "SELECT role, content FROM messages WHERE conversation_id = ? ORDER BY id ASC",
(conv_id,), (conv_id,),
).fetchall() ).fetchall()
system_prompt = build_system_prompt(db, preset_prompt, user_message) system_prompt = await build_system_prompt(db, preset_prompt, user_message)
db.close() db.close()
messages = [] messages = []
@@ -2099,7 +2178,6 @@ async def chat(request: Request):
"model": model, "model": model,
"messages": messages, "messages": messages,
"stream": True, "stream": True,
"logprobs": True,
} }
async def stream_response(): async def stream_response():
@@ -2114,31 +2192,18 @@ async def chat(request: Request):
try: try:
async with client.stream( async with client.stream(
"POST", "POST",
f"{OLLAMA_BASE}/api/chat", f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json=ollama_payload, json=ollama_payload,
timeout=httpx.Timeout(300.0, connect=10.0), timeout=httpx.Timeout(300.0, connect=10.0),
) as resp: ) as resp:
async for line in resp.aiter_lines(): async for line in resp.aiter_lines():
if line.strip(): if line.strip():
try: token, done, stats = parse_llama_stream_chunk(line)
chunk = json.loads(line) if token:
if "message" in chunk and "content" in chunk["message"]:
token = chunk["message"]["content"]
full_response.append(token) full_response.append(token)
yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n" yield f"data: {json.dumps({'token': token, 'conversation_id': conv_id})}\n\n"
if "logprobs" in chunk and chunk["logprobs"]: if done:
all_logprobs.extend(chunk["logprobs"]) tokens_per_sec = stats.get("tokens_per_sec", 0.0)
if chunk.get("done"):
eval_count = chunk.get("eval_count", 0)
eval_duration = chunk.get("eval_duration", 0)
tokens_per_sec = (
(eval_count / (eval_duration / 1e9))
if eval_duration > 0
else 0
)
break
except json.JSONDecodeError:
pass
assistant_msg = "".join(full_response) assistant_msg = "".join(full_response)
perplexity = calculate_perplexity(all_logprobs) if all_logprobs else 0.0 perplexity = calculate_perplexity(all_logprobs) if all_logprobs else 0.0
@@ -2176,7 +2241,7 @@ async def chat(request: Request):
augmented_response = [] augmented_response = []
async with client.stream( async with client.stream(
"POST", "POST",
f"{OLLAMA_BASE}/api/chat", f"{LLAMA_SERVER_BASE}/v1/chat/completions",
json={ json={
"model": model, "model": model,
"messages": augmented_messages, "messages": augmented_messages,
@@ -2186,19 +2251,11 @@ async def chat(request: Request):
) as resp2: ) as resp2:
async for line in resp2.aiter_lines(): async for line in resp2.aiter_lines():
if line.strip(): if line.strip():
try: token2, done2, _ = parse_llama_stream_chunk(line)
chunk = json.loads(line) if token2:
if ( augmented_response.append(token2)
"message" in chunk if done2:
and "content" in chunk["message"]
):
augmented_response.append(
chunk["message"]["content"]
)
if chunk.get("done"):
break break
except json.JSONDecodeError:
pass
raw_response = "".join(augmented_response) or assistant_msg raw_response = "".join(augmented_response) or assistant_msg
cleaned_response = clean_hedging(raw_response) cleaned_response = clean_hedging(raw_response)
@@ -2251,6 +2308,8 @@ async def chat(request: Request):
yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'perplexity': round(perplexity, 2), 'tokens_per_sec': round(tokens_per_sec, 1)})}\n\n" yield f"data: {json.dumps({'done': True, 'conversation_id': conv_id, 'perplexity': round(perplexity, 2), 'tokens_per_sec': round(tokens_per_sec, 1)})}\n\n"
except httpx.RemoteProtocolError:
pass # llama-server closes connection after [DONE] — normal
except httpx.ConnectError: except httpx.ConnectError:
yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n" yield f"data: {json.dumps({'error': 'Cannot connect to Ollama. Is it running?'})}\n\n"
except Exception as e: except Exception as e:

2285
app.py.bak Normal file

File diff suppressed because it is too large Load Diff