#!/usr/bin/env python3 """llama.cpp monitor - lightweight diagnostics dashboard.""" import http.server import json import os import re import subprocess import time import urllib.error import urllib.request LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://localhost:8080") MONITOR_PORT = int(os.environ.get("MONITOR_PORT", "80")) def get_gpu_stats(): try: result = subprocess.run( ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,name", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5 ) if result.returncode == 0: parts = [p.strip() for p in result.stdout.strip().split(",")] return { "available": True, "utilization": float(parts[0]), "memory_used": float(parts[1]), "memory_total": float(parts[2]), "temperature": float(parts[3]), "power_draw": float(parts[4]), "power_limit": float(parts[5]), "fan_speed": float(parts[6]), "name": parts[7], } except Exception: pass return {"available": False} def fetch_json(path): try: req = urllib.request.urlopen(f"{LLAMA_SERVER_URL}{path}", timeout=3) return json.loads(req.read()) except Exception: return None def fetch_text(path): try: req = urllib.request.urlopen(f"{LLAMA_SERVER_URL}{path}", timeout=3) return req.read().decode() except Exception: return None def parse_prometheus(text): if not text: return {} metrics = {} for line in text.strip().split("\n"): if line.startswith("#"): continue m = re.match(r'^([\w:]+)(?:\{[^}]*\})?\s+([\d.eE+-]+)', line) if m: key, val = m.group(1), m.group(2) try: metrics[key] = float(val) except ValueError: pass return metrics def get_all_stats(): gpu = get_gpu_stats() health = fetch_json("/health") slots = fetch_json("/slots") model = fetch_json("/v1/models") metrics = parse_prometheus(fetch_text("/metrics")) return { "timestamp": time.time(), "gpu": gpu, "health": health, "slots": slots, "model": model, "metrics": metrics, "llama_url": LLAMA_SERVER_URL, } DASHBOARD_HTML = r"""