#!/usr/bin/env python3
"""llama.cpp monitor - lightweight diagnostics dashboard.

Serves a small HTML dashboard plus a JSON endpoint that aggregates GPU
statistics (via ``nvidia-smi``) and data fetched from a llama.cpp server
(``/health``, ``/slots``, ``/v1/models`` and ``/metrics``).

Configuration is read from the environment:

* ``LLAMA_SERVER_URL`` - base URL of the llama.cpp server.
* ``MONITOR_PORT`` - TCP port this monitor listens on.
"""

import http.server
import json
import os
import re
import subprocess
import time
import urllib.error
import urllib.request

LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://localhost:8080")
MONITOR_PORT = int(os.environ.get("MONITOR_PORT", "80"))

# Fields requested from nvidia-smi, in the order they appear in each CSV row.
_GPU_QUERY_FIELDS = (
    "utilization.gpu",
    "memory.used",
    "memory.total",
    "temperature.gpu",
    "power.draw",
    "power.limit",
    "fan.speed",
    "name",
)

# Result keys for the numeric columns (every column except the trailing name).
_GPU_NUMERIC_KEYS = (
    "utilization",
    "memory_used",
    "memory_total",
    "temperature",
    "power_draw",
    "power_limit",
    "fan_speed",
)

# One Prometheus sample line: metric name, optional {labels}, numeric value.
_PROM_SAMPLE_RE = re.compile(r'^([\w:]+)(?:\{[^}]*\})?\s+([\d.eE+-]+)')


def parse_gpu_stats(text):
    """Parse ``nvidia-smi`` CSV output into a stats dict for the first GPU.

    Args:
        text: Output produced with ``--format=csv,noheader,nounits`` for the
            fields in ``_GPU_QUERY_FIELDS`` (one row per GPU).

    Returns:
        ``{"available": True, ...}`` with float values for the numeric
        columns and the GPU ``name``, or ``{"available": False}`` when the
        first row is missing, has too few columns, or contains a non-numeric
        value (for example ``[N/A]``) in a numeric column.
    """
    rows = text.strip().splitlines()
    if not rows:
        return {"available": False}

    # Only the first row is used: splitting the whole output on commas would
    # glue the first GPU's name to the next GPU's utilisation on multi-GPU
    # hosts. maxsplit keeps a name that itself contains commas intact.
    parts = [p.strip() for p in rows[0].split(",", len(_GPU_NUMERIC_KEYS))]
    if len(parts) != len(_GPU_NUMERIC_KEYS) + 1:
        return {"available": False}

    try:
        numbers = [float(p) for p in parts[:-1]]
    except ValueError:
        return {"available": False}

    stats = {"available": True}
    stats.update(zip(_GPU_NUMERIC_KEYS, numbers))
    stats["name"] = parts[-1]
    return stats


def get_gpu_stats():
    """Query ``nvidia-smi`` and return stats for the first GPU.

    Returns:
        The dict produced by :func:`parse_gpu_stats`, or
        ``{"available": False}`` when ``nvidia-smi`` cannot be run, times out
        (5 seconds), or exits with a non-zero status.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=" + ",".join(_GPU_QUERY_FIELDS),
        "--format=csv,noheader,nounits",
    ]
    try:
        result = subprocess.run(
            command, capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Missing binary, timeout, or undecodable output: best-effort only.
        return {"available": False}

    if result.returncode != 0:
        return {"available": False}
    return parse_gpu_stats(result.stdout)


def _fetch_bytes(path):
    """Return the raw response body for *path* on the llama server.

    The response is used as a context manager so the underlying connection
    is always closed, including when reading fails.
    """
    with urllib.request.urlopen(f"{LLAMA_SERVER_URL}{path}", timeout=3) as resp:
        return resp.read()


def fetch_json(path):
    """Fetch *path* from the llama server and decode it as JSON.

    Returns:
        The decoded JSON value, or ``None`` on any failure (connection
        error, HTTP error status, timeout, invalid JSON).
    """
    try:
        return json.loads(_fetch_bytes(path))
    except Exception:
        # Deliberate best-effort: the dashboard shows missing data instead
        # of failing the whole stats request.
        return None


def fetch_text(path):
    """Fetch *path* from the llama server and decode it as text.

    Returns:
        The decoded body, or ``None`` on any failure (connection error,
        HTTP error status, timeout, undecodable bytes).
    """
    try:
        return _fetch_bytes(path).decode()
    except Exception:
        # Deliberate best-effort, see fetch_json.
        return None


def parse_prometheus(text):
    """Parse Prometheus text exposition format into ``{name: float}``.

    Comment lines (starting with ``#``) and lines that do not look like a
    numeric sample are skipped. Labels are discarded, so when several
    samples share a metric name the last one wins. Values that are not
    plain numbers (``NaN``, ``+Inf``) are skipped, which also keeps the
    result serialisable as strict JSON.

    Args:
        text: The metrics payload, or ``None``/empty.

    Returns:
        Mapping of metric name to value; empty when *text* is falsy.
    """
    if not text:
        return {}

    metrics = {}
    for line in text.strip().split("\n"):
        if line.startswith("#"):
            continue
        match = _PROM_SAMPLE_RE.match(line)
        if not match:
            continue
        name, raw_value = match.groups()
        try:
            metrics[name] = float(raw_value)
        except ValueError:
            # The value pattern is permissive (e.g. a lone "+" or "e").
            pass
    return metrics


def get_all_stats():
    """Collect GPU and llama server data into one JSON-serialisable dict."""
    return {
        "timestamp": time.time(),
        "gpu": get_gpu_stats(),
        "health": fetch_json("/health"),
        "slots": fetch_json("/slots"),
        "model": fetch_json("/v1/models"),
        "metrics": parse_prometheus(fetch_text("/metrics")),
        "llama_url": LLAMA_SERVER_URL,
    }


# NOTE(review): the text below contains no HTML tags or script even though it
# is served as text/html; the markup may have been lost before this file
# reached review. It is kept exactly as found; confirm against the original.
DASHBOARD_HTML = r""" llama.cpp Monitor

llama.cpp Monitor ...

GPU Utilization

Load --%

GPU Memory

Used --MiB / --MiB

GPU Vitals

Temperature --°C
Power --W / --W
Fan --%
GPU --

Model

--Model
--Parameters
--Size
--Context (train)
--GPU Layers

Throughput

--
Prompt tok/s
--
Generate tok/s
--
Prompt Tokens (total)
--
Gen Tokens (total)
--
Requests
--
KV Cache Used

Slots

Waiting for data...
"""


class Handler(http.server.BaseHTTPRequestHandler):
    """HTTP handler serving the dashboard page, the stats API and a health check."""

    def do_GET(self):
        """Route GET requests.

        * ``/`` and ``/index.html`` - the dashboard page.
        * ``/api/stats`` - JSON produced by :func:`get_all_stats`.
        * ``/health`` - static ``{"status":"ok"}``.
        * anything else - 404 with an empty body.
        """
        # Route on the path only, so a query string (for example a
        # cache-busting "?t=...") does not turn a valid URL into a 404.
        route = self.path.split("?", 1)[0]

        if route in ("/", "/index.html"):
            self._send(200, "text/html; charset=utf-8", DASHBOARD_HTML.encode())
        elif route == "/api/stats":
            payload = json.dumps(get_all_stats()).encode()
            self._send(
                200, "application/json", payload, {"Cache-Control": "no-cache"}
            )
        elif route == "/health":
            self._send(200, "application/json", b'{"status":"ok"}')
        else:
            self.send_response(404)
            self.end_headers()

    def _send(self, status, content_type, body, extra_headers=None):
        """Write a complete response with Content-Type and Content-Length."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        for header, value in (extra_headers or {}).items():
            self.send_header(header, value)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        pass  # suppress request logging


def main():
    """Run the monitor until interrupted with Ctrl-C."""
    # Threaded server: a slow /api/stats request (nvidia-smi plus several
    # upstream fetches, each with its own timeout) must not block /health
    # or the dashboard page for other clients.
    server = http.server.ThreadingHTTPServer(("0.0.0.0", MONITOR_PORT), Handler)
    print(f"llama.cpp monitor listening on port {MONITOR_PORT}")
    print(f"  llama server: {LLAMA_SERVER_URL}")
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Release the listening socket even if serve_forever raises.
        server.server_close()


if __name__ == "__main__":
    main()