|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""llama.cpp monitor - lightweight diagnostics dashboard."""
|
|
|
|
|
|
|
|
|
|
import http.server
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import re
|
|
|
|
|
import subprocess
|
|
|
|
|
import time
|
|
|
|
|
import urllib.error
|
|
|
|
|
import urllib.request
|
|
|
|
|
|
|
|
|
|
# Base URL of the llama.cpp server being monitored; override via the
# LLAMA_SERVER_URL environment variable.
LLAMA_SERVER_URL = os.environ.get("LLAMA_SERVER_URL", "http://localhost:8080")

# TCP port this monitor's own HTTP server listens on (default 80).
MONITOR_PORT = int(os.environ.get("MONITOR_PORT", "80"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_gpu_stats():
    """Query the local NVIDIA GPU via ``nvidia-smi`` and return its stats.

    Returns:
        dict: ``{"available": True, ...}`` with utilization (%), memory
        used/total (MiB), temperature (C), power draw/limit (W), fan speed
        (%) and device name on success, or ``{"available": False}`` when
        nvidia-smi is missing, times out, fails, or emits unparseable output.
    """
    def _num(field):
        # nvidia-smi reports "[N/A]" for sensors a card lacks (e.g. fan
        # speed on passively cooled GPUs); degrade to 0.0 instead of
        # discarding the whole reading.
        try:
            return float(field)
        except ValueError:
            return 0.0

    try:
        result = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw,power.limit,fan.speed,name",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode == 0:
            # Multi-GPU hosts print one CSV line per GPU; report only the
            # first. (Splitting the whole stdout on "," would interleave
            # fields from different GPUs.)
            first_line = result.stdout.strip().splitlines()[0]
            parts = [p.strip() for p in first_line.split(",")]
            return {
                "available": True,
                "utilization": _num(parts[0]),
                "memory_used": _num(parts[1]),
                "memory_total": _num(parts[2]),
                "temperature": _num(parts[3]),
                "power_draw": _num(parts[4]),
                "power_limit": _num(parts[5]),
                "fan_speed": _num(parts[6]),
                "name": parts[7],
            }
    except Exception:
        # Best-effort by design: any failure (missing binary, timeout,
        # truncated output) means "no GPU" rather than a crash.
        pass
    return {"available": False}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_json(path):
    """GET *path* from the llama.cpp server and decode the JSON body.

    Returns the parsed object, or ``None`` on any network, HTTP, or decode
    failure — the upstream server is polled best-effort.
    """
    try:
        # Context manager closes the HTTP connection promptly instead of
        # leaking the socket until garbage collection.
        with urllib.request.urlopen(f"{LLAMA_SERVER_URL}{path}", timeout=3) as resp:
            return json.loads(resp.read())
    except Exception:
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_text(path):
    """GET *path* from the llama.cpp server and return the body as text.

    Returns the decoded response body, or ``None`` on any network/HTTP
    failure — the upstream server is polled best-effort.
    """
    try:
        # Context manager closes the HTTP connection promptly instead of
        # leaking the socket until garbage collection.
        with urllib.request.urlopen(f"{LLAMA_SERVER_URL}{path}", timeout=3) as resp:
            return resp.read().decode()
    except Exception:
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_prometheus(text):
    """Parse Prometheus text-exposition output into a flat ``{name: value}`` dict.

    Label sets (``{...}``) are discarded, so a metric family with several
    labelled series keeps only the last value seen. Comment lines and
    lines whose value fails float conversion are skipped. Empty or ``None``
    input yields ``{}``.
    """
    if not text:
        return {}
    pattern = re.compile(r'^([\w:]+)(?:\{[^}]*\})?\s+([\d.eE+-]+)')
    parsed = {}
    for raw_line in text.strip().split("\n"):
        # '#' marks HELP/TYPE comment lines in the exposition format.
        if raw_line.startswith("#"):
            continue
        match = pattern.match(raw_line)
        if match is None:
            continue
        name, value = match.groups()
        try:
            parsed[name] = float(value)
        except ValueError:
            continue
    return parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_all_stats():
    """Collect one snapshot of everything the dashboard displays.

    Combines local GPU telemetry with the llama.cpp server's health, slot,
    model and Prometheus-metric endpoints. Each upstream fetch is
    best-effort, so individual fields may be None/empty when the server is
    unreachable.
    """
    gpu_info = get_gpu_stats()
    server_health = fetch_json("/health")
    slot_list = fetch_json("/slots")
    model_listing = fetch_json("/v1/models")
    metric_map = parse_prometheus(fetch_text("/metrics"))

    return {
        "timestamp": time.time(),
        "gpu": gpu_info,
        "health": server_health,
        "slots": slot_list,
        "model": model_listing,
        "metrics": metric_map,
        "llama_url": LLAMA_SERVER_URL,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Single-page dashboard served at "/". Self-contained HTML/CSS/JS; the
# embedded script polls /api/stats once per second and renders GPU, model,
# throughput and slot panels. (JS cleanup vs. previous revision: removed a
# duplicated dead branch in drawSparkline's fill loop and an unused
# `const now` in update().)
DASHBOARD_HTML = r"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>llama.cpp Monitor</title>
<style>
*{margin:0;padding:0;box-sizing:border-box}
body{background:#0f0f1a;color:#d0d0e0;font-family:'Courier New',monospace;font-size:14px;padding:16px}
h1{font-size:18px;color:#7aa2f7;margin-bottom:12px;display:flex;align-items:center;gap:12px}
h1 .status{font-size:12px;padding:2px 8px;border-radius:4px;font-weight:normal}
h1 .status.ok{background:#1a3a1a;color:#9ece6a}
h1 .status.err{background:#3a1a1a;color:#f7768e}
.grid{display:grid;grid-template-columns:1fr 1fr;gap:12px;margin-bottom:12px}
.card{background:#1a1b2e;border:1px solid #2a2b3e;border-radius:8px;padding:14px}
.card h2{font-size:13px;color:#565f89;text-transform:uppercase;letter-spacing:1px;margin-bottom:10px}
.metric{display:flex;justify-content:space-between;align-items:baseline;margin-bottom:6px}
.metric .label{color:#787c99}
.metric .value{font-size:18px;font-weight:bold;color:#c0caf5}
.metric .unit{font-size:12px;color:#565f89;margin-left:2px}
.bar-wrap{background:#1f2035;border-radius:4px;height:20px;overflow:hidden;margin-bottom:8px}
.bar-fill{height:100%;border-radius:4px;transition:width 0.5s ease}
.bar-fill.mem{background:linear-gradient(90deg,#7aa2f7,#bb9af7)}
.bar-fill.util{background:linear-gradient(90deg,#9ece6a,#e0af68)}
.bar-fill.hot{background:linear-gradient(90deg,#e0af68,#f7768e)}
.spark-wrap{height:50px;margin-top:6px}
canvas{width:100%;height:50px;display:block}
.slots-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(280px,1fr));gap:8px}
.slot{background:#1f2035;border-radius:6px;padding:10px;border-left:3px solid #565f89}
.slot.active{border-left-color:#9ece6a}
.slot .slot-head{display:flex;justify-content:space-between;margin-bottom:4px}
.slot .slot-state{font-size:11px;padding:1px 6px;border-radius:3px}
.slot .slot-state.idle{background:#1a2a3a;color:#7aa2f7}
.slot .slot-state.processing{background:#1a3a1a;color:#9ece6a}
.metrics-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(180px,1fr));gap:8px}
.metrics-grid .m-item{background:#1f2035;border-radius:6px;padding:10px;text-align:center}
.metrics-grid .m-val{font-size:20px;font-weight:bold;color:#c0caf5}
.metrics-grid .m-label{font-size:11px;color:#565f89;margin-top:2px}
.full-width{grid-column:1/-1}
.model-info{display:flex;flex-wrap:wrap;gap:16px}
.model-info .mi{display:flex;flex-direction:column}
.model-info .mi .mi-val{font-size:16px;color:#c0caf5;font-weight:bold}
.model-info .mi .mi-label{font-size:11px;color:#565f89}
.no-gpu{color:#565f89;font-style:italic}
</style>
</head>
<body>

<h1>
llama.cpp Monitor
<span class="status" id="srv-status">...</span>
<span style="flex:1"></span>
<span style="font-size:12px;color:#565f89;font-weight:normal" id="srv-url"></span>
</h1>

<div class="grid">
<!-- GPU Utilization -->
<div class="card" id="gpu-card">
<h2>GPU Utilization</h2>
<div id="gpu-util-content">
<div class="metric">
<span class="label">Load</span>
<span><span class="value" id="gpu-util">--</span><span class="unit">%</span></span>
</div>
<div class="bar-wrap"><div class="bar-fill util" id="gpu-util-bar" style="width:0%"></div></div>
<div class="spark-wrap"><canvas id="gpu-util-chart"></canvas></div>
</div>
</div>

<!-- GPU Memory -->
<div class="card">
<h2>GPU Memory</h2>
<div id="gpu-mem-content">
<div class="metric">
<span class="label">Used</span>
<span><span class="value" id="gpu-mem-used">--</span><span class="unit">MiB</span>
/ <span id="gpu-mem-total">--</span><span class="unit">MiB</span></span>
</div>
<div class="bar-wrap"><div class="bar-fill mem" id="gpu-mem-bar" style="width:0%"></div></div>
<div class="spark-wrap"><canvas id="gpu-mem-chart"></canvas></div>
</div>
</div>

<!-- GPU Vitals -->
<div class="card">
<h2>GPU Vitals</h2>
<div id="gpu-vitals-content">
<div class="metric">
<span class="label">Temperature</span>
<span><span class="value" id="gpu-temp">--</span><span class="unit">&deg;C</span></span>
</div>
<div class="bar-wrap"><div class="bar-fill hot" id="gpu-temp-bar" style="width:0%"></div></div>
<div class="metric">
<span class="label">Power</span>
<span><span class="value" id="gpu-power">--</span><span class="unit">W</span>
/ <span id="gpu-power-limit">--</span><span class="unit">W</span></span>
</div>
<div class="metric">
<span class="label">Fan</span>
<span><span class="value" id="gpu-fan">--</span><span class="unit">%</span></span>
</div>
<div class="metric">
<span class="label">GPU</span>
<span style="font-size:12px;color:#787c99" id="gpu-name">--</span>
</div>
</div>
</div>

<!-- Model Info -->
<div class="card">
<h2>Model</h2>
<div class="model-info" id="model-info">
<div class="mi"><span class="mi-val" id="model-name">--</span><span class="mi-label">Model</span></div>
<div class="mi"><span class="mi-val" id="model-params">--</span><span class="mi-label">Parameters</span></div>
<div class="mi"><span class="mi-val" id="model-size">--</span><span class="mi-label">Size</span></div>
<div class="mi"><span class="mi-val" id="model-ctx">--</span><span class="mi-label">Context (train)</span></div>
<div class="mi"><span class="mi-val" id="model-ngl">--</span><span class="mi-label">GPU Layers</span></div>
</div>
</div>

<!-- Throughput Metrics -->
<div class="card full-width">
<h2>Throughput</h2>
<div class="metrics-grid" id="throughput-grid">
<div class="m-item"><div class="m-val" id="m-prompt-tps">--</div><div class="m-label">Prompt tok/s</div></div>
<div class="m-item"><div class="m-val" id="m-gen-tps">--</div><div class="m-label">Generate tok/s</div></div>
<div class="m-item"><div class="m-val" id="m-prompt-tokens">--</div><div class="m-label">Prompt Tokens (total)</div></div>
<div class="m-item"><div class="m-val" id="m-gen-tokens">--</div><div class="m-label">Gen Tokens (total)</div></div>
<div class="m-item"><div class="m-val" id="m-requests">--</div><div class="m-label">Requests</div></div>
<div class="m-item"><div class="m-val" id="m-kv-pct">--</div><div class="m-label">KV Cache Used</div></div>
</div>
</div>

<!-- Slots -->
<div class="card full-width">
<h2>Slots</h2>
<div class="slots-grid" id="slots-container">
<div class="slot"><span class="no-gpu">Waiting for data...</span></div>
</div>
</div>
</div>

<script>
const HISTORY_LEN = 120;
const gpuUtilHistory = [];
const gpuMemHistory = [];
function drawSparkline(canvasId, data, maxVal, color1, color2) {
const canvas = document.getElementById(canvasId);
if (!canvas) return;
const ctx = canvas.getContext('2d');
const dpr = window.devicePixelRatio || 1;
const rect = canvas.getBoundingClientRect();
canvas.width = rect.width * dpr;
canvas.height = rect.height * dpr;
ctx.scale(dpr, dpr);
const w = rect.width, h = rect.height;
ctx.clearRect(0, 0, w, h);
if (data.length < 2) return;

const grad = ctx.createLinearGradient(0, h, 0, 0);
grad.addColorStop(0, color1 + '10');
grad.addColorStop(1, color1 + '40');

const step = w / (HISTORY_LEN - 1);
const startX = w - (data.length - 1) * step;

ctx.beginPath();
ctx.moveTo(startX, h);
for (let i = 0; i < data.length; i++) {
const x = startX + i * step;
const y = h - (data[i] / maxVal) * (h - 4) - 2;
ctx.lineTo(x, y);
}
ctx.lineTo(startX + (data.length - 1) * step, h);
ctx.closePath();
ctx.fillStyle = grad;
ctx.fill();

ctx.beginPath();
for (let i = 0; i < data.length; i++) {
const x = startX + i * step;
const y = h - (data[i] / maxVal) * (h - 4) - 2;
if (i === 0) ctx.moveTo(x, y);
else ctx.lineTo(x, y);
}
ctx.strokeStyle = color1;
ctx.lineWidth = 1.5;
ctx.stroke();
}

function setText(id, val) {
const el = document.getElementById(id);
if (el && el.textContent !== String(val)) el.textContent = val;
}

function setBar(id, pct) {
const el = document.getElementById(id);
if (el) el.style.width = Math.min(100, Math.max(0, pct)) + '%';
}

function fmtNum(n) {
if (n === null || n === undefined) return '--';
if (n >= 1e9) return (n / 1e9).toFixed(1) + 'B';
if (n >= 1e6) return (n / 1e6).toFixed(1) + 'M';
if (n >= 1e3) return (n / 1e3).toFixed(1) + 'K';
return Math.round(n).toString();
}

function fmtBytes(n) {
if (n === null || n === undefined) return '--';
if (n >= 1e12) return (n / 1e12).toFixed(1) + ' TB';
if (n >= 1e9) return (n / 1e9).toFixed(1) + ' GB';
if (n >= 1e6) return (n / 1e6).toFixed(1) + ' MB';
return Math.round(n).toString() + ' B';
}

function update(data) {
// Server status
const statusEl = document.getElementById('srv-status');
if (data.health && data.health.status === 'ok') {
statusEl.textContent = 'online';
statusEl.className = 'status ok';
} else if (data.health) {
statusEl.textContent = data.health.status || 'unknown';
statusEl.className = 'status err';
} else {
statusEl.textContent = 'offline';
statusEl.className = 'status err';
}
setText('srv-url', data.llama_url || '');

// GPU
const gpu = data.gpu;
if (gpu && gpu.available) {
gpuUtilHistory.push(gpu.utilization);
if (gpuUtilHistory.length > HISTORY_LEN) gpuUtilHistory.shift();
gpuMemHistory.push(gpu.memory_used);
if (gpuMemHistory.length > HISTORY_LEN) gpuMemHistory.shift();

setText('gpu-util', Math.round(gpu.utilization));
setBar('gpu-util-bar', gpu.utilization);
setText('gpu-mem-used', Math.round(gpu.memory_used));
setText('gpu-mem-total', Math.round(gpu.memory_total));
setBar('gpu-mem-bar', (gpu.memory_used / gpu.memory_total) * 100);
setText('gpu-temp', Math.round(gpu.temperature));
setBar('gpu-temp-bar', (gpu.temperature / 100) * 100);
setText('gpu-power', Math.round(gpu.power_draw));
setText('gpu-power-limit', Math.round(gpu.power_limit));
setText('gpu-fan', Math.round(gpu.fan_speed));
setText('gpu-name', gpu.name);

drawSparkline('gpu-util-chart', gpuUtilHistory, 100, '#9ece6a', '#e0af68');
drawSparkline('gpu-mem-chart', gpuMemHistory, gpu.memory_total, '#7aa2f7', '#bb9af7');
} else {
setText('gpu-util', '--');
setText('gpu-name', 'nvidia-smi not available');
}

// Model
if (data.model && data.model.data && data.model.data.length > 0) {
const m = data.model.data[0];
const meta = m.meta || {};
setText('model-name', m.id || '--');
setText('model-params', fmtNum(meta.n_params));
setText('model-size', fmtBytes(meta.size));
setText('model-ctx', fmtNum(meta.n_ctx_train));
}

// Throughput metrics
const met = data.metrics || {};

// Use gauge-based tok/s from server (already averaged)
const promptTps = met['llamacpp:prompt_tokens_seconds'];
const genTps = met['llamacpp:predicted_tokens_seconds'];
setText('m-prompt-tps', promptTps !== undefined ? promptTps.toFixed(1) : '--');
setText('m-gen-tps', genTps !== undefined ? genTps.toFixed(1) : '--');

setText('m-prompt-tokens', fmtNum(met['llamacpp:prompt_tokens_total']));
setText('m-gen-tokens', fmtNum(met['llamacpp:tokens_predicted_total']));
setText('m-requests', fmtNum(met['llamacpp:requests_processing']));
setText('m-kv-pct', '--');

// GPU layers from slots
if (data.slots && data.slots.length > 0 && data.slots[0].n_gpu_layers !== undefined) {
setText('model-ngl', data.slots[0].n_gpu_layers);
}

// Slots
const sc = document.getElementById('slots-container');
if (data.slots && data.slots.length > 0) {
let html = '';
for (const s of data.slots) {
const isActive = s.is_processing === true || s.state !== 0;
const stateText = isActive ? 'processing' : 'idle';
const ctxUsed = s.n_past || s.n_predict || 0;
const ctxTotal = s.n_ctx || 0;
const ctxPct = ctxTotal > 0 ? ((ctxUsed / ctxTotal) * 100).toFixed(1) : 0;
html += '<div class="slot ' + (isActive ? 'active' : '') + '">'
+ '<div class="slot-head">'
+ '<span>Slot ' + s.id + '</span>'
+ '<span class="slot-state ' + stateText + '">' + stateText + '</span>'
+ '</div>'
+ '<div class="metric"><span class="label">Context</span>'
+ '<span><span class="value" style="font-size:14px">' + ctxUsed + '</span>'
+ '<span class="unit"> / ' + ctxTotal + ' (' + ctxPct + '%)</span></span></div>'
+ '<div class="bar-wrap"><div class="bar-fill mem" style="width:' + ctxPct + '%;transition:width 0.5s"></div></div>'
+ '</div>';
}
sc.innerHTML = html;
}
}

async function poll() {
try {
const resp = await fetch('/api/stats');
if (resp.ok) {
const data = await resp.json();
update(data);
}
} catch (e) {}
}

poll();
setInterval(poll, 1000);
</script>
</body>
</html>
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Handler(http.server.BaseHTTPRequestHandler):
    """HTTP handler serving the dashboard page plus a tiny JSON API.

    Routes:
        /, /index.html  -> dashboard HTML
        /api/stats      -> JSON snapshot from get_all_stats()
        /health         -> static liveness probe
        anything else   -> 404
    """

    def do_GET(self):
        if self.path in ("/", "/index.html"):
            self._respond(200, "text/html; charset=utf-8",
                          DASHBOARD_HTML.encode())
        elif self.path == "/api/stats":
            payload = json.dumps(get_all_stats()).encode()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            # Stats are regenerated per request; clients must not cache.
            self.send_header("Cache-Control", "no-cache")
            self.end_headers()
            self.wfile.write(payload)
        elif self.path == "/health":
            self._respond(200, "application/json", b'{"status":"ok"}')
        else:
            self.send_response(404)
            self.end_headers()

    def _respond(self, code, content_type, body):
        # Shared helper: status line, Content-Type header, then body.
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt, *args):
        # Silence the base class's per-request stderr logging.
        pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Start the monitor's HTTP server and run until Ctrl-C."""
    httpd = http.server.HTTPServer(("0.0.0.0", MONITOR_PORT), Handler)
    print(f"llama.cpp monitor listening on port {MONITOR_PORT}")
    print(f" llama server: {LLAMA_SERVER_URL}")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop; exit quietly.
        pass
    httpd.server_close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: only start the server when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|