Add GPU utilization monitoring (NVIDIA/Intel) and dropshell service discovery
This commit is contained in:
@@ -143,6 +143,45 @@ while read -r line; do
|
||||
gpu_idx=$((gpu_idx + 1))
|
||||
done < <(lspci 2>/dev/null | grep -iE 'vga|3d|display')
|
||||
|
||||
# GPU utilization (NVIDIA)
# Emits one [nvidia_gpu:N] section per GPU line returned by nvidia-smi, with
# the keys name, utilization_percent, memory_used_mb, memory_total_mb and
# temperature.

# Print $1 with leading and trailing whitespace removed.
# Uses parameter expansion only: piping through xargs for trimming fails on
# values that contain quote characters and costs two processes per field.
_nv_trim() {
  local s=$1
  s="${s#"${s%%[![:space:]]*}"}"
  s="${s%"${s##*[![:space:]]}"}"
  printf '%s\n' "$s"
}

if command -v nvidia-smi &>/dev/null; then
  nv_idx=0
  # Capture the output before parsing so that a non-zero exit from nvidia-smi
  # produces no sections at all. NOTE(review): nvidia-smi appears to print
  # some failure messages on stdout, which would otherwise be parsed as a GPU.
  nv_csv=$(nvidia-smi --query-gpu=name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits 2>/dev/null) || nv_csv=""
  if [ -n "$nv_csv" ]; then
    while IFS=',' read -r name util mem_used mem_total temp; do
      # Skip blank lines instead of emitting an empty section.
      [ -n "$name" ] || continue
      echo "[nvidia_gpu:$nv_idx]"
      echo "name=$(_nv_trim "$name")"
      echo "utilization_percent=$(_nv_trim "$util")"
      echo "memory_used_mb=$(_nv_trim "$mem_used")"
      echo "memory_total_mb=$(_nv_trim "$mem_total")"
      echo "temperature=$(_nv_trim "$temp")"
      nv_idx=$((nv_idx + 1))
    done <<<"$nv_csv"
  fi
fi
|
||||
|
||||
# GPU utilization (Intel)
# For every DRM card bound to the i915 or xe driver, run intel_gpu_top in JSON
# mode for at most 2 seconds and emit an [intel_gpu:N] section containing
# utilization_percent and, when found in the output, frequency_mhz and power_w.

# Print every numeric value stored under JSON key $1 in the text on stdin,
# one value per line, in input order.
_igpu_json_values() {
  grep -oP "\"$1\"\\s*:\\s*\\K[0-9.]+"
}

# '|| true' keeps a missing tool from aborting the script should errexit be on.
igpu_cmd=$(command -v intel_gpu_top 2>/dev/null || true)
if [ -n "$igpu_cmd" ]; then
  # Build the command as an array so no unquoted expansion is needed.
  igpu_run=(timeout 2)
  if [ "$(id -u)" -ne 0 ]; then
    # -n: fail immediately instead of prompting for a password.
    igpu_run+=(sudo -n)
  fi
  # Run the resolved path rather than the bare name so sudo does not have to
  # look the tool up again.
  igpu_run+=("$igpu_cmd" -J -s 500)

  igpu_idx=0
  for drm in /sys/class/drm/card*; do
    [ -d "$drm" ] || continue
    card=${drm##*/}
    # Names containing '-' (e.g. card0-HDMI-A-1) are not card nodes; skip them
    # before doing any further work.
    case "$card" in
      *-*) continue ;;
    esac
    driver=$(readlink "$drm/device/driver" 2>/dev/null) || continue
    driver=${driver##*/}
    [ "$driver" = "i915" ] || [ "$driver" = "xe" ] || continue

    # intel_gpu_top documents its device filter as "drm:<node>"; timeout ends
    # the sampling, so a non-zero exit status is expected here.
    igpu_raw=$("${igpu_run[@]}" -d "drm:/dev/dri/$card" 2>/dev/null) || true
    [ -n "$igpu_raw" ] || continue

    echo "[intel_gpu:$igpu_idx]"
    # Highest "busy" value across everything captured in the sample window.
    busy=$(printf '%s\n' "$igpu_raw" | _igpu_json_values busy | sort -rn | head -1)
    echo "utilization_percent=${busy:-0}"
    freq=$(printf '%s\n' "$igpu_raw" | _igpu_json_values actual | head -1)
    if [ -n "$freq" ]; then
      # Drop the fractional part.
      echo "frequency_mhz=${freq%.*}"
    fi
    power=$(printf '%s\n' "$igpu_raw" | _igpu_json_values GPU | head -1)
    if [ -n "$power" ]; then
      echo "power_w=$power"
    fi
    igpu_idx=$((igpu_idx + 1))
  done
fi
|
||||
|
||||
# Network interfaces
|
||||
for iface in $(ls /sys/class/net/ 2>/dev/null); do
|
||||
[ "$iface" = "lo" ] && continue
|
||||
@@ -311,6 +350,22 @@ if command -v virsh &>/dev/null; then
|
||||
done
|
||||
fi
|
||||
|
||||
# Dropshell services
# Emits one [dropshell_service:N] section for every service directory that has
# a config/service.env file: always "name", plus "container_name" and
# "template" when those keys are set in the env file.
# The services root defaults to /home/dropshell/dropshell/services and can be
# overridden with DROPSHELL_SERVICES_DIR.

# Print the value of the first "KEY=value" line for key $1 in env file $2,
# with single and double quote characters removed. Prints nothing when the
# key or the file is missing.
_ds_env_value() {
  # -f2- keeps the whole right-hand side, including any further '=' signs.
  grep "^$1=" "$2" 2>/dev/null | head -1 | cut -d= -f2- | tr -d "\"'"
}

ds_root=${DROPSHELL_SERVICES_DIR:-/home/dropshell/dropshell/services}
ds_idx=0
for svc_dir in "$ds_root"/*/; do
  [ -d "$svc_dir" ] || continue
  # The glob leaves a trailing slash; drop it so joined paths have no "//".
  svc_dir=${svc_dir%/}
  svc_env="$svc_dir/config/service.env"
  [ -f "$svc_env" ] || continue
  svc_name=${svc_dir##*/}
  echo "[dropshell_service:$ds_idx]"
  echo "name=$svc_name"
  container_name=$(_ds_env_value CONTAINER_NAME "$svc_env")
  if [ -n "$container_name" ]; then
    echo "container_name=$container_name"
  fi
  template=$(_ds_env_value TEMPLATE "$svc_env")
  if [ -n "$template" ]; then
    echo "template=$template"
  fi
  ds_idx=$((ds_idx + 1))
done
|
||||
|
||||
# Docker containers
|
||||
if command -v docker &>/dev/null; then
|
||||
_sudo docker ps -a --format '{{.Names}}\t{{.State}}\t{{.Image}}\t{{.Status}}' 2>/dev/null | while IFS=$'\t' read -r name state image status_text; do
|
||||
|
||||
@@ -32,6 +32,21 @@
|
||||
{% set max_temp.val = v|float %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{# Per-vendor GPU sections, plus the highest utilization_percent among them. max_gpu_pct.val starts at -1.0 and is only raised inside the loops. #}
{% set nvidia_gpus = d.get('nvidia_gpu') or [] %}
{% set intel_gpus = d.get('intel_gpu') or [] %}
{% set max_gpu_pct = namespace(val=-1.0) %}
{% for nv_entry in nvidia_gpus %}
{% set max_gpu_pct.val = [max_gpu_pct.val, nv_entry.get('utilization_percent', '0')|float]|max %}
{% endfor %}
{% for intel_entry in intel_gpus %}
{% set max_gpu_pct.val = [max_gpu_pct.val, intel_entry.get('utilization_percent', '0')|float]|max %}
{% endfor %}
|
||||
{% set cpu_pct = cpu.get('usage_percent', '0')|float %}
|
||||
{% set mem_pct = mem.get('usage_percent', '0')|float %}
|
||||
{% set disk_usages = d.get('disk_usage', []) if d.get('disk_usage') else [] %}
|
||||
@@ -91,6 +106,26 @@
|
||||
{% set _ = containers.append(ct) %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{# ds_services: dropshell service sections. svc_container_names: their non-empty container_name values. orphan_containers: containers whose name does not start with "<container_name>-" for any of those values. #}
{% set ds_services = d.get('dropshell_service') or [] %}
{% set svc_container_names = [] %}
{% for svc in ds_services if svc.get('container_name') %}
{% set _ = svc_container_names.append(svc.get('container_name')) %}
{% endfor %}
{% set orphan_containers = [] %}
{% for ct in containers %}
{% set owned = namespace(hit=false) %}
{% for prefix in svc_container_names if ct.get('name', '').startswith(prefix ~ '-') %}
{% set owned.hit = true %}
{% endfor %}
{% if not owned.hit %}{% set _ = orphan_containers.append(ct) %}{% endif %}
{% endfor %}
|
||||
|
||||
{% if server.is_online %}
|
||||
<div class="usage-bars">
|
||||
@@ -124,8 +159,17 @@
|
||||
<span class="usage-pct">{{ '%.0f'|format(max_temp.val) }}°</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
{# GPU usage row: rendered only when max_gpu_pct.val is non-negative. The bar width and the rounded percentage label both use that value; the fill color comes from the usage_color filter. #}
{% if max_gpu_pct.val >= 0 %}
|
||||
<div class="usage-row">
|
||||
<span class="usage-label">GPU</span>
|
||||
<div class="usage-bar-bg">
|
||||
<div class="usage-bar-fill" style="width: {{ max_gpu_pct.val }}%; background: {{ max_gpu_pct.val|usage_color }};"></div>
|
||||
</div>
|
||||
<span class="usage-pct">{{ '%.0f'|format(max_gpu_pct.val) }}%</span>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% if containers or child_vms %}
|
||||
{% if ds_services or orphan_containers or containers or child_vms %}
|
||||
<div class="ct-summary-list">
|
||||
{% for vm in child_vms %}
|
||||
<div class="ct-summary-item">
|
||||
@@ -136,17 +180,57 @@
|
||||
{% if vm_os %}<span class="ct-summary-os">{{ vm_os }}</span>{% endif %}
|
||||
<span class="ct-summary-ip">{{ vm.primary_ip or vm.hostname }}</span>
|
||||
</div>
|
||||
{% set vm_ds = vm.details.get('dropshell_service', []) if vm.details and vm.details.get('dropshell_service') else [] %}
|
||||
{% set vm_cts = vm.details.get('container', []) if vm.details and vm.details.get('container') else [] %}
|
||||
{% set vm_svc_cns = [] %}
|
||||
{% for vs in vm_ds %}
|
||||
{% if vs.get('container_name') %}{% set _ = vm_svc_cns.append(vs.get('container_name')) %}{% endif %}
|
||||
{% endfor %}
|
||||
{% for vs in vm_ds %}
|
||||
{% set vs_cn = vs.get('container_name', '') %}
|
||||
{% set vs_running = namespace(val=false) %}
|
||||
{% for vct in vm_cts %}
|
||||
{% if vct.get('name', '').startswith(vs_cn ~ '-') and vct.get('status', '')|lower in ['running', 'started'] %}
|
||||
{% set vs_running.val = true %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
<div class="ct-summary-item nested">
|
||||
<span class="status-dot-sm {% if vs_running.val %}online{% else %}offline{% endif %}"></span>
|
||||
<span class="ct-summary-name">{{ vs.get('name') }}</span>
|
||||
<span class="ct-summary-type">SVC</span>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% for vct in vm_cts %}
|
||||
{% set vct_name = vct.get('name', '') %}
|
||||
{% set vct_is_svc = namespace(val=false) %}
|
||||
{% for cn in vm_svc_cns %}
|
||||
{% if vct_name.startswith(cn ~ '-') %}{% set vct_is_svc.val = true %}{% endif %}
|
||||
{% endfor %}
|
||||
{% if not vct_is_svc.val %}
|
||||
{% set vct_up = vct.get('status', '')|lower in ['running', 'started'] %}
|
||||
<div class="ct-summary-item nested">
|
||||
<span class="status-dot-sm {% if vct_up %}online{% else %}offline{% endif %}"></span>
|
||||
<span class="ct-summary-name">{{ vct.get('name', vct.get('id', '?')) }}</span>
|
||||
<span class="ct-summary-name">{{ vct_name }}</span>
|
||||
{% if vct.get('ip') %}<span class="ct-summary-ip">{{ vct.get('ip') }}</span>{% endif %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
{# One summary row per dropshell service. The dot is "online" when any container named "<container_name>-..." has status running or started. #}
{% for svc in ds_services %}
{% set svc_prefix = svc.get('container_name', '') ~ '-' %}
{% set svc_state = namespace(up=false) %}
{% for ct in containers if ct.get('name', '').startswith(svc_prefix) and ct.get('status', '')|lower in ['running', 'started'] %}
{% set svc_state.up = true %}
{% endfor %}
<div class="ct-summary-item">
<span class="status-dot-sm {% if svc_state.up %}online{% else %}offline{% endif %}"></span>
<span class="ct-summary-name">{{ svc.get('name') }}</span>
<span class="ct-summary-type">SVC</span>
</div>
{% endfor %}
|
||||
{% for ct in orphan_containers %}
|
||||
{% set ct_up = ct.get('status', '')|lower in ['running', 'started'] %}
|
||||
<div class="ct-summary-item">
|
||||
<span class="status-dot-sm {% if ct_up %}online{% else %}offline{% endif %}"></span>
|
||||
@@ -244,13 +328,31 @@
|
||||
|
||||
<!-- GPUs -->
|
||||
{% set gpus = d.get('gpu', []) if d.get('gpu') else [] %}
|
||||
{% if gpus %}
|
||||
{% if gpus or nvidia_gpus or intel_gpus %}
|
||||
<div class="detail-section">
|
||||
<h4>GPUs</h4>
|
||||
<table>
|
||||
{% for gpu in gpus %}
|
||||
<tr><td>GPU {{ loop.index0 }}</td><td>{{ gpu.get('description', '-')|clean_gpu }}</td></tr>
|
||||
{% for ng in nvidia_gpus %}
|
||||
<tr>
|
||||
<td>{{ ng.get('name', 'NVIDIA GPU ' ~ loop.index0) }}</td>
|
||||
<td>{{ ng.get('utilization_percent', '-') }}%</td>
|
||||
<td>{{ ng.get('memory_used_mb', '-') }} / {{ ng.get('memory_total_mb', '-') }} MB</td>
|
||||
<td>{{ ng.get('temperature', '-') }}°C</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% for ig in intel_gpus %}
|
||||
<tr>
|
||||
<td>Intel GPU {{ loop.index0 }}</td>
|
||||
<td>{{ ig.get('utilization_percent', '-') }}%</td>
|
||||
<td>{% if ig.get('frequency_mhz') %}{{ ig.get('frequency_mhz') }} MHz{% else %}-{% endif %}</td>
|
||||
<td>{% if ig.get('power_w') %}{{ ig.get('power_w') }} W{% else %}-{% endif %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% if not nvidia_gpus and not intel_gpus %}
|
||||
{% for gpu in gpus %}
|
||||
<tr><td colspan="4">{{ gpu.get('description', '-')|clean_gpu }}</td></tr>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</table>
|
||||
</div>
|
||||
{% endif %}
|
||||
@@ -381,11 +483,17 @@
|
||||
<div class="container-grid">
|
||||
{% for ct in containers %}
|
||||
{% set ct_running = ct.get('status', '')|lower in ['running', 'started'] %}
|
||||
{% set ct_svc_name = namespace(val='') %}
|
||||
{% for svc in ds_services %}
|
||||
{% if svc.get('container_name') and ct.get('name', '').startswith(svc.get('container_name') ~ '-') %}
|
||||
{% set ct_svc_name.val = svc.get('name') %}
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
<div class="container-card {% if not ct_running %}offline{% endif %}">
|
||||
<div class="ct-header">
|
||||
<span class="status-dot {% if ct_running %}online{% else %}offline{% endif %}"></span>
|
||||
<span class="ct-name">{{ ct.get('name', ct.get('id', '?')) }}</span>
|
||||
<span class="ct-type">{{ ct.get('type', '')|upper }}</span>
|
||||
<span class="ct-name">{% if ct_svc_name.val %}{{ ct_svc_name.val }}{% else %}{{ ct.get('name', ct.get('id', '?')) }}{% endif %}</span>
|
||||
<span class="ct-type">{% if ct_svc_name.val %}SVC{% else %}{{ ct.get('type', '')|upper }}{% endif %}</span>
|
||||
</div>
|
||||
{% if ct.get('image') %}
|
||||
<div class="ct-image">{{ ct.get('image') }}</div>
|
||||
|
||||
@@ -169,12 +169,14 @@ install_packages() {
|
||||
debian)
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
apt-get update -qq
|
||||
apt-get install -y -qq sudo lm-sensors pciutils iproute2 util-linux intel-gpu-tools >/dev/null 2>&1 || \
|
||||
apt-get install -y -qq sudo lm-sensors pciutils iproute2 util-linux >/dev/null
|
||||
# Auto-detect sensor modules
|
||||
sensors-detect --auto </dev/null >/dev/null 2>&1 || true
|
||||
;;
|
||||
alpine)
|
||||
apk update --quiet
|
||||
apk add --quiet sudo lm-sensors pciutils iproute2 util-linux bash intel-gpu-tools 2>/dev/null || \
|
||||
apk add --quiet sudo lm-sensors pciutils iproute2 util-linux bash
|
||||
# Auto-detect sensor modules
|
||||
sensors-detect --auto </dev/null >/dev/null 2>&1 || true
|
||||
@@ -215,7 +217,7 @@ SUDOERS_FILE="/etc/sudoers.d/infmap"
|
||||
SUDO_CMDS=""
|
||||
|
||||
# Detect which hypervisor tools are present (check common sbin paths too)
|
||||
for cmd in pct qm lxc virsh docker; do
|
||||
for cmd in pct qm lxc virsh docker intel_gpu_top; do
|
||||
cmd_path=$(command -v "$cmd" 2>/dev/null || true)
|
||||
# Also check sbin paths not always in PATH
|
||||
if [ -z "$cmd_path" ]; then
|
||||
|
||||
Reference in New Issue
Block a user