From db5cbf99e1b51995327ba8724c3b7ba3c367b8a7 Mon Sep 17 00:00:00 2001 From: j Date: Wed, 18 Mar 2026 22:34:46 +1300 Subject: [PATCH] Add GPU utilization monitoring (NVIDIA/Intel) and dropshell service discovery --- app/gather_info.sh | 55 ++++++++++++++++++ app/templates/index.html | 122 ++++++++++++++++++++++++++++++++++++--- setup-remote.sh | 4 +- 3 files changed, 173 insertions(+), 8 deletions(-) diff --git a/app/gather_info.sh b/app/gather_info.sh index 205efd5..337ef4a 100755 --- a/app/gather_info.sh +++ b/app/gather_info.sh @@ -143,6 +143,45 @@ while read -r line; do gpu_idx=$((gpu_idx + 1)) done < <(lspci 2>/dev/null | grep -iE 'vga|3d|display') +# GPU utilization (NVIDIA) +if command -v nvidia-smi &>/dev/null; then + nv_idx=0 + while IFS=',' read -r name util mem_used mem_total temp; do + echo "[nvidia_gpu:$nv_idx]" + echo "name=$(echo "$name" | xargs)" + echo "utilization_percent=$(echo "$util" | xargs)" + echo "memory_used_mb=$(echo "$mem_used" | xargs)" + echo "memory_total_mb=$(echo "$mem_total" | xargs)" + echo "temperature=$(echo "$temp" | xargs)" + nv_idx=$((nv_idx + 1)) + done < <(nvidia-smi --query-gpu=name,utilization.gpu,memory.used,memory.total,temperature.gpu --format=csv,noheader,nounits 2>/dev/null) +fi + +# GPU utilization (Intel) +igpu_cmd=$(command -v intel_gpu_top 2>/dev/null) +if [ -n "$igpu_cmd" ]; then + igpu_prefix="" + [ "$(id -u)" -ne 0 ] && igpu_prefix="sudo" + igpu_idx=0 + for drm in /sys/class/drm/card*; do + [ -d "$drm" ] || continue + driver=$(readlink "$drm/device/driver" 2>/dev/null | xargs basename 2>/dev/null) + [ "$driver" = "i915" ] || [ "$driver" = "xe" ] || continue + card=$(basename "$drm") + igpu_raw=$(timeout 2 $igpu_prefix intel_gpu_top -J -s 500 -d /dev/dri/"$card" 2>/dev/null) + if [ -n "$igpu_raw" ]; then + echo "[intel_gpu:$igpu_idx]" + busy=$(echo "$igpu_raw" | grep -oP '"busy"\s*:\s*\K[0-9.]+' | sort -rn | head -1) + echo "utilization_percent=${busy:-0}" + freq=$(echo "$igpu_raw" | grep -oP '"actual"\s*:\s*\K[0-9.]+' | head -1) + [ -n "$freq" ] && echo "frequency_mhz=${freq%.*}" + power=$(echo "$igpu_raw" | grep -oP '"GPU"\s*:\s*\K[0-9.]+' | head -1) + [ -n "$power" ] && echo "power_w=$power" + igpu_idx=$((igpu_idx + 1)) + fi + done +fi + # Network interfaces for iface in $(ls /sys/class/net/ 2>/dev/null); do [ "$iface" = "lo" ] && continue @@ -311,6 +350,22 @@ if command -v virsh &>/dev/null; then done fi +# Dropshell services +ds_idx=0 +for svc_dir in /home/dropshell/dropshell/services/*/; do + [ -d "$svc_dir" ] || continue + svc_env="$svc_dir/config/service.env" + [ -f "$svc_env" ] || continue + svc_name=$(basename "$svc_dir") + echo "[dropshell_service:$ds_idx]" + echo "name=$svc_name" + container_name=$(grep '^CONTAINER_NAME=' "$svc_env" 2>/dev/null | head -1 | cut -d= -f2 | tr -d '"') + [ -n "$container_name" ] && echo "container_name=$container_name" + template=$(grep '^TEMPLATE=' "$svc_env" 2>/dev/null | head -1 | cut -d= -f2 | tr -d '"') + [ -n "$template" ] && echo "template=$template" + ds_idx=$((ds_idx + 1)) +done + # Docker containers if command -v docker &>/dev/null; then _sudo docker ps -a --format '{{.Names}}\t{{.State}}\t{{.Image}}\t{{.Status}}' 2>/dev/null | while IFS=$'\t' read -r name state image status_text; do diff --git a/app/templates/index.html b/app/templates/index.html index 23d914d..1ef38fa 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -32,6 +32,21 @@ {% set max_temp.val = v|float %} {% endif %} {% endfor %} + {% set nvidia_gpus = d.get('nvidia_gpu', []) if d.get('nvidia_gpu') else [] %} + {% set intel_gpus = d.get('intel_gpu', []) if d.get('intel_gpu') else [] %} + {% set max_gpu_pct = namespace(val=-1.0) %} + {% for ng in nvidia_gpus %} + {% set gp = ng.get('utilization_percent', '0')|float %} + {% if gp > max_gpu_pct.val %} + {% set max_gpu_pct.val = gp %} + {% endif %} + {% endfor %} + {% for ig in intel_gpus %} + {% set gp = ig.get('utilization_percent', '0')|float %} + {% if gp > max_gpu_pct.val %} + {% set max_gpu_pct.val = gp %} + {% endif %} + {% endfor %} {% set cpu_pct = cpu.get('usage_percent', '0')|float %} {% set mem_pct = mem.get('usage_percent', '0')|float %} {% set disk_usages = d.get('disk_usage', []) if d.get('disk_usage') else [] %} @@ -91,6 +106,26 @@ {% set _ = containers.append(ct) %} {% endif %} {% endfor %} + {% set ds_services = d.get('dropshell_service', []) if d.get('dropshell_service') else [] %} + {% set svc_container_names = [] %} + {% for svc in ds_services %} + {% if svc.get('container_name') %} + {% set _ = svc_container_names.append(svc.get('container_name')) %} + {% endif %} + {% endfor %} + {% set orphan_containers = [] %} + {% for ct in containers %} + {% set ct_name = ct.get('name', '') %} + {% set ns = namespace(is_svc=false) %} + {% for cn in svc_container_names %} + {% if ct_name.startswith(cn ~ '-') %} + {% set ns.is_svc = true %} + {% endif %} + {% endfor %} + {% if not ns.is_svc %} + {% set _ = orphan_containers.append(ct) %} + {% endif %} + {% endfor %} {% if server.is_online %}
@@ -124,8 +159,17 @@ {{ '%.0f'|format(max_temp.val) }}°
{% endif %} + {% if max_gpu_pct.val >= 0 %} +
+ GPU +
+
+
+ {{ '%.0f'|format(max_gpu_pct.val) }}% +
+ {% endif %} - {% if containers or child_vms %} + {% if ds_services or orphan_containers or containers or child_vms %}
{% for vm in child_vms %}
@@ -136,17 +180,57 @@ {% if vm_os %}{{ vm_os }}{% endif %} {{ vm.primary_ip or vm.hostname }}
+ {% set vm_ds = vm.details.get('dropshell_service', []) if vm.details and vm.details.get('dropshell_service') else [] %} {% set vm_cts = vm.details.get('container', []) if vm.details and vm.details.get('container') else [] %} + {% set vm_svc_cns = [] %} + {% for vs in vm_ds %} + {% if vs.get('container_name') %}{% set _ = vm_svc_cns.append(vs.get('container_name')) %}{% endif %} + {% endfor %} + {% for vs in vm_ds %} + {% set vs_cn = vs.get('container_name', '') %} + {% set vs_running = namespace(val=false) %} {% for vct in vm_cts %} + {% if vct.get('name', '').startswith(vs_cn ~ '-') and vct.get('status', '')|lower in ['running', 'started'] %} + {% set vs_running.val = true %} + {% endif %} + {% endfor %} +
+ + {{ vs.get('name') }} + SVC +
+ {% endfor %} + {% for vct in vm_cts %} + {% set vct_name = vct.get('name', '') %} + {% set vct_is_svc = namespace(val=false) %} + {% for cn in vm_svc_cns %} + {% if vct_name.startswith(cn ~ '-') %}{% set vct_is_svc.val = true %}{% endif %} + {% endfor %} + {% if not vct_is_svc.val %} {% set vct_up = vct.get('status', '')|lower in ['running', 'started'] %}
- {{ vct.get('name', vct.get('id', '?')) }} + {{ vct_name }} {% if vct.get('ip') %}{{ vct.get('ip') }}{% endif %}
+ {% endif %} {% endfor %} {% endfor %} + {% for svc in ds_services %} + {% set svc_cn = svc.get('container_name', '') %} + {% set svc_running = namespace(val=false) %} {% for ct in containers %} + {% if ct.get('name', '').startswith(svc_cn ~ '-') and ct.get('status', '')|lower in ['running', 'started'] %} + {% set svc_running.val = true %} + {% endif %} + {% endfor %} +
+ + {{ svc.get('name') }} + SVC +
+ {% endfor %} + {% for ct in orphan_containers %} {% set ct_up = ct.get('status', '')|lower in ['running', 'started'] %}
@@ -244,13 +328,31 @@ {% set gpus = d.get('gpu', []) if d.get('gpu') else [] %} - {% if gpus %} + {% if gpus or nvidia_gpus or intel_gpus %}

GPUs

- {% for gpu in gpus %} - + {% for ng in nvidia_gpus %} + + + + + + {% endfor %} + {% for ig in intel_gpus %} + + + + + + + {% endfor %} + {% if not nvidia_gpus and not intel_gpus %} + {% for gpu in gpus %} + + {% endfor %} + {% endif %}
GPU {{ loop.index0 }}{{ gpu.get('description', '-')|clean_gpu }}
{{ ng.get('name', 'NVIDIA GPU ' ~ loop.index0) }}{{ ng.get('utilization_percent', '-') }}%{{ ng.get('memory_used_mb', '-') }} / {{ ng.get('memory_total_mb', '-') }} MB{{ ng.get('temperature', '-') }}°C
Intel GPU {{ loop.index0 }}{{ ig.get('utilization_percent', '-') }}%{% if ig.get('frequency_mhz') %}{{ ig.get('frequency_mhz') }} MHz{% else %}-{% endif %}{% if ig.get('power_w') %}{{ ig.get('power_w') }} W{% else %}-{% endif %}
{{ gpu.get('description', '-')|clean_gpu }}
{% endif %} @@ -381,11 +483,17 @@
{% for ct in containers %} {% set ct_running = ct.get('status', '')|lower in ['running', 'started'] %} + {% set ct_svc_name = namespace(val='') %} + {% for svc in ds_services %} + {% if svc.get('container_name') and ct.get('name', '').startswith(svc.get('container_name') ~ '-') %} + {% set ct_svc_name.val = svc.get('name') %} + {% endif %} + {% endfor %}
- {{ ct.get('name', ct.get('id', '?')) }} - {{ ct.get('type', '')|upper }} + {% if ct_svc_name.val %}{{ ct_svc_name.val }}{% else %}{{ ct.get('name', ct.get('id', '?')) }}{% endif %} + {% if ct_svc_name.val %}SVC{% else %}{{ ct.get('type', '')|upper }}{% endif %}
{% if ct.get('image') %}
{{ ct.get('image') }}
diff --git a/setup-remote.sh b/setup-remote.sh index 57b4e32..7932533 100755 --- a/setup-remote.sh +++ b/setup-remote.sh @@ -169,12 +169,14 @@ install_packages() { debian) export DEBIAN_FRONTEND=noninteractive apt-get update -qq + apt-get install -y -qq sudo lm-sensors pciutils iproute2 util-linux intel-gpu-tools >/dev/null 2>&1 || \ apt-get install -y -qq sudo lm-sensors pciutils iproute2 util-linux >/dev/null # Auto-detect sensor modules sensors-detect --auto /dev/null 2>&1 || true ;; alpine) apk update --quiet + apk add --quiet sudo lm-sensors pciutils iproute2 util-linux bash intel-gpu-tools 2>/dev/null || \ apk add --quiet sudo lm-sensors pciutils iproute2 util-linux bash # Auto-detect sensor modules sensors-detect --auto /dev/null 2>&1 || true @@ -215,7 +217,7 @@ SUDOERS_FILE="/etc/sudoers.d/infmap" SUDO_CMDS="" # Detect which hypervisor tools are present (check common sbin paths too) -for cmd in pct qm lxc virsh docker; do +for cmd in pct qm lxc virsh docker intel_gpu_top; do cmd_path=$(command -v "$cmd" 2>/dev/null || true) # Also check sbin paths not always in PATH if [ -z "$cmd_path" ]; then