olimp/roles/proxmox_base_setup/templates/storcli_metrics.sh.j2
Administrator 94d0108766 Update 2 files
- /roles/proxmox_base_setup/templates/storcli_metrics.sh.j2
- /roles/proxmox_base_setup/templates/node_exporter.service.j2
2025-11-14 07:18:41 +00:00

152 lines
6.5 KiB
Django/Jinja
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Collect MegaRAID metrics via storcli 1.17.08 -> textfile_collector
# Compatible with Proxmox VE 9.0
# Version: 2025-11-14
set -euo pipefail

# Metrics are staged in a PID-suffixed sibling of the final file and the
# staging file is removed on every exit path.
FINAL_FILE="/var/lib/node_exporter/textfile_collector/storcli.prom"
OUT_FILE="${FINAL_FILE}.$$"
trap 'rm -f "$OUT_FILE"' EXIT

# Start the staging file with the HELP/TYPE preamble for every metric family.
# The quoted delimiter keeps the text literal (no expansion).
cat > "$OUT_FILE" <<'EOF'
# HELP storcli_disk_temp Temperature of physical disk in Celsius
# TYPE storcli_disk_temp gauge
# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild,5=... (see docs)
# TYPE storcli_disk_state gauge
# HELP storcli_disk_media_error_count Media error count
# TYPE storcli_disk_media_error_count counter
# HELP storcli_disk_other_error_count Other error count
# TYPE storcli_disk_other_error_count counter
# HELP storcli_array_state Virtual drive state: 0=Offline,1=Online,2=Degraded,3=Failed
# TYPE storcli_array_state gauge
# HELP storcli_controller_temp Controller temperature (°C), if available
# TYPE storcli_controller_temp gauge
# HELP storcli_controller_bbu_state BBU state: 0=Missing/Failed,1=Good,2=Degraded
# TYPE storcli_controller_bbu_state gauge
EOF
# Discover controller indexes from `storcli /call show`.
#
# Result: $controllers holds one numeric id per line (deduplicated, in order
# of first appearance). When nothing is found the script touches FINAL_FILE
# and exits 0.
#
# Why this shape:
#  - The script runs under `set -euo pipefail`. A grep stage that matches
#    nothing would fail the whole command substitution and abort the script
#    before the empty check below, so all filtering is done in awk (which
#    exits 0 on no match).
#  - A non-zero exit from storcli is tolerated here so that the
#    "no controllers" path is reachable; a missing binary is reported
#    separately and is fatal.
#  - Both "Controller 0" and the "Controller = 0" banner form are accepted.
#    NOTE(review): confirm which form storcli 1.17.08 actually prints.
if ! command -v storcli >/dev/null 2>&1; then
  echo "storcli not found in PATH" >&2
  exit 1
fi
controllers=$(
  { storcli /call show || true; } | awk '
    { id = "" }
    /Controller [0-9]+/ && $2 ~ /^[0-9]+$/ { id = $2 }
    $1 == "Controller" && $2 == "=" && $3 ~ /^[0-9]+$/ { id = $3 }
    id != "" && !seen[id]++ { print id }
  '
)
if [[ -z "$controllers" ]]; then
  echo "# No controllers found" >&2
  # touch only creates the file or refreshes its mtime; previous contents
  # (if any) are left untouched.
  touch "$FINAL_FILE"
  exit 0
fi
# ---------------------------------------------------------------------------
# Per-controller collection: helpers first, then the loop that uses them.
# All helpers are pure text filters so they are safe under `set -euo pipefail`
# (no stage fails merely because a field is absent).
# NOTE: this file is rendered as a Jinja template, so the code below avoids
# an opening brace directly followed by a percent sign or hash.
# ---------------------------------------------------------------------------

#######################################
# Extract the controller temperature from controller info text.
# Arguments: none (reads the info text on stdin)
# Outputs:   first integer found on a line containing
#            "Controller Temperature" (case-insensitive); nothing if absent
# Returns:   0 always
#######################################
storcli_controller_temp() {
  awk '
    tolower($0) ~ /controller temperature/ && match($0, /[0-9]+/) {
      print substr($0, RSTART, RLENGTH)
      exit
    }
  '
}

#######################################
# Map the first "BBU ... Status" line of controller info to a numeric state.
# Arguments: $1 - controller info text
# Outputs:   1 for Optimal/Good, 2 for Degraded/Weak, 0 otherwise
#            (including when no BBU status line exists)
# Returns:   0 always
#######################################
storcli_bbu_state() {
  local info=$1
  local line
  # `|| true`: a missing BBU line is a normal case, not an error.
  line=$(grep -i -m 1 'BBU.*Status' <<<"$info" || true)
  if [[ "$line" =~ Optimal|Good ]]; then
    echo 1
  elif [[ "$line" =~ Degraded|Weak ]]; then
    echo 2
  else
    echo 0
  fi
}

#######################################
# Convert a physical-disk listing into Prometheus samples.
# Input format: blocks that start with a line beginning "PD:" followed by
# "Key: value" lines (Enclosure Device ID, Slot Number, Device Id,
# Model Number, Drive Temperature, Firmware state, Media/Other Error Count).
# NOTE(review): confirm this matches what `storcli /cN/eall/sall show`
# prints on the target hosts.
# Arguments: $1 - controller id used for the controller label
# Outputs:   storcli_disk_* samples on stdout; a disk is emitted only when
#            both enclosure and slot were parsed; the temperature sample is
#            emitted only when a temperature was parsed
# Returns:   awk exit status
#######################################
storcli_pd_metrics() {
  awk -v c="$1" '
    function reset_pd() {
      enc = -1; slot = -1; model = "Unknown"; temp = ""
      state_num = 0; media_err = 0; other_err = 0
    }
    # Print all samples of the disk collected so far (if it is complete).
    function emit_pd(    labels) {
      if (enc == -1 || slot == -1) return
      labels = "controller=\"" c "\",enclosure=\"" enc "\",slot=\"" slot "\",model=\"" model "\""
      if (temp != "") print "storcli_disk_temp{" labels "} " temp
      print "storcli_disk_state{" labels "} " state_num
      print "storcli_disk_media_error_count{" labels "} " media_err
      print "storcli_disk_other_error_count{" labels "} " other_err
    }
    BEGIN { reset_pd(); in_pd = 0 }
    /^PD:/ { emit_pd(); reset_pd(); in_pd = 1 }
    in_pd && /Enclosure Device ID:/ { enc = $4 }
    in_pd && /Slot Number:/ { slot = $3 }
    in_pd && /Device Id:/ { if (enc == -1) enc = $3 } # fallback
    in_pd && /Model Number:/ {
      # Keep the whole model string (not only its first word), drop
      # characters that would break the label quoting, and join words
      # with underscores.
      v = $0
      sub(/^.*Model Number:[ \t]*/, "", v)
      sub(/[ \t\r]+$/, "", v)
      gsub(/[\\"]/, "", v)
      gsub(/[ \t]+/, "_", v)
      if (v != "") model = v
    }
    in_pd && /Drive Temperature.*C/ {
      # "30C" -> "30": the match includes the trailing C, hence RLENGTH-1.
      match($0, /([0-9]+)C/)
      if (RSTART) temp = substr($0, RSTART, RLENGTH - 1)
    }
    in_pd && /Firmware state:/ {
      # Compare case-insensitively on the full value so that
      # "Unconfigured(good), Spun Up" is recognised.
      state = $0
      sub(/^.*Firmware state:[ \t]*/, "", state)
      state = tolower(state)
      if (state ~ /online/) state_num = 2
      else if (state ~ /unconfigured.*good/) state_num = 1
      else if (state ~ /failed/) state_num = 3
      else if (state ~ /rebuild/) state_num = 4
      else if (state ~ /offline/) state_num = 0
      else state_num = -1
    }
    in_pd && /Media Error Count:/ { media_err = $4 }
    in_pd && /Other Error Count:/ { other_err = $4 }
    END { emit_pd() }
  '
}

#######################################
# Convert `storcli /cN/vall show J` JSON into storcli_array_state samples.
# Accepts the payload under either "Response Data" or Response.Data, and
# VD rows under either a "Virtual Drives" key or keys containing VD<number>.
# State mapping (case-insensitive): Optl/Onln=1, Dgrd=2, OfLn/Offln/Pdgd=0,
# Ft=3, anything else=-1.
# NOTE(review): Pdgd is reported as 0 exactly like before; confirm that
# "partially degraded" should read as offline rather than degraded.
# Arguments: $1 - controller id used for the controller label
# Outputs:   one sample per virtual drive on stdout
# Returns:   jq exit status (non-zero on malformed/unexpected JSON)
#######################################
storcli_vd_metrics_json() {
  jq -r --arg c "$1" '
    .Controllers[0]
    | (.["Response Data"] // .Response.Data)
    | to_entries[]
    | select(.key | test("^Virtual Drives$|VD[0-9]+"))
    | .value
    | select(type == "array")
    | .[]
    | select(type == "object" and has("DG/VD"))
    | .["DG/VD"] as $dgvd
    | .Size as $size
    | (.State | tostring | ascii_downcase) as $st
    | ($dgvd | split("/")[1]) as $vd
    | (if   $st == "optl" or $st == "onln" then 1
       elif $st == "dgrd" then 2
       elif $st == "ofln" or $st == "offln" or $st == "pdgd" then 0
       elif $st == "ft" then 3
       else -1 end) as $state_num
    | "storcli_array_state{controller=\"\($c)\",vd=\"\($vd)\",size=\"\($size)\"} \($state_num)"
  '
}

#######################################
# Convert the plain-text `storcli /cN/vall show` table into
# storcli_array_state samples (used when jq is unavailable or yields nothing).
# The first 8 input lines are skipped. Two row shapes are understood:
#   "<dg>/<vd> <type> <state> ... <number> <KB|MB|GB|TB|PB> ..."
#   "<vd> <state> <size>"
# Unknown states map to -1, consistent with the JSON path.
# Arguments: $1 - controller id used for the controller label
# Outputs:   one sample per recognised row on stdout
# Returns:   awk exit status
#######################################
storcli_vd_metrics_table() {
  awk -v c="$1" '
    function vd_state_num(s) {
      s = tolower(s)
      if (s == "optl" || s == "onln") return 1
      if (s == "dgrd") return 2
      if (s == "ofln" || s == "offln" || s == "pdgd") return 0
      if (s == "ft") return 3
      return -1
    }
    function emit_vd(vd, state, size) {
      printf "storcli_array_state{controller=\"%s\",vd=\"%s\",size=\"%s\"} %s\n", c, vd, size, vd_state_num(state)
    }
    NR <= 8 { next }
    $1 ~ /^[0-9]+\/[0-9]+$/ {
      split($1, dgvd, "/")
      size = ""
      # The size column is located by its unit token because the number of
      # columns before it is not fixed.
      for (i = 5; i <= NF; i++) {
        if ($i ~ /^(KB|MB|GB|TB|PB)$/ && $(i - 1) ~ /^[0-9.]+$/) {
          size = $(i - 1) " " $i
          break
        }
      }
      emit_vd(dgvd[2], $3, size)
      next
    }
    $1 ~ /^[0-9]+$/ {
      size = $3
      gsub(/,/, "", size)
      emit_vd($1, $2, size)
    }
  '
}

# Word-splitting of the id list is intentional: ids are whitespace-separated
# numbers.
for c in ${controllers:-}; do
  # === Controller: temperature and BBU ===
  ctl_info=$(storcli "/c${c}" show J)

  ctl_temp=$(storcli_controller_temp <<<"$ctl_info")
  if [[ -n "$ctl_temp" ]]; then
    printf 'storcli_controller_temp{controller="%s"} %s\n' "$c" "$ctl_temp" >> "$OUT_FILE"
  fi

  bbu_state=$(storcli_bbu_state "$ctl_info")
  printf 'storcli_controller_bbu_state{controller="%s"} %s\n' "$c" "$bbu_state" >> "$OUT_FILE"

  # === Physical disks ===
  storcli "/c${c}/eall/sall" show | storcli_pd_metrics "$c" >> "$OUT_FILE"

  # === Virtual drives (arrays) ===
  # The JSON result is captured first and appended only once it is complete,
  # so a jq failure halfway through cannot leave partial lines that the
  # fallback would then duplicate. An empty JSON result also triggers the
  # table fallback.
  vd_metrics=""
  if command -v jq >/dev/null 2>&1; then
    vd_metrics=$(storcli "/c${c}/vall" show J | storcli_vd_metrics_json "$c" 2>/dev/null) || vd_metrics=""
  fi
  if [[ -z "$vd_metrics" ]]; then
    vd_metrics=$(storcli "/c${c}/vall" show | storcli_vd_metrics_table "$c")
  fi
  if [[ -n "$vd_metrics" ]]; then
    printf '%s\n' "$vd_metrics" >> "$OUT_FILE"
  fi
done
# Publish the staged metrics.
# Permissions are set on the staging file BEFORE the rename so the file is
# already world-readable at the moment it appears under its final name;
# doing chmod after mv leaves a window in which the published file still has
# whatever mode the staging file was created with.
chmod 644 "$OUT_FILE"
mv "$OUT_FILE" "$FINAL_FILE"
echo "$(date -Iseconds) storcli metrics updated successfully" >&2