Update 2 files
- /roles/proxmox_base_setup/templates/storcli_metrics.sh.j2 - /roles/proxmox_base_setup/templates/node_exporter.service.j2
This commit is contained in:
parent
906555ab4b
commit
94d0108766
@ -18,14 +18,21 @@ ExecStart=/usr/local/bin/node_exporter \
|
||||
--collector.loadavg \
|
||||
--collector.time \
|
||||
--collector.textfile.directory=/var/lib/node_exporter/textfile_collector \
|
||||
--web.listen-address=0.0.0.0:9100
|
||||
--web.listen-address=0.0.0.0:9100 \
|
||||
--web.telemetry-path=/metrics
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
# Protection against attacks and accidental errors
|
||||
|
||||
# Security hardening (Proxmox-compatible)
|
||||
NoNewPrivileges=yes
|
||||
ProtectSystem=strict
|
||||
ProtectHome=yes
|
||||
PrivateTmp=yes
|
||||
ProtectControlGroups=yes
|
||||
ProtectKernelModules=yes
|
||||
ProtectKernelTunables=yes
|
||||
LockPersonality=yes
|
||||
RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -1,23 +1,91 @@
|
||||
#!/bin/bash
|
||||
if (enc != "" && slot != "") {
|
||||
# Print the previous disk
|
||||
gsub(/ /, "_", model);
|
||||
printf "storcli_disk_temp{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, (temp ~ /^[0-9]+$/ ? temp : 0)
|
||||
# Collects MegaRAID metrics via storcli 1.17.08 → textfile_collector
|
||||
# Compatible with Proxmox VE 9.0
|
||||
# Version: 2025-11-14
|
||||
|
||||
# Strict mode: abort on unhandled command failures, on use of unset
# variables, and when any stage of a pipeline fails.
set -euo pipefail

# Final metrics file (the name the finished output is renamed to).
FINAL_FILE="/var/lib/node_exporter/textfile_collector/storcli.prom"

# Scratch file the metrics are assembled in. It lives next to FINAL_FILE so
# the final rename never crosses a filesystem boundary. mktemp chooses an
# unpredictable name and creates the file exclusively, so a pre-planted file
# or symlink at a guessable "storcli.prom.<pid>" path cannot be written
# through.
OUT_FILE=$(mktemp "${FINAL_FILE}.XXXXXX")

# Remove the scratch file on every exit path (after a successful rename it no
# longer exists and `rm -f` is a no-op).
trap 'rm -f -- "$OUT_FILE"' EXIT

# mktemp creates the file with mode 0600; make it world-readable right away so
# it already carries its final permissions when it is renamed into place.
chmod 644 "$OUT_FILE"
|
||||
|
||||
# Start the scratch file with the HELP/TYPE preamble of every metric family
# this script exports. This write truncates "$OUT_FILE"; later sections append.
# The heredoc delimiter is quoted, so the text is emitted literally.
cat > "$OUT_FILE" <<'PROM_HEADER'
# HELP storcli_disk_temp Temperature of physical disk in Celsius
# TYPE storcli_disk_temp gauge
# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild,5=... (see docs)
# TYPE storcli_disk_state gauge
# HELP storcli_disk_media_error_count Media error count
# TYPE storcli_disk_media_error_count counter
# HELP storcli_disk_other_error_count Other error count
# TYPE storcli_disk_other_error_count counter
# HELP storcli_array_state Virtual drive state: 0=Offline,1=Online,2=Degraded,3=Failed
# TYPE storcli_array_state gauge
# HELP storcli_controller_temp Controller temperature (°C), if available
# TYPE storcli_controller_temp gauge
# HELP storcli_controller_bbu_state BBU state: 0=Missing/Failed,1=Good,2=Degraded
# TYPE storcli_controller_bbu_state gauge
PROM_HEADER
|
||||
|
||||
# Build the list of controller indices from `storcli /call show` (the brief
# listing in storcli 1.17): every line containing "Controller <digits>"
# contributes its second whitespace-separated field, provided that field is
# purely numeric.
#
# The numeric filter is done inside awk (which exits 0 even when nothing
# matches) and the pipeline is guarded with `|| true`. Under
# `set -euo pipefail` a non-matching `grep` stage or a failing storcli would
# otherwise abort the script at this assignment, and the "no controllers"
# branch below could never run.
controllers=$(
  storcli /call show \
    | awk '/Controller [0-9]+/ && $2 ~ /^[0-9]+$/ { print $2 }'
) || true

if [[ -z "$controllers" ]]; then
  # Nothing to report: make sure the metrics file exists and stop cleanly.
  # The EXIT trap discards the scratch file.
  echo "# No controllers found" >&2
  touch "$FINAL_FILE"
  exit 0
fi
|
||||
|
||||
for c in $controllers; do
|
||||
# === Controller: temperature and BBU ===
# Original author's note: JSON mode in storcli 1.17.08 is limited, but
# `/c0 show J` still returns the basic fields.
# A failing storcli call here remains fatal under `set -e`, as before.
ctl_info=$(storcli "/c${c}" show J)

# Temperature: the first number on the first line mentioning
# "Controller Temperature" (case-insensitive), e.g.
#   "Controller Temperature (C): 58"
# `|| true` keeps a controller that reports no temperature from aborting the
# script: with `pipefail`, a grep that matches nothing fails the whole
# pipeline, and `set -e` would then exit before the emptiness check below.
ctl_temp=$(
  grep -i 'Controller Temperature' <<<"$ctl_info" \
    | grep -oE '[0-9]+' \
    | head -n 1 \
    || true
)
if [[ -n "$ctl_temp" ]]; then
  printf 'storcli_controller_temp{controller="%s"} %s\n' "$c" "$ctl_temp" >> "$OUT_FILE"
fi

# BBU state: look at the first line matching "BBU.*Status"
# (e.g. "BBU Status: Optimal" / "Failed" / "Missing").
#   Optimal|Good  -> 1
#   Degraded|Weak -> 2
#   anything else, including no BBU line at all -> 0
# Same `|| true` guard: a missing BBU line must yield 0, not abort the run.
bbu_line=$(grep -i -m 1 'BBU.*Status' <<<"$ctl_info" || true)
bbu_state=0
if [[ "$bbu_line" =~ Optimal|Good ]]; then
  bbu_state=1
elif [[ "$bbu_line" =~ Degraded|Weak ]]; then
  bbu_state=2
fi
printf 'storcli_controller_bbu_state{controller="%s"} %s\n' "$c" "$bbu_state" >> "$OUT_FILE"
|
||||
|
||||
# === Physical disks ===
|
||||
# In 1.17.08, /c0/eall/sall show prints a table without headers, so it is parsed block by block
|
||||
storcli /c$c/eall/sall show | awk -v c="$c" '
|
||||
BEGIN {
|
||||
enc = -1; slot = -1; model = "Unknown"; temp = 0; state_num = 0;
|
||||
media_err = 0; other_err = 0; in_pd = 0
|
||||
}
|
||||
/^PD:/ {
|
||||
if (enc != -1 && slot != -1) {
|
||||
gsub(/ /, "_", model);
|
||||
printf "storcli_disk_temp{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, temp
|
||||
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, state_num
|
||||
printf "storcli_disk_media_error_count{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, media_err
|
||||
printf "storcli_disk_other_error_count{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, other_err
|
||||
}
|
||||
enc=""; slot=""; model=""; temp=""; media_err=0; other_err=0; state_num=0
|
||||
enc = -1; slot = -1; model = "Unknown"; temp = 0; state_num = 0;
|
||||
media_err = 0; other_err = 0; in_pd = 1
|
||||
}
|
||||
/Enclosure Device ID:/ { enc = $4 }
|
||||
/Slot Number:/ { slot = $3 }
|
||||
/Device Id:/ { if (enc == "") enc = $3 } # fallback
|
||||
/Model Number:/ { model = $3 }
|
||||
/Drive Temperature/ {
|
||||
match($0, /([0-9]+)C/);
|
||||
if (RSTART) temp = substr($0, RSTART, RLENGTH-1)
|
||||
in_pd && /Enclosure Device ID:/ { enc = $4 }
|
||||
in_pd && /Slot Number:/ { slot = $3 }
|
||||
in_pd && /Device Id:/ { if (enc == -1) enc = $3 } # fallback
|
||||
in_pd && /Model Number:/ { model = $3 }
|
||||
in_pd && /Drive Temperature.*C/ {
|
||||
match($0, /([0-9]+)C/);
|
||||
if (RSTART) temp = substr($0, RSTART, RLENGTH-1)
|
||||
}
|
||||
/Firmware state:/ {
|
||||
in_pd && /Firmware state:/ {
|
||||
state = $3
|
||||
if (state ~ /Online/) state_num = 2
|
||||
else if (state ~ /Unconfigured.*Good/) state_num = 1
|
||||
@ -26,39 +94,59 @@
|
||||
else if (state ~ /Offline/) state_num = 0
|
||||
else state_num = -1
|
||||
}
|
||||
/Media Error Count:/ { media_err = $4 }
|
||||
/Other Error Count:/ { other_err = $4 }
|
||||
in_pd && /Media Error Count:/ { media_err = $4 }
|
||||
in_pd && /Other Error Count:/ { other_err = $4 }
|
||||
END {
|
||||
if (enc != "" && slot != "") {
|
||||
if (enc != -1 && slot != -1) {
|
||||
gsub(/ /, "_", model);
|
||||
printf "storcli_disk_temp{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, (temp ~ /^[0-9]+$/ ? temp : 0)
|
||||
printf "storcli_disk_temp{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, temp
|
||||
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, state_num
|
||||
printf "storcli_disk_media_error_count{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, media_err
|
||||
printf "storcli_disk_other_error_count{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",model=\"%s\"} %s\n", c, enc, slot, model, other_err
|
||||
}
|
||||
}
|
||||
' c="$c")
|
||||
' >> "$OUT_FILE"
|
||||
|
||||
# === Virtual drives (arrays) ===
# Emits one storcli_array_state sample per virtual drive of controller $c.
#
# State codes (the same in both parsers below):
#   Optl / Onln          -> 1  (online)
#   Dgrd                 -> 2  (degraded)
#   Offln / OfLn / Pdgd  -> 0  (offline)
#   Ft                   -> 3  (failed)
#   anything else        -> -1 (unknown; never reported as healthy)
#
# The rows are collected into a variable first and appended to "$OUT_FILE"
# only once a parser has finished, so a parser that dies half-way cannot leave
# partial rows behind that the fallback would then duplicate.
#
# This single pass replaces the earlier duplicate pass, which appended an
# unset "$disks_info" (fatal under `set -u`) and ran an awk program that never
# received the controller id.
#
# NOTE(review): storcli JSON is expected to nest the payload under the key
# "Response Data" (with a space) with the rows in a "Virtual Drives" array,
# and the text table is expected to start each row with "DG/VD" (e.g. "0/0")
# and to abbreviate offline as "OfLn". Both these layouts and the ones the
# previous code assumed are accepted - confirm against real 1.17.08 output.
vd_metrics=""

# Preferred path: JSON output parsed with jq, when jq is installed.
# Any jq failure (bad JSON, unexpected schema) simply discards its output.
if command -v jq >/dev/null 2>&1; then
  vd_metrics=$(
    storcli "/c${c}/vall" show J | jq -r --arg c "$c" '
      (.Controllers[0] // {})
      | (.["Response Data"] // .Response.Data // {})
      | to_entries[]
      | select(.key | test("^Virtual Drives$|VD[0-9]+"))
      | .value
      | arrays
      | .[]
      | objects
      | select(has("DG/VD"))
      | (.["DG/VD"] | split("/")[1]) as $vd
      | .State as $state
      | .Size as $size
      | (if   $state == "Optl" or $state == "Onln" then 1
         elif $state == "Dgrd" then 2
         elif $state == "Offln" or $state == "OfLn" or $state == "Pdgd" then 0
         elif $state == "Ft" then 3
         else -1 end) as $state_num
      | "storcli_array_state{controller=\"\($c)\",vd=\"\($vd)\",size=\"\($size)\"} \($state_num)"
    ' 2>/dev/null
  ) || vd_metrics=""
fi

# Fallback without jq (or when the JSON path produced nothing): parse the
# text table. The first 8 lines are skipped as header, as before. As in the
# previous fallback, a failing storcli call here is fatal under `set -e`.
if [[ -z "$vd_metrics" ]]; then
  vd_metrics=$(
    storcli "/c${c}/vall" show | awk -v c="$c" '
      function state_code(s) {
        if (s == "Optl" || s == "Onln") return 1
        if (s == "Dgrd") return 2
        if (s ~ /Offln|OfLn|Pdgd/) return 0
        if (s == "Ft") return 3
        return -1
      }
      function emit(vd_id, vd_size, code) {
        printf "storcli_array_state{controller=\"%s\",vd=\"%s\",size=\"%s\"} %s\n", c, vd_id, vd_size, code
      }
      NR <= 8 { next }
      # Layout assumed by the previous parser: "<vd> <state> <size> ..."
      $1 ~ /^[0-9]+$/ {
        size = $3
        gsub(/,/, "", size)
        emit($1, size, state_code($2))
        next
      }
      # Layout "<dg>/<vd> <type> <state> ... <number> <unit> [name]"
      $1 ~ /^[0-9]+\/[0-9]+$/ {
        split($1, id, "/")
        size = ""
        for (i = 4; i < NF; i++) {
          if ($i ~ /^[0-9.]+$/ && $(i + 1) ~ /^[KMGTP]B$/) {
            size = $i " " $(i + 1)
            break
          }
        }
        emit(id[2], size, state_code($3))
      }
    '
  )
fi

# Append only when there is something to report (no stray blank line).
if [[ -n "$vd_metrics" ]]; then
  printf '%s\n' "$vd_metrics" >> "$OUT_FILE"
fi
|
||||
done
|
||||
|
||||
# Atomic replace: set the final permissions on the scratch file first, then
# rename it over the published file. Doing chmod before mv means the metrics
# file already has mode 644 the moment it appears under its final name, so a
# reader never sees it with the scratch file's creation mode.
chmod 644 "$OUT_FILE"
mv -f "$OUT_FILE" "$FINAL_FILE"

# Log completion once, to stderr.
echo "$(date -Iseconds) storcli metrics updated successfully" >&2
|
||||
Loading…
Reference in New Issue
Block a user