olimp/roles/proxmox_monitoring/templates/storcli_metrics.sh.j2
Administrator a948ee74a8 Update 12 files
- /roles/proxmox_monitoring/handlers/main.yml
- /roles/proxmox_monitoring/tasks/main.yml
- /roles/proxmox_monitoring/templates/node_exporter.service.j2
- /roles/proxmox_monitoring/templates/storcli_metrics.sh.j2
- /roles/proxmox_monitoring/templates/pve_exporter_config.yml.j2
- /roles/proxmox_base_setup/tasks/main.yml
- /roles/grafana/templates/docker-compose.yml.j2
- /roles/grafana/files/vmagent.yaml
- /roles/base_setup/tasks/main.yml
- /roles/base_setup/handlers/main.yml
- /group_vars/all.yml
- /olimp-deploy.yml
2025-11-18 19:57:51 +00:00

79 lines
3.5 KiB
Django/Jinja

#!/bin/bash
#
# Collect MegaRAID health data through storcli and publish it as Prometheus
# metrics for the node_exporter textfile collector.
#
# The metrics are assembled in a temporary file next to OUT_FILE and renamed
# into place at the very end, so a scrape never observes a half-written file.
# When storcli itself is unavailable the previously published file is left
# untouched and the script exits non-zero.
#
# Optional environment overrides (the defaults are the original fixed paths):
#   STORCLI   storcli64 executable
#   OUT_FILE  destination .prom file
#
# NOTE(review): the .j2 file name suggests this script is rendered as a Jinja
# template - keep brace-hash, brace-percent and double-brace sequences out of
# the shell code.

STORCLI="${STORCLI:-/opt/MegaRAID/storcli/storcli64}"
OUT_FILE="${OUT_FILE:-/var/lib/node_exporter/textfile_collector/storcli.prom}"

# Print the HELP/TYPE preamble of every exported metric family to stdout.
print_metric_headers() {
  printf '%s\n' \
    "# HELP storcli_disk_temp Temperature of physical disk in Celsius" \
    "# TYPE storcli_disk_temp gauge" \
    "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild" \
    "# TYPE storcli_disk_state gauge" \
    "# HELP storcli_disk_media_error_count Media error count" \
    "# TYPE storcli_disk_media_error_count counter" \
    "# HELP storcli_disk_other_error_count Other error count" \
    "# TYPE storcli_disk_other_error_count counter" \
    "# HELP storcli_array_state Virtual drive state: 0=Offline,1=Online,2=Degraded,3=Failed" \
    "# TYPE storcli_array_state gauge"
}

# Read storcli "key = value" text on stdin and print the first integer found
# after "=" on the first line containing the label given as $1. Prints nothing
# when the label is missing or its value holds no digits, so callers never
# write a non-numeric sample into the metrics file.
first_number_after_eq() {
  awk -F= -v label="$1" '
    index($0, label) {
      if (match($2, /[0-9]+/)) print substr($2, RSTART, RLENGTH)
      exit
    }
  '
}

# Print one storcli_disk_state sample per drive row of enclosure 252.
print_disk_states() {
  "$STORCLI" /c0/eall/sall show all | awk '
    # Rows of interest start with "252:<slot>" followed by DID and state.
    # [ \t] is used instead of \s because \s is a GNU awk extension.
    /^252:[0-9]+[ \t]+/ {
      split($1, parts, ":")
      slot = parts[2]
      did = $2
      state = $3
      # NOTE(review): any state not listed below is reported as Online (2),
      # exactly as before - confirm that is wanted for states such as UBad.
      state_num = 2
      if (state == "Offln") state_num = 0
      else if (state == "UGood") state_num = 1
      else if (state == "Failed") state_num = 3
      # storcli abbreviates a rebuilding drive as Rbld; the long spelling is
      # still accepted for compatibility.
      else if (state == "Rbld" || state == "Rebuild") state_num = 4
      printf "storcli_disk_state{controller=\"0\",enclosure=\"252\",slot=\"%s\",did=\"%s\",model=\"ST8000NM0075\"} %s\n", slot, did, state_num
    }
  '
}

# Append temperature and error-counter samples for slots 0-3 to the file
# given as $1, logging progress to stdout.
append_disk_details() {
  local out=$1
  local slot disk_info temp media_errors other_errors

  for slot in 0 1 2 3; do
    echo "Processing disk slot $slot..."
    disk_info=$("$STORCLI" "/c0/e252/s$slot" show all)

    # The temperature sample is omitted when storcli reports no number.
    temp=$(first_number_after_eq "Drive Temperature" <<< "$disk_info")
    if [[ -n "$temp" ]]; then
      echo "storcli_disk_temp{controller=\"0\",enclosure=\"252\",slot=\"$slot\",model=\"ST8000NM0075\"} $temp" >> "$out"
      echo " Temperature: $temp°C"
    fi

    # Error counters fall back to 0 when the field is missing or non-numeric.
    media_errors=$(first_number_after_eq "Media Error Count" <<< "$disk_info")
    other_errors=$(first_number_after_eq "Other Error Count" <<< "$disk_info")
    echo "storcli_disk_media_error_count{controller=\"0\",enclosure=\"252\",slot=\"$slot\",model=\"ST8000NM0075\"} ${media_errors:-0}" >> "$out"
    echo "storcli_disk_other_error_count{controller=\"0\",enclosure=\"252\",slot=\"$slot\",model=\"ST8000NM0075\"} ${other_errors:-0}" >> "$out"
    echo " Media errors: ${media_errors:-0}, Other errors: ${other_errors:-0}"
  done
}

# Append the storcli_array_state sample for virtual drive 0/0 to the file
# given as $1. Returns 1 without writing a sample when storcli shows no row
# for that drive, so a broken query is never published as a healthy array.
append_array_state() {
  local out=$1
  local array_state state_num

  array_state=$("$STORCLI" /c0/vall show | awk '/^0\/0/ { print $3; exit }')
  if [[ -z "$array_state" ]]; then
    echo "no state reported for virtual drive 0/0; storcli_array_state not exported" >&2
    return 1
  fi

  # Anything not matched below (for example Optl) counts as Online/Optimal.
  state_num=1
  case "$array_state" in
    Dgrd) state_num=2 ;;
    # storcli prints OfLn for an offline virtual drive; Offln is kept for
    # compatibility.
    # NOTE(review): Pdgd (partially degraded) is reported as 0 like before -
    # confirm it should not be 2.
    OfLn | Offln | Pdgd) state_num=0 ;;
    # NOTE(review): "Ft" is carried over unchanged - confirm the token.
    Ft) state_num=3 ;;
  esac

  echo "storcli_array_state{controller=\"0\",vd=\"0\"} $state_num" >> "$out"
  echo "Array state: $array_state ($state_num)"
}

# Build the metrics in a temporary file and rename it over OUT_FILE.
# Returns 0 when everything was exported, 1 otherwise.
main() {
  local tmp_file
  local rc=0

  # Refuse to run without storcli instead of publishing default values.
  if ! command -v "$STORCLI" > /dev/null 2>&1; then
    echo "storcli not found or not executable: $STORCLI" >&2
    return 1
  fi

  # Same directory as OUT_FILE so the final mv is a plain rename; the name
  # does not end in .prom, so the collector ignores it while it is written.
  tmp_file=$(mktemp "${OUT_FILE}.XXXXXX") || {
    echo "cannot create temporary file for $OUT_FILE" >&2
    return 1
  }

  print_metric_headers >> "$tmp_file"
  print_disk_states >> "$tmp_file"
  append_disk_details "$tmp_file"
  append_array_state "$tmp_file" || rc=1

  # mktemp creates the file with owner-only permissions; open it up for the
  # exporter before it becomes visible under its final name.
  if ! { chmod 644 "$tmp_file" && mv -f "$tmp_file" "$OUT_FILE"; }; then
    rm -f "$tmp_file"
    echo "cannot publish $OUT_FILE" >&2
    return 1
  fi

  if [[ "$rc" -ne 0 ]]; then
    echo "$(date -Iseconds) storcli metrics updated with errors" >&2
    return 1
  fi
  echo "$(date -Iseconds) storcli metrics updated successfully"
}

main "$@"