Update file main.yml
This commit is contained in:
parent
495898cf00
commit
293b854b43
@ -1,31 +1,12 @@
|
||||
---
|
||||
# ========== 1. Установка storcli (если отсутствует) ==========
|
||||
- name: Check if storcli is already installed
|
||||
stat:
|
||||
path: /opt/MegaRAID/storcli/storcli64
|
||||
register: storcli_installed
|
||||
|
||||
- name: Download storcli
|
||||
get_url:
|
||||
url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
|
||||
dest: /tmp/storcli.deb
|
||||
mode: '0644'
|
||||
when: not storcli_installed.stat.exists
|
||||
|
||||
- name: Install storcli
|
||||
# ========== 1. Preparation: install sudo and dependencies ==========
|
||||
- name: Ensure sudo is installed (required for privilege escalation)
|
||||
apt:
|
||||
deb: /tmp/storcli.deb
|
||||
name: sudo
|
||||
state: present
|
||||
when: not storcli_installed.stat.exists
|
||||
update_cache: yes
|
||||
become: yes
|
||||
|
||||
- name: Verify storcli works
|
||||
command: /opt/MegaRAID/storcli/storcli64 /c0 show
|
||||
register: storcli_test
|
||||
changed_when: false
|
||||
failed_when: storcli_test.rc != 0
|
||||
|
||||
|
||||
# ========== 2. Настройка pve_exporter (Python) ==========
|
||||
- name: Install Python dependencies
|
||||
apt:
|
||||
name:
|
||||
@ -33,13 +14,45 @@
|
||||
- python3-pip
|
||||
- python3-venv
|
||||
state: present
|
||||
become: yes
|
||||
|
||||
# ========== 2. Install storcli (if missing) ==========
|
||||
# Probe for the storcli binary; the result is registered as
# `storcli_installed` so other tasks can test `storcli_installed.stat.exists`.
- name: Check if storcli is already installed
  become: true
  stat:
    path: /opt/MegaRAID/storcli/storcli64
  register: storcli_installed
|
||||
|
||||
# Fetch the storcli .deb into /tmp. Skipped when the registered stat result
# says the binary already exists.
# NOTE(review): no `checksum` is set, so the download is not integrity-checked.
- name: Download storcli
  become: true
  when: not storcli_installed.stat.exists
  get_url:
    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
    dest: /tmp/storcli.deb
    mode: '0644'
|
||||
|
||||
# Install the local .deb at /tmp/storcli.deb; skipped when the binary was
# already found by the stat probe.
- name: Install storcli
  become: true
  when: not storcli_installed.stat.exists
  apt:
    deb: /tmp/storcli.deb
    state: present
|
||||
|
||||
# Smoke test: run `storcli64 /c0 show`. The task is never reported as changed
# and fails on any non-zero exit code.
- name: Verify storcli works
  become: true
  command: /opt/MegaRAID/storcli/storcli64 /c0 show
  register: storcli_test
  changed_when: false
  failed_when: storcli_test.rc != 0
|
||||
|
||||
# ========== 3. Configure pve_exporter (Python) ==========
|
||||
# Dedicated system account for the exporter: nologin shell, no home directory.
- name: Create pve_exporter user
  become: true
  user:
    name: pve_exporter
    system: true
    create_home: false
    shell: /usr/sbin/nologin
|
||||
|
||||
- name: Create pve_exporter directories
|
||||
file:
|
||||
@ -51,25 +64,37 @@
|
||||
loop:
|
||||
- /opt/pve_exporter
|
||||
- /opt/pve_exporter/config
|
||||
become: yes
|
||||
|
||||
- name: Deploy pve_exporter venv and install package
|
||||
command: |
|
||||
python3 -m venv /opt/pve_exporter/venv &&
|
||||
/opt/pve_exporter/venv/bin/pip install --upgrade pip &&
|
||||
/opt/pve_exporter/venv/bin/pip install prometheus-pve
|
||||
args:
|
||||
chdir: /opt/pve_exporter
|
||||
creates: /opt/pve_exporter/venv/bin/pve_exporter
|
||||
# Build the virtualenv as the pve_exporter user. `creates` makes the task a
# no-op once the venv's python binary exists.
- name: Create Python virtual environment
  become: true
  become_user: pve_exporter
  command:
    argv:
      - python3
      - '-m'
      - venv
      - /opt/pve_exporter/venv
    creates: /opt/pve_exporter/venv/bin/python
|
||||
|
||||
- name: Deploy pve_exporter config (with vault secrets)
|
||||
# Upgrade pip inside the exporter's virtualenv, running as the unprivileged
# pve_exporter user.
# A bare `command` task reports "changed" on every run. pip prints
# "Successfully installed ..." only when it actually replaced a package, so
# that marker is used to report `changed` accurately.
- name: Upgrade pip in virtual environment
  command:
    cmd: /opt/pve_exporter/venv/bin/pip install --upgrade pip
    chdir: /opt/pve_exporter
  become: true
  become_user: pve_exporter
  register: pve_exporter_pip_upgrade
  changed_when: "'Successfully installed' in pve_exporter_pip_upgrade.stdout"
|
||||
|
||||
# Install the exporter package into the virtualenv as the pve_exporter user.
# Without `changed_when` this task would report "changed" on every run; pip
# prints "Successfully installed ..." only when something was installed, so
# that marker drives the changed state.
# NOTE(review): the distribution name `prometheus-pve` is kept as-is; confirm
# it is the PyPI package that provides the exporter used by the service.
- name: Install prometheus-pve package
  command:
    cmd: /opt/pve_exporter/venv/bin/pip install prometheus-pve
    chdir: /opt/pve_exporter
  become: true
  become_user: pve_exporter
  register: pve_exporter_pip_install
  changed_when: "'Successfully installed' in pve_exporter_pip_install.stdout"
|
||||
|
||||
- name: Deploy pve_exporter config
|
||||
template:
|
||||
src: pve_exporter_config.yml.j2
|
||||
dest: /opt/pve_exporter/config/config.yml
|
||||
owner: pve_exporter
|
||||
group: pve_exporter
|
||||
mode: '0600'
|
||||
become: yes
|
||||
|
||||
- name: Create pve_exporter systemd service
|
||||
copy:
|
||||
@ -93,6 +118,7 @@
|
||||
WantedBy=multi-user.target
|
||||
dest: /etc/systemd/system/pve_exporter.service
|
||||
mode: '0644'
|
||||
become: yes
|
||||
|
||||
- name: Enable and start pve_exporter
|
||||
systemd:
|
||||
@ -100,9 +126,9 @@
|
||||
enabled: yes
|
||||
state: started
|
||||
daemon_reload: yes
|
||||
become: yes
|
||||
|
||||
|
||||
# ========== 3. RAID monitoring via storcli + node_exporter textfile ==========
|
||||
# ========== 4. RAID monitoring via storcli + node_exporter textfile ==========
|
||||
- name: Ensure node_exporter textfile dir exists
|
||||
file:
|
||||
path: /var/lib/node_exporter/textfile_collector
|
||||
@ -111,64 +137,120 @@
|
||||
group: node_exporter
|
||||
mode: '0755'
|
||||
ignore_errors: yes # если node_exporter ещё не установлен — не падать
|
||||
become: yes
|
||||
|
||||
- name: Deploy storcli → Prometheus metrics script
|
||||
copy:
|
||||
content: |
|
||||
#!/bin/bash
|
||||
OUT=/var/lib/node_exporter/textfile_collector/megaraid.prom
|
||||
OUT=/var/lib/node_exporter/textfile_collector/storcli.prom
|
||||
TMP=$(mktemp)
|
||||
|
||||
echo "# HELP megaraid_disk_temperature_celsius Disk temperature from MegaRAID (°C)" > "$TMP"
|
||||
echo "# TYPE megaraid_disk_temperature_celsius gauge" >> "$TMP"
|
||||
echo "# HELP megaraid_disk_state 1=Online, 0=Offline/Failed" >> "$TMP"
|
||||
echo "# TYPE megaraid_disk_state gauge" >> "$TMP"
|
||||
echo "# HELP megaraid_disk_media_errors_total Media errors" >> "$TMP"
|
||||
echo "# TYPE megaraid_disk_media_errors_total counter" >> "$TMP"
|
||||
echo "# HELP megaraid_disk_other_errors_total Other errors" >> "$TMP"
|
||||
echo "# TYPE megaraid_disk_other_errors_total counter" >> "$TMP"
|
||||
echo "# HELP megaraid_disk_predictive_failures_total Predictive failure count" >> "$TMP"
|
||||
echo "# TYPE megaraid_disk_predictive_failures_total counter" >> "$TMP"
|
||||
echo "# HELP storcli_disk_temp_celsius Disk temperature from storcli (°C)" > "$TMP"
|
||||
echo "# TYPE storcli_disk_temp_celsius gauge" >> "$TMP"
|
||||
echo "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild" >> "$TMP"
|
||||
echo "# TYPE storcli_disk_state gauge" >> "$TMP"
|
||||
echo "# HELP storcli_disk_media_errors_total Media errors" >> "$TMP"
|
||||
echo "# TYPE storcli_disk_media_errors_total counter" >> "$TMP"
|
||||
echo "# HELP storcli_disk_other_errors_total Other errors" >> "$TMP"
|
||||
echo "# TYPE storcli_disk_other_errors_total counter" >> "$TMP"
|
||||
echo "# HELP storcli_disk_predictive_failures_total Predictive failure count" >> "$TMP"
|
||||
echo "# TYPE storcli_disk_predictive_failures_total counter" >> "$TMP"
|
||||
|
||||
/opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \
|
||||
awk '
|
||||
BEGIN { slot=""; did=""; model=""; temp=-1; state=""; me=0; oe=0; pf=0 }
|
||||
BEGIN {
|
||||
slot=""; did=""; model=""; temp=-1; state=""; me=0; oe=0; pf=0;
|
||||
enclosure="252"; controller="0"
|
||||
}
|
||||
/^Drive \// {
|
||||
if (slot != "") {
|
||||
sv = (state=="Onln")?1:0;
|
||||
# Sanitize model name
|
||||
gsub(/[^a-zA-Z0-9._-]/,"_",model);
|
||||
printf "megaraid_disk_temperature_celsius{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,temp;
|
||||
printf "megaraid_disk_state{slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",slot,did,model,state,sv;
|
||||
printf "megaraid_disk_media_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,me;
|
||||
printf "megaraid_disk_other_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,oe;
|
||||
printf "megaraid_disk_predictive_failures_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,pf;
|
||||
|
||||
# Temperature
|
||||
if (temp != -1) {
|
||||
printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, temp;
|
||||
}
|
||||
|
||||
# State (convert to number)
|
||||
state_num = 0;
|
||||
if (state ~ /Onln/) state_num = 2;
|
||||
else if (state ~ /UGood/) state_num = 1;
|
||||
else if (state ~ /Failed/) state_num = 3;
|
||||
else if (state ~ /Rebuild/) state_num = 4;
|
||||
|
||||
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, state, state_num;
|
||||
|
||||
# Errors
|
||||
printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, me;
|
||||
printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, oe;
|
||||
printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, pf;
|
||||
|
||||
# Reset values
|
||||
temp=-1; me=0; oe=0; pf=0;
|
||||
}
|
||||
match($0, /\/c0\/e[0-9]+\/s([0-9]+)/, m); slot=m[1];
|
||||
# Extract slot from path
|
||||
if (match($0, /\/c[0-9]+\/e[0-9]+\/s([0-9]+)/, m)) {
|
||||
slot = m[1];
|
||||
}
|
||||
/DID/ && slot { did=$2 }
|
||||
/Model Number/ && slot { model=$2 }
|
||||
/Drive Temperature/ && slot { gsub(/[^0-9]/,"",$2); temp=($2==""?-1:$2) }
|
||||
/Firmware state/ && slot { gsub(/.*: /,"",$0); state=$0 }
|
||||
/Media Error Count/ && slot { me=$2 }
|
||||
/Other Error Count/ && slot { oe=$2 }
|
||||
/Predictive Failure Count/ && slot { pf=$2 }
|
||||
}
|
||||
/Device Id/ && slot { did = $3 }
|
||||
/Model Number/ && slot {
|
||||
model = $3;
|
||||
for(i=4; i<=NF; i++) model = model "_" $i;
|
||||
}
|
||||
/Drive Temperature/ && slot {
|
||||
temp_str = $3;
|
||||
gsub(/[^0-9]/, "", temp_str);
|
||||
temp = (temp_str == "" ? -1 : temp_str);
|
||||
}
|
||||
/Firmware state/ && slot {
|
||||
state = $3;
|
||||
for(i=4; i<=NF; i++) state = state " " $i;
|
||||
gsub(/^[ \t]+|[ \t]+$/, "", state);
|
||||
}
|
||||
/Media Error Count/ && slot { me = $4 }
|
||||
/Other Error Count/ && slot { oe = $4 }
|
||||
/Predictive Failure Count/ && slot { pf = $4 }
|
||||
END {
|
||||
if (slot!="") {
|
||||
sv=(state=="Onln")?1:0;
|
||||
if (slot != "") {
|
||||
gsub(/[^a-zA-Z0-9._-]/,"_",model);
|
||||
printf "megaraid_disk_temperature_celsius{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,temp;
|
||||
printf "megaraid_disk_state{slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",slot,did,model,state,sv;
|
||||
printf "megaraid_disk_media_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,me;
|
||||
printf "megaraid_disk_other_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,oe;
|
||||
printf "megaraid_disk_predictive_failures_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,pf;
|
||||
|
||||
if (temp != -1) {
|
||||
printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, temp;
|
||||
}
|
||||
|
||||
state_num = 0;
|
||||
if (state ~ /Onln/) state_num = 2;
|
||||
else if (state ~ /UGood/) state_num = 1;
|
||||
else if (state ~ /Failed/) state_num = 3;
|
||||
else if (state ~ /Rebuild/) state_num = 4;
|
||||
|
||||
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, state, state_num;
|
||||
|
||||
printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, me;
|
||||
printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, oe;
|
||||
printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
||||
controller, enclosure, slot, did, model, pf;
|
||||
}
|
||||
}' >> "$TMP"
|
||||
|
||||
mv "$TMP" "$OUT"
|
||||
chown node_exporter:node_exporter "$OUT" 2>/dev/null || true
|
||||
dest: /usr/local/bin/export-megaraid-metrics.sh
|
||||
chmod 644 "$OUT" 2>/dev/null || true
|
||||
dest: /usr/local/bin/export-storcli-metrics.sh
|
||||
mode: '0755'
|
||||
become: yes
|
||||
|
||||
- name: Allow node_exporter to run storcli
|
||||
copy:
|
||||
@ -176,35 +258,64 @@
|
||||
dest: /etc/sudoers.d/node_exporter_storcli
|
||||
validate: 'visudo -cf %s'
|
||||
mode: '0440'
|
||||
become: yes
|
||||
|
||||
- name: Create RAID export service
|
||||
- name: Create storcli export service
|
||||
copy:
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Export MegaRAID metrics
|
||||
Description=Export storcli metrics for Prometheus
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
User=node_exporter
|
||||
ExecStart=/usr/bin/sudo /usr/local/bin/export-megaraid-metrics.sh
|
||||
dest: /etc/systemd/system/export-megaraid-metrics.service
|
||||
ExecStart=/usr/bin/sudo /usr/local/bin/export-storcli-metrics.sh
|
||||
dest: /etc/systemd/system/export-storcli-metrics.service
|
||||
become: yes
|
||||
|
||||
- name: Create RAID export timer (every 5 min)
|
||||
- name: Create storcli export timer (every 2 minutes)
|
||||
copy:
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Run RAID metric export every 5 minutes
|
||||
Description=Run storcli metric export every 2 minutes
|
||||
|
||||
[Timer]
|
||||
OnBootSec=60
|
||||
OnUnitActiveSec=300
|
||||
OnUnitActiveSec=120
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
dest: /etc/systemd/system/export-megaraid-metrics.timer
|
||||
dest: /etc/systemd/system/export-storcli-metrics.timer
|
||||
become: yes
|
||||
|
||||
- name: Enable RAID metrics timer
|
||||
- name: Enable and start storcli metrics timer
|
||||
systemd:
|
||||
name: export-megaraid-metrics.timer
|
||||
name: export-storcli-metrics.timer
|
||||
enabled: yes
|
||||
state: started
|
||||
daemon_reload: yes
|
||||
become: yes
|
||||
|
||||
# ========== 5. Verification ==========
|
||||
# Ask systemd for the pve_exporter unit in state `started` and fail the play
# unless the registered unit status reports ActiveState "active".
- name: Verify pve_exporter is running
  become: true
  systemd:
    name: pve_exporter
    state: started
  register: pve_exporter_status
  failed_when: pve_exporter_status.status.ActiveState != "active"
|
||||
|
||||
# Run the metrics export script once by hand. Never reported as changed;
# a non-zero exit code fails the play.
- name: Verify storcli metrics collection
  become: true
  command: /usr/local/bin/export-storcli-metrics.sh
  register: storcli_export_result
  changed_when: false
  failed_when: storcli_export_result.rc != 0
|
||||
|
||||
# Fail the play if the textfile-collector output file does not exist.
- name: Check that metrics file was created
  become: true
  stat:
    path: /var/lib/node_exporter/textfile_collector/storcli.prom
  register: metrics_file
  failed_when: not metrics_file.stat.exists
|
||||
Loading…
Reference in New Issue
Block a user