Update file main.yml

This commit is contained in:
Administrator 2025-11-18 05:17:38 +00:00
parent 495898cf00
commit 293b854b43

View File

@ -1,31 +1,12 @@
--- ---
# ========== 1. Установка storcli (если отсутствует) ========== # ========== 1. Подготовка: установка sudo и зависимостей ==========
- name: Check if storcli is already installed - name: Ensure sudo is installed (required for privilege escalation)
stat:
path: /opt/MegaRAID/storcli/storcli64
register: storcli_installed
- name: Download storcli
get_url:
url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
dest: /tmp/storcli.deb
mode: '0644'
when: not storcli_installed.stat.exists
- name: Install storcli
apt: apt:
deb: /tmp/storcli.deb name: sudo
state: present state: present
when: not storcli_installed.stat.exists update_cache: yes
become: yes
- name: Verify storcli works
command: /opt/MegaRAID/storcli/storcli64 /c0 show
register: storcli_test
changed_when: false
failed_when: storcli_test.rc != 0
# ========== 2. Настройка pve_exporter (Python) ==========
- name: Install Python dependencies - name: Install Python dependencies
apt: apt:
name: name:
@ -33,13 +14,45 @@
- python3-pip - python3-pip
- python3-venv - python3-venv
state: present state: present
become: yes
# ========== 2. Установка storcli (если отсутствует) ==========
# Probe for the storcli64 binary. The registered result is consulted by the
# download and install tasks (storcli_installed.stat.exists) so they are
# skipped on hosts where the tool is already present.
- name: Check if storcli is already installed
  become: true
  register: storcli_installed
  stat:
    path: '/opt/MegaRAID/storcli/storcli64'
# Fetch the storcli .deb into /tmp, only when the binary probe found nothing.
# The package is installed as root by the next task, so the download should be
# integrity-checked: define storcli_deb_checksum (e.g. "sha256:<hex digest>")
# to make get_url verify the file. When the variable is undefined the
# parameter is omitted and the task behaves exactly as it did before.
- name: Download storcli
  get_url:
    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
    dest: /tmp/storcli.deb
    mode: '0644'
    checksum: "{{ storcli_deb_checksum | default(omit) }}"
  when: not storcli_installed.stat.exists
  become: true
# Install the package file fetched to /tmp/storcli.deb. Guarded by the same
# condition as the download, so it is skipped when storcli64 already exists.
- name: Install storcli
  become: true
  when: not storcli_installed.stat.exists
  apt:
    state: present
    deb: '/tmp/storcli.deb'
# Smoke test: run storcli64 against /c0. The task never reports "changed"
# and fails when the command exits with a non-zero code.
- name: Verify storcli works
  become: true
  register: storcli_test
  command: /opt/MegaRAID/storcli/storcli64 /c0 show
  failed_when: storcli_test.rc != 0
  changed_when: false
# ========== 3. Настройка pve_exporter (Python) ==========
- name: Create pve_exporter user - name: Create pve_exporter user
user: user:
name: pve_exporter name: pve_exporter
system: yes system: yes
shell: /usr/sbin/nologin shell: /usr/sbin/nologin
create_home: no create_home: no
become: yes
- name: Create pve_exporter directories - name: Create pve_exporter directories
file: file:
@ -51,25 +64,37 @@
loop: loop:
- /opt/pve_exporter - /opt/pve_exporter
- /opt/pve_exporter/config - /opt/pve_exporter/config
become: yes
- name: Deploy pve_exporter venv and install package - name: Create Python virtual environment
command: | command:
python3 -m venv /opt/pve_exporter/venv && cmd: python3 -m venv /opt/pve_exporter/venv
/opt/pve_exporter/venv/bin/pip install --upgrade pip && creates: /opt/pve_exporter/venv/bin/python
/opt/pve_exporter/venv/bin/pip install prometheus-pve
args:
chdir: /opt/pve_exporter
creates: /opt/pve_exporter/venv/bin/pve_exporter
become: yes become: yes
become_user: pve_exporter become_user: pve_exporter
- name: Deploy pve_exporter config (with vault secrets) - name: Upgrade pip in virtual environment
command:
cmd: /opt/pve_exporter/venv/bin/pip install --upgrade pip
chdir: /opt/pve_exporter
become: yes
become_user: pve_exporter
# Install the exporter into the virtual environment as the pve_exporter user.
# The `creates` guard makes the task idempotent: once the entry point exists,
# pip is not run again and the task no longer reports "changed" on every play.
# The guarded path is the one the previous revision of this task used.
# NOTE(review): confirm that the PyPI distribution name "prometheus-pve" is the
# package that provides venv/bin/pve_exporter; if it is not, the guard never
# matches and the task keeps running on every play.
- name: Install prometheus-pve package
  command:
    cmd: /opt/pve_exporter/venv/bin/pip install prometheus-pve
    chdir: /opt/pve_exporter
    creates: /opt/pve_exporter/venv/bin/pve_exporter
  become: true
  become_user: pve_exporter
- name: Deploy pve_exporter config
template: template:
src: pve_exporter_config.yml.j2 src: pve_exporter_config.yml.j2
dest: /opt/pve_exporter/config/config.yml dest: /opt/pve_exporter/config/config.yml
owner: pve_exporter owner: pve_exporter
group: pve_exporter group: pve_exporter
mode: '0600' mode: '0600'
become: yes
- name: Create pve_exporter systemd service - name: Create pve_exporter systemd service
copy: copy:
@ -93,6 +118,7 @@
WantedBy=multi-user.target WantedBy=multi-user.target
dest: /etc/systemd/system/pve_exporter.service dest: /etc/systemd/system/pve_exporter.service
mode: '0644' mode: '0644'
become: yes
- name: Enable and start pve_exporter - name: Enable and start pve_exporter
systemd: systemd:
@ -100,9 +126,9 @@
enabled: yes enabled: yes
state: started state: started
daemon_reload: yes daemon_reload: yes
become: yes
# ========== 4. RAID monitoring via storcli + node_exporter textfile ==========
# ========== 3. RAID monitoring via storcli + node_exporter textfile ==========
- name: Ensure node_exporter textfile dir exists - name: Ensure node_exporter textfile dir exists
file: file:
path: /var/lib/node_exporter/textfile_collector path: /var/lib/node_exporter/textfile_collector
@ -111,64 +137,120 @@
group: node_exporter group: node_exporter
mode: '0755' mode: '0755'
ignore_errors: yes # если node_exporter ещё не установлен — не падать ignore_errors: yes # если node_exporter ещё не установлен — не падать
become: yes
- name: Deploy storcli → Prometheus metrics script - name: Deploy storcli → Prometheus metrics script
copy: copy:
content: | content: |
#!/bin/bash #!/bin/bash
OUT=/var/lib/node_exporter/textfile_collector/megaraid.prom OUT=/var/lib/node_exporter/textfile_collector/storcli.prom
TMP=$(mktemp) TMP=$(mktemp)
echo "# HELP megaraid_disk_temperature_celsius Disk temperature from MegaRAID (°C)" > "$TMP" echo "# HELP storcli_disk_temp_celsius Disk temperature from storcli (°C)" > "$TMP"
echo "# TYPE megaraid_disk_temperature_celsius gauge" >> "$TMP" echo "# TYPE storcli_disk_temp_celsius gauge" >> "$TMP"
echo "# HELP megaraid_disk_state 1=Online, 0=Offline/Failed" >> "$TMP" echo "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild" >> "$TMP"
echo "# TYPE megaraid_disk_state gauge" >> "$TMP" echo "# TYPE storcli_disk_state gauge" >> "$TMP"
echo "# HELP megaraid_disk_media_errors_total Media errors" >> "$TMP" echo "# HELP storcli_disk_media_errors_total Media errors" >> "$TMP"
echo "# TYPE megaraid_disk_media_errors_total counter" >> "$TMP" echo "# TYPE storcli_disk_media_errors_total counter" >> "$TMP"
echo "# HELP megaraid_disk_other_errors_total Other errors" >> "$TMP" echo "# HELP storcli_disk_other_errors_total Other errors" >> "$TMP"
echo "# TYPE megaraid_disk_other_errors_total counter" >> "$TMP" echo "# TYPE storcli_disk_other_errors_total counter" >> "$TMP"
echo "# HELP megaraid_disk_predictive_failures_total Predictive failure count" >> "$TMP" echo "# HELP storcli_disk_predictive_failures_total Predictive failure count" >> "$TMP"
echo "# TYPE megaraid_disk_predictive_failures_total counter" >> "$TMP" echo "# TYPE storcli_disk_predictive_failures_total counter" >> "$TMP"
/opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \ /opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \
awk ' awk '
BEGIN { slot=""; did=""; model=""; temp=-1; state=""; me=0; oe=0; pf=0 } BEGIN {
slot=""; did=""; model=""; temp=-1; state=""; me=0; oe=0; pf=0;
enclosure="252"; controller="0"
}
/^Drive \// { /^Drive \// {
if (slot != "") { if (slot != "") {
sv = (state=="Onln")?1:0; # Sanitize model name
gsub(/[^a-zA-Z0-9._-]/,"_",model); gsub(/[^a-zA-Z0-9._-]/,"_",model);
printf "megaraid_disk_temperature_celsius{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,temp;
printf "megaraid_disk_state{slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",slot,did,model,state,sv; # Temperature
printf "megaraid_disk_media_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,me; if (temp != -1) {
printf "megaraid_disk_other_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,oe; printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
printf "megaraid_disk_predictive_failures_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,pf; controller, enclosure, slot, did, model, temp;
}
# State (convert to number)
state_num = 0;
if (state ~ /Onln/) state_num = 2;
else if (state ~ /UGood/) state_num = 1;
else if (state ~ /Failed/) state_num = 3;
else if (state ~ /Rebuild/) state_num = 4;
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",
controller, enclosure, slot, did, model, state, state_num;
# Errors
printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
controller, enclosure, slot, did, model, me;
printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
controller, enclosure, slot, did, model, oe;
printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
controller, enclosure, slot, did, model, pf;
# Reset values
temp=-1; me=0; oe=0; pf=0; temp=-1; me=0; oe=0; pf=0;
} }
match($0, /\/c0\/e[0-9]+\/s([0-9]+)/, m); slot=m[1]; # Extract slot from path
if (match($0, /\/c[0-9]+\/e[0-9]+\/s([0-9]+)/, m)) {
slot = m[1];
} }
/DID/ && slot { did=$2 } }
/Model Number/ && slot { model=$2 } /Device Id/ && slot { did = $3 }
/Drive Temperature/ && slot { gsub(/[^0-9]/,"",$2); temp=($2==""?-1:$2) } /Model Number/ && slot {
/Firmware state/ && slot { gsub(/.*: /,"",$0); state=$0 } model = $3;
/Media Error Count/ && slot { me=$2 } for(i=4; i<=NF; i++) model = model "_" $i;
/Other Error Count/ && slot { oe=$2 } }
/Predictive Failure Count/ && slot { pf=$2 } /Drive Temperature/ && slot {
temp_str = $3;
gsub(/[^0-9]/, "", temp_str);
temp = (temp_str == "" ? -1 : temp_str);
}
/Firmware state/ && slot {
state = $3;
for(i=4; i<=NF; i++) state = state " " $i;
gsub(/^[ \t]+|[ \t]+$/, "", state);
}
/Media Error Count/ && slot { me = $4 }
/Other Error Count/ && slot { oe = $4 }
/Predictive Failure Count/ && slot { pf = $4 }
END { END {
if (slot!="") { if (slot != "") {
sv=(state=="Onln")?1:0;
gsub(/[^a-zA-Z0-9._-]/,"_",model); gsub(/[^a-zA-Z0-9._-]/,"_",model);
printf "megaraid_disk_temperature_celsius{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,temp;
printf "megaraid_disk_state{slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",slot,did,model,state,sv; if (temp != -1) {
printf "megaraid_disk_media_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,me; printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
printf "megaraid_disk_other_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,oe; controller, enclosure, slot, did, model, temp;
printf "megaraid_disk_predictive_failures_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,pf; }
state_num = 0;
if (state ~ /Onln/) state_num = 2;
else if (state ~ /UGood/) state_num = 1;
else if (state ~ /Failed/) state_num = 3;
else if (state ~ /Rebuild/) state_num = 4;
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",
controller, enclosure, slot, did, model, state, state_num;
printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
controller, enclosure, slot, did, model, me;
printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
controller, enclosure, slot, did, model, oe;
printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
controller, enclosure, slot, did, model, pf;
} }
}' >> "$TMP" }' >> "$TMP"
mv "$TMP" "$OUT" mv "$TMP" "$OUT"
chown node_exporter:node_exporter "$OUT" 2>/dev/null || true chown node_exporter:node_exporter "$OUT" 2>/dev/null || true
dest: /usr/local/bin/export-megaraid-metrics.sh chmod 644 "$OUT" 2>/dev/null || true
dest: /usr/local/bin/export-storcli-metrics.sh
mode: '0755' mode: '0755'
become: yes
- name: Allow node_exporter to run storcli - name: Allow node_exporter to run storcli
copy: copy:
@ -176,35 +258,64 @@
dest: /etc/sudoers.d/node_exporter_storcli dest: /etc/sudoers.d/node_exporter_storcli
validate: 'visudo -cf %s' validate: 'visudo -cf %s'
mode: '0440' mode: '0440'
become: yes
- name: Create RAID export service - name: Create storcli export service
copy: copy:
content: | content: |
[Unit] [Unit]
Description=Export MegaRAID metrics Description=Export storcli metrics for Prometheus
After=network.target After=network.target
[Service] [Service]
Type=oneshot Type=oneshot
User=node_exporter User=node_exporter
ExecStart=/usr/bin/sudo /usr/local/bin/export-megaraid-metrics.sh ExecStart=/usr/bin/sudo /usr/local/bin/export-storcli-metrics.sh
dest: /etc/systemd/system/export-megaraid-metrics.service dest: /etc/systemd/system/export-storcli-metrics.service
become: yes
- name: Create RAID export timer (every 5 min) - name: Create storcli export timer (every 2 minutes)
copy: copy:
content: | content: |
[Unit] [Unit]
Description=Run RAID metric export every 5 minutes Description=Run storcli metric export every 2 minutes
[Timer] [Timer]
OnBootSec=60 OnBootSec=60
OnUnitActiveSec=300 OnUnitActiveSec=120
[Install] [Install]
WantedBy=timers.target WantedBy=timers.target
dest: /etc/systemd/system/export-megaraid-metrics.timer dest: /etc/systemd/system/export-storcli-metrics.timer
become: yes
- name: Enable RAID metrics timer - name: Enable and start storcli metrics timer
systemd: systemd:
name: export-megaraid-metrics.timer name: export-storcli-metrics.timer
enabled: yes enabled: yes
state: started state: started
daemon_reload: yes daemon_reload: yes
become: yes
# ========== 5. Проверка работы ==========
# Read-only health check for the exporter service.
# The earlier form used the systemd module with `state: started`, which
# (re)starts the unit itself and whose returned `status` reflects the
# `systemctl show` output gathered before that action, so it could not act as
# an independent verification. `systemctl is-active` exits 0 only when the
# unit is active; a few retries give a freshly started service time to settle.
- name: Verify pve_exporter is running
  command: systemctl is-active pve_exporter
  register: pve_exporter_status
  changed_when: false
  retries: 5
  delay: 2
  until: pve_exporter_status.rc == 0
  become: true
# Run the metrics export script once by hand. The run is never reported as
# "changed"; a non-zero exit code from the script fails the task.
- name: Verify storcli metrics collection
  become: true
  register: storcli_export_result
  command: /usr/local/bin/export-storcli-metrics.sh
  failed_when: storcli_export_result.rc != 0
  changed_when: false
# Confirm the textfile-collector output exists; the task fails when the
# storcli.prom file is missing.
- name: Check that metrics file was created
  become: true
  register: metrics_file
  stat:
    path: '/var/lib/node_exporter/textfile_collector/storcli.prom'
  failed_when: not metrics_file.stat.exists