olimp/roles/pve_monitoring/tasks/main.yml
Administrator 00570077d9 Update 5 files
- /roles/pve_monitoring/tasks/main.yml
- /roles/pve_monitoring/templates/pve_exporter_config.yml.j2
- /inventories/hosts
- /olimp-deploy.yml
- /group_vars/all.yml
2025-11-14 05:04:02 +00:00

210 lines
7.3 KiB
YAML

---
# ========== 1. Install storcli (if missing) ==========

# Probe for the storcli binary; the result gates the download/install tasks.
- name: Check if storcli is already installed
  ansible.builtin.stat:
    path: /opt/MegaRAID/storcli/storcli64
  register: storcli_installed
# Fetch the storcli .deb only when the binary was not found by the stat probe.
- name: Download storcli
  when: not storcli_installed.stat.exists
  ansible.builtin.get_url:
    dest: /tmp/storcli.deb
    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
    mode: "0644"
# Install the downloaded package; skipped when storcli is already present.
- name: Install storcli
  when: not storcli_installed.stat.exists
  ansible.builtin.apt:
    state: present
    deb: /tmp/storcli.deb
# Smoke test: query controller 0. Read-only, so it never reports "changed";
# a non-zero exit code fails the play.
- name: Verify storcli works
  ansible.builtin.command:
    cmd: /opt/MegaRAID/storcli/storcli64 /c0 show
  register: storcli_test
  failed_when: storcli_test.rc != 0
  changed_when: false
# ========== 2. Configure pve_exporter (Python) ==========

# Interpreter, pip and the venv module used to build the exporter virtualenv.
- name: Install Python dependencies
  ansible.builtin.apt:
    state: present
    name:
      - python3
      - python3-pip
      - python3-venv
# Dedicated system account for the exporter: no login shell, no home directory.
- name: Create pve_exporter user
  ansible.builtin.user:
    name: pve_exporter
    shell: /usr/sbin/nologin
    system: true
    create_home: false
# Installation root and config directory, both owned by the service account.
- name: Create pve_exporter directories
  ansible.builtin.file:
    state: directory
    path: "{{ item }}"
    mode: "0755"
    owner: pve_exporter
    group: pve_exporter
  loop:
    - /opt/pve_exporter
    - /opt/pve_exporter/config
# Build the virtualenv and install the exporter into it as the service user.
# - Uses `shell`, not `command`: `command` does not invoke a shell, so the
#   former `&&` chain was handed to python3 as literal arguments and pip was
#   never run. `set -e` gives the same stop-on-first-failure behaviour.
# - The PyPI distribution that provides the `pve_exporter` entry point is
#   `prometheus-pve-exporter` (NOTE(review): confirm against your package
#   index/mirror).
# - `--no-cache-dir`: the pve_exporter user is created without a home
#   directory, so pip has no usable cache location.
# - `creates` keeps the task idempotent: it is skipped once the binary exists.
- name: Deploy pve_exporter venv and install package
  ansible.builtin.shell:
    cmd: |
      set -e
      python3 -m venv /opt/pve_exporter/venv
      /opt/pve_exporter/venv/bin/pip install --no-cache-dir --upgrade pip
      /opt/pve_exporter/venv/bin/pip install --no-cache-dir prometheus-pve-exporter
    chdir: /opt/pve_exporter
    creates: /opt/pve_exporter/venv/bin/pve_exporter
  become: true
  become_user: pve_exporter
# Render the exporter configuration from the role template. Mode 0600 keeps
# the rendered file readable by the service account only.
- name: Deploy pve_exporter config (with vault secrets)
  ansible.builtin.template:
    dest: /opt/pve_exporter/config/config.yml
    src: pve_exporter_config.yml.j2
    mode: "0600"
    owner: pve_exporter
    group: pve_exporter
# Install the systemd unit that runs the exporter from its virtualenv as the
# pve_exporter user, listening on 0.0.0.0:9223.
# The exporter CLI takes `--config.file` and `--web.listen-address`; the
# previous `--server`/`--port` options are not part of its argument parser
# (and `--server` is an ambiguous prefix of `--server.keyfile` /
# `--server.certfile`), so the process exited immediately on start.
# NOTE(review): flag names follow the prometheus-pve-exporter 3.x usage text;
# confirm with `pve_exporter --help` if an older release is pinned.
- name: Create pve_exporter systemd service
  ansible.builtin.copy:
    dest: /etc/systemd/system/pve_exporter.service
    mode: '0644'
    content: |
      [Unit]
      Description=Proxmox VE Exporter
      After=network.target

      [Service]
      Type=simple
      User=pve_exporter
      WorkingDirectory=/opt/pve_exporter
      ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \
          --config.file /opt/pve_exporter/config/config.yml \
          --web.listen-address 0.0.0.0:9223
      Restart=always
      RestartSec=10

      [Install]
      WantedBy=multi-user.target
# Reload systemd so the unit file is picked up, then enable the service at
# boot and make sure it is running.
- name: Enable and start pve_exporter
  ansible.builtin.systemd:
    daemon_reload: true
    name: pve_exporter
    state: started
    enabled: true
# ========== 3. RAID monitoring via storcli + node_exporter textfile ==========

# Directory the metrics script writes megaraid.prom into.
- name: Ensure node_exporter textfile dir exists
  ansible.builtin.file:
    state: directory
    path: /var/lib/node_exporter/textfile_collector
    mode: "0755"
    owner: node_exporter
    group: node_exporter
  ignore_errors: true  # do not fail if node_exporter is not installed yet
# Install the script that converts `storcli64 /c0/eall/sall show all` output
# into Prometheus text-format metrics (megaraid.prom) for the node_exporter
# textfile collector. Metric names, HELP/TYPE text and label sets are
# unchanged; the parser was rewritten because the previous one
#   - used 3-argument match(), a gawk extension that mawk rejects,
#   - read $2 on "Key = Value" lines (the second word of the key, not the value),
#   - flushed a record on every "Drive /..." sub-header, duplicating series.
# NOTE: the script body must not contain Jinja2 delimiters (a brace followed
# by another brace, a percent sign or a hash) because `content` is templated.
- name: Deploy storcli → Prometheus metrics script
  ansible.builtin.copy:
    dest: /usr/local/bin/export-megaraid-metrics.sh
    mode: '0755'
    content: |
      #!/bin/bash
      # Export per-drive MegaRAID health (from storcli) in Prometheus text
      # format for the node_exporter textfile collector.
      OUT_DIR=/var/lib/node_exporter/textfile_collector
      OUT="$OUT_DIR/megaraid.prom"

      # Stage the file next to its destination so the final mv is a rename on
      # the same filesystem; the temp name does not end in .prom.
      TMP=$(mktemp "$OUT_DIR/.megaraid.prom.XXXXXX") || exit 1
      trap 'rm -f "$TMP"' EXIT

      /opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \
      awk '
        # Print one metric family: HELP, TYPE, then one sample per drive.
        function emit(name, help, type, val, with_state,    i, k, labels) {
          print "# HELP " name " " help
          print "# TYPE " name " " type
          for (i = 1; i <= count; i++) {
            k = order[i]
            labels = "slot=\"" slot[k] "\",did=\"" did[k] "\",model=\"" model[k] "\""
            if (with_state) labels = labels ",state=\"" state[k] "\""
            print name "{" labels "} " val[k]
          }
        }

        # Each per-drive section starts with "Drive /cX/eY/sZ ...". The first
        # header registers the drive with defaults; later headers for the
        # same path only re-select it.
        /^Drive \/c[0-9]+(\/e[0-9]+)?\/s[0-9]+/ {
          match($0, /\/c[0-9]+(\/e[0-9]+)?\/s[0-9]+/)
          cur = substr($0, RSTART, RLENGTH)
          if (!(cur in slot)) {
            order[++count] = cur
            s = cur
            sub(/^.*\/s/, "", s)
            slot[cur] = s
            did[cur] = ""; model[cur] = ""; state[cur] = ""
            temp[cur] = -1; me[cur] = 0; oe[cur] = 0; pf[cur] = 0
          }
          next
        }

        # Ignore everything before the first drive header.
        cur == "" { next }

        # Summary table row, e.g. "252:0  8 Onln ...": EID:Slt, DID, State.
        $1 ~ /^[0-9]*:[0-9]+$/ && NF >= 3 {
          did[cur] = $2
          state[cur] = $3
          next
        }

        # Detail lines have the form "Key = Value".
        {
          sep = index($0, " = ")
          if (sep == 0) next
          key = substr($0, 1, sep - 1)
          value = substr($0, sep + 3)
          gsub(/^[ \t]+|[ \t]+$/, "", key)
          gsub(/^[ \t]+|[ \t]+$/, "", value)

          if (key == "Media Error Count") {
            if (value ~ /^[0-9]+$/) me[cur] = value
          } else if (key == "Other Error Count") {
            if (value ~ /^[0-9]+$/) oe[cur] = value
          } else if (key == "Predictive Failure Count") {
            if (value ~ /^[0-9]+$/) pf[cur] = value
          } else if (key == "Drive Temperature") {
            # Value looks like "29C (84.20 F)"; keep the leading Celsius
            # digits, leave -1 when there are none.
            if (match(value, /^[0-9]+/)) temp[cur] = substr(value, 1, RLENGTH)
          } else if (key == "Model Number") {
            gsub(/[^a-zA-Z0-9._-]/, "_", value)
            model[cur] = value
          }
        }

        END {
          for (i = 1; i <= count; i++)
            online[order[i]] = (state[order[i]] == "Onln") ? 1 : 0
          emit("megaraid_disk_temperature_celsius", "Disk temperature from MegaRAID (°C)", "gauge", temp, 0)
          emit("megaraid_disk_state", "1=Online, 0=Offline/Failed", "gauge", online, 1)
          emit("megaraid_disk_media_errors_total", "Media errors", "counter", me, 0)
          emit("megaraid_disk_other_errors_total", "Other errors", "counter", oe, 0)
          emit("megaraid_disk_predictive_failures_total", "Predictive failure count", "counter", pf, 0)
        }
      ' > "$TMP"

      # mktemp creates the file 0600; make it readable even when the chown
      # below is not possible (e.g. the node_exporter user does not exist).
      chmod 0644 "$TMP"
      chown node_exporter:node_exporter "$TMP" 2>/dev/null || true
      mv "$TMP" "$OUT"
# Sudoers drop-in letting node_exporter run storcli64 without a password.
# - Runas is limited to root instead of (ALL).
# - `content: |` ends the file with a newline, which older sudo versions
#   require in sudoers fragments.
# - visudo is called by absolute path so validation does not depend on PATH;
#   the file is only installed if it parses cleanly.
- name: Allow node_exporter to run storcli
  ansible.builtin.copy:
    dest: /etc/sudoers.d/node_exporter_storcli
    owner: root
    group: root
    mode: '0440'
    validate: /usr/sbin/visudo -cf %s
    content: |
      node_exporter ALL=(root) NOPASSWD: /opt/MegaRAID/storcli/storcli64
# Oneshot unit that runs the MegaRAID metrics export script.
# It runs as root (no User=) and calls the script directly. The previous
# `User=node_exporter` + `sudo <script>` combination could not work: the
# node_exporter sudoers rule whitelists only storcli64, not this script, so
# sudo asked for a password and the unit failed.
# Mode is set explicitly to match pve_exporter.service.
- name: Create RAID export service
  ansible.builtin.copy:
    dest: /etc/systemd/system/export-megaraid-metrics.service
    mode: '0644'
    content: |
      [Unit]
      Description=Export MegaRAID metrics
      After=network.target

      [Service]
      Type=oneshot
      ExecStart=/usr/local/bin/export-megaraid-metrics.sh
# Timer unit: first run 60 s after boot, then 300 s after each activation.
# No Unit= is set, so systemd pairs it with the service of the same name.
- name: Create RAID export timer (every 5 min)
  ansible.builtin.copy:
    dest: /etc/systemd/system/export-megaraid-metrics.timer
    content: |
      [Unit]
      Description=Run RAID metric export every 5 minutes
      [Timer]
      OnBootSec=60
      OnUnitActiveSec=300
      [Install]
      WantedBy=timers.target
# Reload systemd to register the new units, then enable and start the timer.
- name: Enable RAID metrics timer
  ansible.builtin.systemd:
    daemon_reload: true
    name: export-megaraid-metrics.timer
    state: started
    enabled: true