olimp/roles/pve_monitoring/tasks/main.yml
2025-11-18 05:37:29 +00:00

372 lines
12 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

---
# ========== 1. Preparation: install sudo and dependencies ==========
# Refreshes the apt cache and makes sure the sudo package is present.
- name: Ensure sudo is installed (required for privilege escalation)
  become: true
  apt:
    update_cache: true
    name: sudo
    state: present
# Interpreter, pip and the venv module used by the virtualenv tasks below.
- name: Install Python dependencies
  become: true
  apt:
    state: present
    name:
      - python3
      - python3-pip
      - python3-venv
# ========== 2. Install storcli (if missing) ==========
# The result gates the download and install tasks that follow.
- name: Check if storcli is already installed
  become: true
  stat:
    path: /opt/MegaRAID/storcli/storcli64
  register: storcli_installed
# Fetch the storcli package only when the binary was not found.
- name: Download storcli
  become: true
  when: not storcli_installed.stat.exists
  get_url:
    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
    dest: /tmp/storcli.deb
    mode: "0644"
# Install the downloaded .deb; skipped when storcli64 already exists.
- name: Install storcli
  become: true
  when: not storcli_installed.stat.exists
  apt:
    deb: /tmp/storcli.deb
    state: present
# Read-only smoke test against controller 0; a non-zero exit fails the play.
- name: Verify storcli works
  become: true
  command:
    argv:
      - /opt/MegaRAID/storcli/storcli64
      - /c0
      - show
  register: storcli_test
  changed_when: false
  failed_when: storcli_test.rc != 0
# ========== 3. pve_exporter setup (Python) ==========
# System account without a login shell or home directory for the exporter.
- name: Create pve_exporter user
  become: true
  user:
    name: pve_exporter
    system: true
    create_home: false
    shell: /usr/sbin/nologin
# Install root and config directory, both owned by the service account.
- name: Create pve_exporter directories
  become: true
  file:
    state: directory
    path: "{{ item }}"
    owner: pve_exporter
    group: pve_exporter
    mode: "0755"
  loop:
    - /opt/pve_exporter
    - /opt/pve_exporter/config
# Install sudo so that become works.
# Repeats the sudo task at the top of this file (without the cache refresh).
- name: Ensure sudo is installed
  become: true
  apt:
    state: present
    name: sudo
# Build the venv as the service account; skipped once venv/bin/python exists.
- name: Create Python virtual environment
  become: true
  become_user: pve_exporter
  environment:
    HOME: /opt/pve_exporter
  command:
    argv:
      - python3
      - "-m"
      - venv
      - /opt/pve_exporter/venv
    creates: /opt/pve_exporter/venv/bin/python
# Keep pip inside the venv current. The pip module (instead of a raw command)
# reports "changed" only when an upgrade actually happened, so repeated runs
# stay idempotent. virtualenv_command is only consulted if the venv is missing.
- name: Upgrade pip in virtual environment
  pip:
    name: pip
    state: latest
    virtualenv: /opt/pve_exporter/venv
    virtualenv_command: python3 -m venv
    chdir: /opt/pve_exporter
  become: true
  become_user: pve_exporter
  environment:
    HOME: /opt/pve_exporter
# Install the exporter into the venv. With the pip module the task is a no-op
# (and reports "ok") when the package is already present, unlike the raw
# `pip install` command which always reported "changed".
- name: Install prometheus-pve-exporter package
  pip:
    name: prometheus-pve-exporter
    state: present
    virtualenv: /opt/pve_exporter/venv
    virtualenv_command: python3 -m venv
    chdir: /opt/pve_exporter
  become: true
  become_user: pve_exporter
  environment:
    HOME: /opt/pve_exporter
# Render the exporter configuration, readable only by the service account.
# diff is disabled so that running the play with --diff never prints the
# vault-backed secrets contained in the rendered file.
- name: Deploy pve_exporter config (with vault secrets)
  template:
    src: pve_exporter_config.yml.j2
    dest: /opt/pve_exporter/config/config.yml
    owner: pve_exporter
    group: pve_exporter
    mode: '0600'
  diff: false
  become: true
# Check the config before the service is started.
# The previous form combined failed_when with ignore_errors, so a failing
# check never stopped the play. It also relied on a `--test` switch.
# NOTE(review): pve_exporter does not appear to offer such a switch - confirm
# against the installed version.
# The file is now parsed with the venv's Python and PyYAML (presumably pulled
# in as a dependency of prometheus-pve-exporter - confirm), and a parse error
# fails the task. Only the error position is printed, never file content,
# because the config holds secrets.
- name: Verify pve_exporter config syntax
  command:
    argv:
      - /opt/pve_exporter/venv/bin/python
      - "-c"
      - |
        import sys
        import yaml
        try:
            with open(sys.argv[1]) as fh:
                yaml.safe_load(fh)
        except yaml.YAMLError as exc:
            mark = getattr(exc, "problem_mark", None)
            where = " at line %d, column %d" % (mark.line + 1, mark.column + 1) if mark else ""
            sys.exit("invalid YAML in config%s" % where)
      - /opt/pve_exporter/config/config.yml
  become: true
  become_user: pve_exporter
  register: config_test
  changed_when: false
# Install the systemd unit for the exporter (listens on 0.0.0.0:9223).
# NOTE(review): the flags follow the prometheus-pve-exporter 3.x command line
# (--config.file / --web.listen-address). The former --server/--port/--config
# options are not part of that CLI, and an unpinned `pip install` pulls the
# latest release - confirm with `pve_exporter --help` on the target.
- name: Create pve_exporter systemd service
  copy:
    content: |
      [Unit]
      Description=Proxmox VE Exporter
      After=network.target

      [Service]
      Type=simple
      User=pve_exporter
      WorkingDirectory=/opt/pve_exporter
      Environment="HOME=/opt/pve_exporter"
      ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \
          --config.file /opt/pve_exporter/config/config.yml \
          --web.listen-address 0.0.0.0:9223
      Restart=always
      RestartSec=10

      [Install]
      WantedBy=multi-user.target
    dest: /etc/systemd/system/pve_exporter.service
    owner: root
    group: root
    mode: '0644'
  become: true
# Start the exporter and confirm that it accepts connections on port 9223.
# Any failure inside the block (unit fails to start, port never opens) jumps
# to the rescue section, which prints the unit's recent journal and then fails
# the host. Previously start errors were masked with failed_when: false, the
# captured journal was never displayed, and the log/fail tasks were
# unreachable: the wait task either aborted the play first or reset its own
# "failed" flag.
- name: Start pve_exporter and verify it is listening
  become: true
  block:
    - name: Reload systemd and start pve_exporter
      systemd:
        name: pve_exporter
        enabled: true
        state: started
        daemon_reload: true
      register: service_start

    # First probe after 5 seconds; give up 30 seconds after the task started.
    - name: Wait for pve_exporter to initialize
      wait_for:
        host: localhost
        port: 9223
        state: started
        delay: 5
        timeout: 30
      register: port_check

  rescue:
    # Best effort: a journalctl problem must not hide the original failure.
    - name: Collect pve_exporter logs if failed
      command: journalctl -u pve_exporter -n 100 --no-pager
      register: service_logs
      changed_when: false
      failed_when: false

    - name: Show pve_exporter logs if failed
      debug:
        var: service_logs.stdout_lines

    - name: Fail if pve_exporter is not running
      fail:
        msg: "pve_exporter failed to start. Check logs above."
# ========== 4. RAID monitoring via storcli + node_exporter textfile ==========
# Directory for the textfile collector input files.
- name: Ensure node_exporter textfile dir exists
  become: true
  ignore_errors: true  # do not abort if node_exporter is not installed yet
  file:
    state: directory
    path: /var/lib/node_exporter/textfile_collector
    owner: node_exporter
    group: node_exporter
    mode: "0755"
# Install the script that converts `storcli64 /c0/eall/sall show all` output
# into Prometheus textfile-collector metrics (storcli.prom).
#
# Changes compared with the previous script:
#   * POSIX awk only. The 3-argument match() used before is a gawk extension
#     and is rejected by mawk, the default awk on Debian-based systems.
#   * One record per drive. storcli prints several "Drive /cX/eY/sZ ..."
#     section headers per drive; a record is now flushed only when the drive
#     path changes, which avoids duplicate series.
#   * "Key = Value" lines are split on the separator instead of relying on
#     fixed field numbers, and DID/State are read from the summary table row.
#   * controller/enclosure labels are taken from the drive path instead of
#     being hard-coded.
#   * The temp file is created beside the target so the final mv is an atomic
#     rename and the collector never sees a partially written file.
# The script text must not contain Jinja2 delimiters ("{" followed by "{",
# "%" or "#") because Ansible templates the `content` argument.
- name: Deploy storcli → Prometheus metrics script
  copy:
    content: |
      #!/bin/bash
      # Export MegaRAID physical-drive health (via storcli) in the Prometheus
      # text format for the node_exporter textfile collector.
      set -u

      STORCLI=/opt/MegaRAID/storcli/storcli64
      OUT=/var/lib/node_exporter/textfile_collector/storcli.prom

      # Same directory as OUT: the closing mv is then a rename on one filesystem.
      TMP=$(mktemp "${OUT}.XXXXXX") || exit 1
      trap 'rm -f "$TMP"' EXIT

      {
        echo "# HELP storcli_disk_temp_celsius Disk temperature from storcli (°C)"
        echo "# TYPE storcli_disk_temp_celsius gauge"
        echo "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild"
        echo "# TYPE storcli_disk_state gauge"
        echo "# HELP storcli_disk_media_errors_total Media errors"
        echo "# TYPE storcli_disk_media_errors_total counter"
        echo "# HELP storcli_disk_other_errors_total Other errors"
        echo "# TYPE storcli_disk_other_errors_total counter"
        echo "# HELP storcli_disk_predictive_failures_total Predictive failure count"
        echo "# TYPE storcli_disk_predictive_failures_total counter"
      } > "$TMP"

      # Best effort: storcli errors are discarded, in which case only the
      # HELP/TYPE header is published.
      "$STORCLI" /c0/eall/sall show all 2>/dev/null | awk '
      function trim(s) {
        gsub(/^[ \t]+|[ \t]+$/, "", s)
        return s
      }

      # Numeric code for a state string, as documented in the HELP line.
      function state_code(s) {
        if (s ~ /Onln|Online/) return 2
        if (s ~ /UGood/) return 1
        if (s ~ /Failed/) return 3
        if (s ~ /Rbld|Rebuild/) return 4
        return 0
      }

      # Print all metrics of the drive collected so far (no-op before the first).
      function emit(    labels) {
        if (drive == "") return
        gsub(/[^a-zA-Z0-9._-]/, "_", model)
        gsub(/["\\]/, "", state)
        labels = "controller=\"" ctl "\",enclosure=\"" enc "\",slot=\"" slot "\",did=\"" did "\",model=\"" model "\""
        if (temp != "") print "storcli_disk_temp_celsius{" labels "} " temp
        print "storcli_disk_state{" labels ",state=\"" state "\"} " state_code(state)
        print "storcli_disk_media_errors_total{" labels "} " me
        print "storcli_disk_other_errors_total{" labels "} " oe
        print "storcli_disk_predictive_failures_total{" labels "} " pf
      }

      # Section header, e.g. "Drive /c0/e252/s3 State :". Several headers share
      # one drive path; a new record starts only when the path changes.
      /^Drive \/c[0-9]+\/e[0-9]+\/s[0-9]+/ {
        match($0, /\/c[0-9]+\/e[0-9]+\/s[0-9]+/)
        path = substr($0, RSTART, RLENGTH)
        if (path != drive) {
          emit()
          drive = path
          split(path, part, "/")
          ctl = substr(part[2], 2)
          enc = substr(part[3], 2)
          slot = substr(part[4], 2)
          did = ""; model = ""; state = ""; temp = ""
          me = 0; oe = 0; pf = 0
        }
        next
      }

      # Summary table row, e.g. "252:3   7 Onln  0 465.25 GB SATA HDD ...".
      drive != "" && /^[ \t]*[0-9]+:[0-9]+[ \t]+[0-9]+[ \t]+[A-Za-z]/ {
        did = $2
        state = $3
        next
      }

      # Attribute line, "Key = Value" (a ":" separator is accepted as well).
      drive != "" {
        pos = index($0, "=")
        if (pos == 0) pos = index($0, ":")
        if (pos == 0) next
        key = trim(substr($0, 1, pos - 1))
        val = trim(substr($0, pos + 1))
        if (key == "Media Error Count") me = val + 0
        else if (key == "Other Error Count") oe = val + 0
        else if (key == "Predictive Failure Count") pf = val + 0
        else if (key == "Model Number") { model = val; gsub(/[ \t]+/, "_", model) }
        else if (key == "Device Id") { if (did == "") did = val }
        else if (key == "Firmware state") { if (state == "") state = val }
        else if (key == "Drive Temperature") {
          # Value looks like "30C (86.00 F)"; keep the leading Celsius digits.
          if (match(val, /^[0-9]+/)) temp = substr(val, RSTART, RLENGTH)
        }
      }

      END { emit() }
      ' >> "$TMP"

      # Fix ownership and mode before publishing so the file is readable the
      # moment it appears under its final name.
      chown node_exporter:node_exporter "$TMP" 2>/dev/null || true
      chmod 644 "$TMP"
      mv -f "$TMP" "$OUT"
    dest: /usr/local/bin/export-storcli-metrics.sh
    owner: root
    group: root
    mode: '0755'
  become: true
# Passwordless sudo rule for the node_exporter account.
# The export service in this role invokes
# `sudo /usr/local/bin/export-storcli-metrics.sh` as node_exporter, so the
# script itself has to be whitelisted; the old rule covered only storcli64 and
# sudo refused the call. The run-as target is limited to root, and the block
# scalar gives the sudoers file its trailing newline.
- name: Allow node_exporter to run storcli
  copy:
    content: |
      node_exporter ALL=(root) NOPASSWD: /opt/MegaRAID/storcli/storcli64, /usr/local/bin/export-storcli-metrics.sh
    dest: /etc/sudoers.d/node_exporter_storcli
    owner: root
    group: root
    mode: '0440'
    validate: /usr/sbin/visudo -cf %s
  become: true
# One-shot unit that runs the metrics export script (triggered by the timer).
# It runs directly as root (the systemd default when no User= is set) instead
# of `User=node_exporter` + sudo: the sudoers rule deployed by this role only
# permits storcli64, so `sudo <script>` was refused. The script needs root for
# storcli anyway and hands the output file to node_exporter via chown.
- name: Create storcli export service
  copy:
    content: |
      [Unit]
      Description=Export storcli metrics for Prometheus
      After=network.target

      [Service]
      Type=oneshot
      ExecStart=/usr/local/bin/export-storcli-metrics.sh
    dest: /etc/systemd/system/export-storcli-metrics.service
    owner: root
    group: root
    mode: '0644'
  become: true
# Timer unit: first run 60 s after boot, then 120 s after each activation.
- name: Create storcli export timer (every 2 minutes)
  become: true
  copy:
    dest: /etc/systemd/system/export-storcli-metrics.timer
    content: |
      [Unit]
      Description=Run storcli metric export every 2 minutes
      [Timer]
      OnBootSec=60
      OnUnitActiveSec=120
      [Install]
      WantedBy=timers.target
# Reload unit files, then enable the timer at boot and start it now.
- name: Enable and start storcli metrics timer
  become: true
  systemd:
    daemon_reload: true
    name: export-storcli-metrics.timer
    state: started
    enabled: true
# ========== 5. Verification ==========
# Fails unless the unit's reported ActiveState is "active".
- name: Verify pve_exporter is running
  become: true
  systemd:
    state: started
    name: pve_exporter
  register: pve_exporter_status
  failed_when: pve_exporter_status.status.ActiveState != "active"
# Run the export script once by hand; a non-zero exit code fails the play.
- name: Verify storcli metrics collection
  become: true
  command:
    argv:
      - /usr/local/bin/export-storcli-metrics.sh
  register: storcli_export_result
  changed_when: false
  failed_when: storcli_export_result.rc != 0
# The export run above must have produced the .prom file.
- name: Check that metrics file was created
  become: true
  stat:
    path: /var/lib/node_exporter/textfile_collector/storcli.prom
  register: metrics_file
  failed_when: not metrics_file.stat.exists