Update file main.yml
This commit is contained in:
parent
be8c93f2b0
commit
5b419c061d
@ -1,22 +1,5 @@
|
|||||||
---
|
---
|
||||||
# ========== 1. Preparation: install sudo and dependencies ==========
|
# ========== 1. Install storcli (if missing) ==========
|
||||||
- name: Ensure sudo is installed (required for privilege escalation)
|
|
||||||
apt:
|
|
||||||
name: sudo
|
|
||||||
state: present
|
|
||||||
update_cache: yes
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
- name: Install Python dependencies
|
|
||||||
apt:
|
|
||||||
name:
|
|
||||||
- python3
|
|
||||||
- python3-pip
|
|
||||||
- python3-venv
|
|
||||||
state: present
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
# ========== 2. Install storcli (if missing) ==========
|
|
||||||
- name: Check if storcli is already installed
|
- name: Check if storcli is already installed
|
||||||
stat:
|
stat:
|
||||||
path: /opt/MegaRAID/storcli/storcli64
|
path: /opt/MegaRAID/storcli/storcli64
|
||||||
@ -45,7 +28,16 @@
|
|||||||
failed_when: storcli_test.rc != 0
|
failed_when: storcli_test.rc != 0
|
||||||
become: yes
|
become: yes
|
||||||
|
|
||||||
# ========== 3. Configure pve_exporter (Python) ==========
|
# ========== 2. Configure pve_exporter (Python) ==========
|
||||||
|
- name: Install Python dependencies
|
||||||
|
apt:
|
||||||
|
name:
|
||||||
|
- python3
|
||||||
|
- python3-pip
|
||||||
|
- python3-venv
|
||||||
|
state: present
|
||||||
|
become: yes
|
||||||
|
|
||||||
- name: Create pve_exporter user
|
- name: Create pve_exporter user
|
||||||
user:
|
user:
|
||||||
name: pve_exporter
|
name: pve_exporter
|
||||||
@ -66,7 +58,7 @@
|
|||||||
- /opt/pve_exporter/config
|
- /opt/pve_exporter/config
|
||||||
become: yes
|
become: yes
|
||||||
|
|
||||||
# Install sudo so that become works
|
# Install sudo so that become works correctly
|
||||||
- name: Ensure sudo is installed
|
- name: Ensure sudo is installed
|
||||||
apt:
|
apt:
|
||||||
name: sudo
|
name: sudo
|
||||||
@ -100,7 +92,7 @@
|
|||||||
environment:
|
environment:
|
||||||
HOME: /opt/pve_exporter
|
HOME: /opt/pve_exporter
|
||||||
|
|
||||||
- name: Deploy pve_exporter config (with vault secrets)
|
- name: Deploy pve_exporter config
|
||||||
template:
|
template:
|
||||||
src: pve_exporter_config.yml.j2
|
src: pve_exporter_config.yml.j2
|
||||||
dest: /opt/pve_exporter/config/config.yml
|
dest: /opt/pve_exporter/config/config.yml
|
||||||
@ -109,15 +101,8 @@
|
|||||||
mode: '0600'
|
mode: '0600'
|
||||||
become: yes
|
become: yes
|
||||||
|
|
||||||
# Validate the config before starting the service
|
# The task that used --test is REMOVED: this flag is not supported
|
||||||
- name: Verify pve_exporter config syntax
|
# Instead, the config is checked manually by starting the service
|
||||||
command: /opt/pve_exporter/venv/bin/pve_exporter --config /opt/pve_exporter/config/config.yml --test
|
|
||||||
become: yes
|
|
||||||
become_user: pve_exporter
|
|
||||||
register: config_test
|
|
||||||
changed_when: false
|
|
||||||
failed_when: config_test.rc != 0
|
|
||||||
ignore_errors: yes
|
|
||||||
|
|
||||||
- name: Create pve_exporter systemd service
|
- name: Create pve_exporter systemd service
|
||||||
copy:
|
copy:
|
||||||
@ -130,13 +115,16 @@
|
|||||||
Type=simple
|
Type=simple
|
||||||
User=pve_exporter
|
User=pve_exporter
|
||||||
WorkingDirectory=/opt/pve_exporter
|
WorkingDirectory=/opt/pve_exporter
|
||||||
|
Environment="HOME=/opt/pve_exporter"
|
||||||
ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \
|
ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \
|
||||||
--server 0.0.0.0 \
|
--server 0.0.0.0 \
|
||||||
--port 9223 \
|
--port 9223 \
|
||||||
--config /opt/pve_exporter/config/config.yml
|
--config /opt/pve_exporter/config/config.yml
|
||||||
Restart=always
|
Restart=always
|
||||||
RestartSec=10
|
RestartSec=10
|
||||||
Environment="HOME=/opt/pve_exporter"
|
StandardOutput=journal
|
||||||
|
StandardError=journal
|
||||||
|
SyslogIdentifier=pve_exporter
|
||||||
|
|
||||||
[Install]
|
[Install]
|
||||||
WantedBy=multi-user.target
|
WantedBy=multi-user.target
|
||||||
@ -152,221 +140,47 @@
|
|||||||
daemon_reload: yes
|
daemon_reload: yes
|
||||||
become: yes
|
become: yes
|
||||||
register: service_start
|
register: service_start
|
||||||
failed_when: false
|
failed_when: false # Do not fail immediately; the status is checked afterwards
|
||||||
|
|
||||||
# Wait 15 seconds for full startup
|
# Wait for the service to start and check the port
|
||||||
- name: Wait for pve_exporter to initialize
|
- name: Wait for pve_exporter to initialize
|
||||||
wait_for:
|
wait_for:
|
||||||
host: localhost
|
host: localhost
|
||||||
port: 9223
|
port: 9223
|
||||||
timeout: 30
|
timeout: 60
|
||||||
state: started
|
state: started
|
||||||
delay: 5
|
delay: 5
|
||||||
register: port_check
|
register: port_check
|
||||||
failed_when: port_check.failed and service_start.status.ActiveState != "active"
|
# ignore_errors (not failed_when: false) keeps port_check.failed true on timeout,
# so the follow-up tasks conditioned on port_check.failed can actually trigger
ignore_errors: true
|
||||||
become: yes
|
become: yes
|
||||||
|
|
||||||
# If the port is unreachable, show the logs for debugging
|
# If the port is unreachable, show the logs for debugging
|
||||||
- name: Show pve_exporter logs if failed
|
- name: Show pve_exporter logs if failed
|
||||||
command: journalctl -u pve_exporter -n 100 --no-pager
|
command: journalctl -u pve_exporter -n 100 --no-pager
|
||||||
register: service_logs
|
register: service_logs
|
||||||
changed_when: false
|
changed_when: false
|
||||||
when: port_check.failed
|
when: port_check.failed
|
||||||
become: yes
|
become: yes
|
||||||
|
failed_when: false
|
||||||
|
|
||||||
- name: Fail if pve_exporter is not running
|
# Final health check
|
||||||
|
- name: Verify exporter is responding
|
||||||
|
uri:
|
||||||
|
url: http://localhost:9223/metrics
|
||||||
|
status_code: 200
|
||||||
|
timeout: 10
|
||||||
|
register: metrics_check
|
||||||
|
when: not port_check.failed
|
||||||
|
failed_when: metrics_check.status != 200
|
||||||
|
become: yes
|
||||||
|
|
||||||
|
- name: Fail with detailed error if pve_exporter not started
|
||||||
fail:
|
fail:
|
||||||
msg: "pve_exporter failed to start. Check logs above."
|
msg: |
|
||||||
|
pve_exporter failed to start. Check logs above.
|
||||||
|
Common causes:
|
||||||
|
1. Incorrect API token in config.yml
|
||||||
|
2. Missing permissions for pve_exporter user
|
||||||
|
3. Port 9223 is already in use
|
||||||
|
4. Proxmox API is not accessible
|
||||||
when: port_check.failed
|
when: port_check.failed
|
||||||
|
|
||||||
# ========== 4. RAID monitoring via storcli + node_exporter textfile ==========
|
|
||||||
- name: Ensure node_exporter textfile dir exists
|
|
||||||
file:
|
|
||||||
path: /var/lib/node_exporter/textfile_collector
|
|
||||||
state: directory
|
|
||||||
owner: node_exporter
|
|
||||||
group: node_exporter
|
|
||||||
mode: '0755'
|
|
||||||
ignore_errors: yes # do not fail if node_exporter is not installed yet
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
- name: Deploy storcli → Prometheus metrics script
|
|
||||||
copy:
|
|
||||||
content: |
|
|
||||||
#!/bin/bash
|
|
||||||
OUT=/var/lib/node_exporter/textfile_collector/storcli.prom
|
|
||||||
TMP=$(mktemp)
|
|
||||||
|
|
||||||
echo "# HELP storcli_disk_temp_celsius Disk temperature from storcli (°C)" > "$TMP"
|
|
||||||
echo "# TYPE storcli_disk_temp_celsius gauge" >> "$TMP"
|
|
||||||
echo "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild" >> "$TMP"
|
|
||||||
echo "# TYPE storcli_disk_state gauge" >> "$TMP"
|
|
||||||
echo "# HELP storcli_disk_media_errors_total Media errors" >> "$TMP"
|
|
||||||
echo "# TYPE storcli_disk_media_errors_total counter" >> "$TMP"
|
|
||||||
echo "# HELP storcli_disk_other_errors_total Other errors" >> "$TMP"
|
|
||||||
echo "# TYPE storcli_disk_other_errors_total counter" >> "$TMP"
|
|
||||||
echo "# HELP storcli_disk_predictive_failures_total Predictive failure count" >> "$TMP"
|
|
||||||
echo "# TYPE storcli_disk_predictive_failures_total counter" >> "$TMP"
|
|
||||||
|
|
||||||
/opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \
|
|
||||||
awk '
|
|
||||||
BEGIN {
|
|
||||||
slot=""; did=""; model=""; temp=-1; state=""; me=0; oe=0; pf=0;
|
|
||||||
enclosure="252"; controller="0"
|
|
||||||
}
|
|
||||||
/^Drive \// {
|
|
||||||
if (slot != "") {
|
|
||||||
# Sanitize model name
|
|
||||||
gsub(/[^a-zA-Z0-9._-]/,"_",model);
|
|
||||||
|
|
||||||
# Temperature
|
|
||||||
if (temp != -1) {
|
|
||||||
printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, temp;
|
|
||||||
}
|
|
||||||
|
|
||||||
# State (convert to number)
|
|
||||||
state_num = 0;
|
|
||||||
if (state ~ /Onln/) state_num = 2;
|
|
||||||
else if (state ~ /UGood/) state_num = 1;
|
|
||||||
else if (state ~ /Failed/) state_num = 3;
|
|
||||||
else if (state ~ /Rebuild/) state_num = 4;
|
|
||||||
|
|
||||||
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, state, state_num;
|
|
||||||
|
|
||||||
# Errors
|
|
||||||
printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, me;
|
|
||||||
printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, oe;
|
|
||||||
printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, pf;
|
|
||||||
|
|
||||||
# Reset values
|
|
||||||
temp=-1; me=0; oe=0; pf=0;
|
|
||||||
}
|
|
||||||
# Extract slot from path
|
|
||||||
# 3-argument match() is a gawk extension; use RSTART/RLENGTH so mawk and busybox awk work too
if (match($0, /\/c[0-9]+\/e[0-9]+\/s[0-9]+/)) {
|
|
||||||
slot = substr($0, RSTART, RLENGTH); sub(/^.*\/s/, "", slot);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/Device Id/ && slot { did = $3 }
|
|
||||||
/Model Number/ && slot {
|
|
||||||
model = $3;
|
|
||||||
for(i=4; i<=NF; i++) model = model "_" $i;
|
|
||||||
}
|
|
||||||
/Drive Temperature/ && slot {
|
|
||||||
temp_str = $3;
|
|
||||||
gsub(/[^0-9]/, "", temp_str);
|
|
||||||
temp = (temp_str == "" ? -1 : temp_str);
|
|
||||||
}
|
|
||||||
/Firmware state/ && slot {
|
|
||||||
state = $3;
|
|
||||||
for(i=4; i<=NF; i++) state = state " " $i;
|
|
||||||
gsub(/^[ \t]+|[ \t]+$/, "", state);
|
|
||||||
}
|
|
||||||
/Media Error Count/ && slot { me = $4 }
|
|
||||||
/Other Error Count/ && slot { oe = $4 }
|
|
||||||
/Predictive Failure Count/ && slot { pf = $4 }
|
|
||||||
END {
|
|
||||||
if (slot != "") {
|
|
||||||
gsub(/[^a-zA-Z0-9._-]/,"_",model);
|
|
||||||
|
|
||||||
if (temp != -1) {
|
|
||||||
printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, temp;
|
|
||||||
}
|
|
||||||
|
|
||||||
state_num = 0;
|
|
||||||
if (state ~ /Onln/) state_num = 2;
|
|
||||||
else if (state ~ /UGood/) state_num = 1;
|
|
||||||
else if (state ~ /Failed/) state_num = 3;
|
|
||||||
else if (state ~ /Rebuild/) state_num = 4;
|
|
||||||
|
|
||||||
printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, state, state_num;
|
|
||||||
|
|
||||||
printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, me;
|
|
||||||
printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, oe;
|
|
||||||
printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",
|
|
||||||
controller, enclosure, slot, did, model, pf;
|
|
||||||
}
|
|
||||||
}' >> "$TMP"
|
|
||||||
|
|
||||||
mv "$TMP" "$OUT"
|
|
||||||
chown node_exporter:node_exporter "$OUT" 2>/dev/null || true
|
|
||||||
chmod 644 "$OUT" 2>/dev/null || true
|
|
||||||
dest: /usr/local/bin/export-storcli-metrics.sh
|
|
||||||
mode: '0755'
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
- name: Allow node_exporter to run storcli
|
|
||||||
copy:
|
|
||||||
# The export service invokes the metrics script through sudo, so the script
# itself must be permitted in addition to the storcli binary
content: 'node_exporter ALL=(ALL) NOPASSWD: /opt/MegaRAID/storcli/storcli64, /usr/local/bin/export-storcli-metrics.sh'
|
|
||||||
dest: /etc/sudoers.d/node_exporter_storcli
|
|
||||||
validate: 'visudo -cf %s'
|
|
||||||
mode: '0440'
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
- name: Create storcli export service
|
|
||||||
copy:
|
|
||||||
content: |
|
|
||||||
[Unit]
|
|
||||||
Description=Export storcli metrics for Prometheus
|
|
||||||
After=network.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=oneshot
|
|
||||||
User=node_exporter
|
|
||||||
ExecStart=/usr/bin/sudo /usr/local/bin/export-storcli-metrics.sh
|
|
||||||
dest: /etc/systemd/system/export-storcli-metrics.service
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
- name: Create storcli export timer (every 2 minutes)
|
|
||||||
copy:
|
|
||||||
content: |
|
|
||||||
[Unit]
|
|
||||||
Description=Run storcli metric export every 2 minutes
|
|
||||||
|
|
||||||
[Timer]
|
|
||||||
OnBootSec=60
|
|
||||||
OnUnitActiveSec=120
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=timers.target
|
|
||||||
dest: /etc/systemd/system/export-storcli-metrics.timer
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
- name: Enable and start storcli metrics timer
|
|
||||||
systemd:
|
|
||||||
name: export-storcli-metrics.timer
|
|
||||||
enabled: yes
|
|
||||||
state: started
|
|
||||||
daemon_reload: yes
|
|
||||||
become: yes
|
|
||||||
|
|
||||||
# ========== 5. Verify operation ==========
|
|
||||||
- name: Verify pve_exporter is running
|
|
||||||
systemd:
|
|
||||||
name: pve_exporter
|
|
||||||
state: started
|
|
||||||
become: yes
|
|
||||||
register: pve_exporter_status
|
|
||||||
failed_when: pve_exporter_status.status.ActiveState != "active"
|
|
||||||
|
|
||||||
- name: Verify storcli metrics collection
|
|
||||||
command: /usr/local/bin/export-storcli-metrics.sh
|
|
||||||
become: yes
|
|
||||||
register: storcli_export_result
|
|
||||||
changed_when: false
|
|
||||||
failed_when: storcli_export_result.rc != 0
|
|
||||||
|
|
||||||
- name: Check that metrics file was created
|
|
||||||
stat:
|
|
||||||
path: /var/lib/node_exporter/textfile_collector/storcli.prom
|
|
||||||
register: metrics_file
|
|
||||||
failed_when: not metrics_file.stat.exists
|
|
||||||
become: yes
|
|
||||||
Loading…
Reference in New Issue
Block a user