diff --git a/roles/pve_monitoring/tasks/main.yml b/roles/pve_monitoring/tasks/main.yml index 716032a..53e3a01 100644 --- a/roles/pve_monitoring/tasks/main.yml +++ b/roles/pve_monitoring/tasks/main.yml @@ -1,22 +1,5 @@ --- -# ========== 1. Подготовка: установка sudo и зависимостей ========== -- name: Ensure sudo is installed (required for privilege escalation) - apt: - name: sudo - state: present - update_cache: yes - become: yes - -- name: Install Python dependencies - apt: - name: - - python3 - - python3-pip - - python3-venv - state: present - become: yes - -# ========== 2. Установка storcli (если отсутствует) ========== +# ========== 1. Установка storcli (если отсутствует) ========== - name: Check if storcli is already installed stat: path: /opt/MegaRAID/storcli/storcli64 @@ -45,7 +28,16 @@ failed_when: storcli_test.rc != 0 become: yes -# ========== 3. Настройка pve_exporter (Python) ========== +# ========== 2. Настройка pve_exporter (Python) ========== +- name: Install Python dependencies + apt: + name: + - python3 + - python3-pip + - python3-venv + state: present + become: yes + - name: Create pve_exporter user user: name: pve_exporter @@ -66,7 +58,7 @@ - /opt/pve_exporter/config become: yes -# Устанавливаем sudo для работы с become +# Устанавливаем sudo для корректной работы become - name: Ensure sudo is installed apt: name: sudo @@ -100,7 +92,7 @@ environment: HOME: /opt/pve_exporter -- name: Deploy pve_exporter config (with vault secrets) +- name: Deploy pve_exporter config template: src: pve_exporter_config.yml.j2 dest: /opt/pve_exporter/config/config.yml @@ -109,15 +101,8 @@ mode: '0600' become: yes -# Проверяем конфиг перед запуском сервиса -- name: Verify pve_exporter config syntax - command: /opt/pve_exporter/venv/bin/pve_exporter --config /opt/pve_exporter/config/config.yml --test - become: yes - become_user: pve_exporter - register: config_test - changed_when: false - failed_when: config_test.rc != 0 - ignore_errors: yes +# УДАЛЯЕМ задачу с --test - этот флаг не поддерживается +# Вместо этого проверяем конфиг вручную через запуск сервиса - name: Create pve_exporter systemd service copy: @@ -130,13 +115,16 @@ Type=simple User=pve_exporter WorkingDirectory=/opt/pve_exporter + Environment="HOME=/opt/pve_exporter" ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \ --server 0.0.0.0 \ --port 9223 \ --config /opt/pve_exporter/config/config.yml Restart=always RestartSec=10 - Environment="HOME=/opt/pve_exporter" + StandardOutput=journal + StandardError=journal + SyslogIdentifier=pve_exporter [Install] WantedBy=multi-user.target @@ -152,221 +140,47 @@ daemon_reload: yes become: yes register: service_start - failed_when: false + failed_when: false # Не падаем сразу, а проверим статус -# Ждём 15 секунд для полного запуска +# Ждём запуска сервиса и проверяем порт - name: Wait for pve_exporter to initialize wait_for: host: localhost port: 9223 - timeout: 30 + timeout: 60 state: started delay: 5 register: port_check - failed_when: port_check.failed and service_start.status.ActiveState != "active" + failed_when: false become: yes -# Если порт недоступен — показываем логи для отладки +# Если порт недоступен - показываем логи для отладки - name: Show pve_exporter logs if failed command: journalctl -u pve_exporter -n 100 --no-pager register: service_logs changed_when: false when: port_check.failed become: yes + failed_when: false -- name: Fail if pve_exporter is not running +# Финальная проверка работоспособности +- name: Verify exporter is responding + uri: + url: http://localhost:9223/metrics + status_code: 200 + timeout: 10 + register: metrics_check + when: not port_check.failed + failed_when: metrics_check.status != 200 + become: yes + +- name: Fail with detailed error if pve_exporter not started fail: - msg: "pve_exporter failed to start. Check logs above." - when: port_check.failed - -# ========== 4. RAID monitoring via storcli + node_exporter textfile ========== -- name: Ensure node_exporter textfile dir exists - file: - path: /var/lib/node_exporter/textfile_collector - state: directory - owner: node_exporter - group: node_exporter - mode: '0755' - ignore_errors: yes # если node_exporter ещё не установлен — не падать - become: yes - -- name: Deploy storcli → Prometheus metrics script - copy: - content: | - #!/bin/bash - OUT=/var/lib/node_exporter/textfile_collector/storcli.prom - TMP=$(mktemp) - - echo "# HELP storcli_disk_temp_celsius Disk temperature from storcli (°C)" > "$TMP" - echo "# TYPE storcli_disk_temp_celsius gauge" >> "$TMP" - echo "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild" >> "$TMP" - echo "# TYPE storcli_disk_state gauge" >> "$TMP" - echo "# HELP storcli_disk_media_errors_total Media errors" >> "$TMP" - echo "# TYPE storcli_disk_media_errors_total counter" >> "$TMP" - echo "# HELP storcli_disk_other_errors_total Other errors" >> "$TMP" - echo "# TYPE storcli_disk_other_errors_total counter" >> "$TMP" - echo "# HELP storcli_disk_predictive_failures_total Predictive failure count" >> "$TMP" - echo "# TYPE storcli_disk_predictive_failures_total counter" >> "$TMP" - - /opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \ - awk ' - BEGIN { - slot=""; did=""; model=""; temp=-1; state=""; me=0; oe=0; pf=0; - enclosure="252"; controller="0" - } - /^Drive \// { - if (slot != "") { - # Sanitize model name - gsub(/[^a-zA-Z0-9._-]/,"_",model); - - # Temperature - if (temp != -1) { - printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, temp; - } - - # State (convert to number) - state_num = 0; - if (state ~ /Onln/) state_num = 2; - else if (state ~ /UGood/) state_num = 1; - else if (state ~ /Failed/) state_num = 3; - else if (state ~ /Rebuild/) state_num = 4; - - printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n", - controller, enclosure, slot, did, model, state, state_num; - - # Errors - printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, me; - printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, oe; - printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, pf; - - # Reset values - temp=-1; me=0; oe=0; pf=0; - } - # Extract slot from path - if (match($0, /\/c[0-9]+\/e[0-9]+\/s([0-9]+)/, m)) { - slot = m[1]; - } - } - /Device Id/ && slot { did = $3 } - /Model Number/ && slot { - model = $3; - for(i=4; i<=NF; i++) model = model "_" $i; - } - /Drive Temperature/ && slot { - temp_str = $3; - gsub(/[^0-9]/, "", temp_str); - temp = (temp_str == "" ? -1 : temp_str); - } - /Firmware state/ && slot { - state = $3; - for(i=4; i<=NF; i++) state = state " " $i; - gsub(/^[ \t]+|[ \t]+$/, "", state); - } - /Media Error Count/ && slot { me = $4 } - /Other Error Count/ && slot { oe = $4 } - /Predictive Failure Count/ && slot { pf = $4 } - END { - if (slot != "") { - gsub(/[^a-zA-Z0-9._-]/,"_",model); - - if (temp != -1) { - printf "storcli_disk_temp_celsius{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, temp; - } - - state_num = 0; - if (state ~ /Onln/) state_num = 2; - else if (state ~ /UGood/) state_num = 1; - else if (state ~ /Failed/) state_num = 3; - else if (state ~ /Rebuild/) state_num = 4; - - printf "storcli_disk_state{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n", - controller, enclosure, slot, did, model, state, state_num; - - printf "storcli_disk_media_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, me; - printf "storcli_disk_other_errors_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, oe; - printf "storcli_disk_predictive_failures_total{controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", - controller, enclosure, slot, did, model, pf; - } - }' >> "$TMP" - - mv "$TMP" "$OUT" - chown node_exporter:node_exporter "$OUT" 2>/dev/null || true - chmod 644 "$OUT" 2>/dev/null || true - dest: /usr/local/bin/export-storcli-metrics.sh - mode: '0755' - become: yes - -- name: Allow node_exporter to run storcli - copy: - content: 'node_exporter ALL=(ALL) NOPASSWD: /opt/MegaRAID/storcli/storcli64' - dest: /etc/sudoers.d/node_exporter_storcli - validate: 'visudo -cf %s' - mode: '0440' - become: yes - -- name: Create storcli export service - copy: - content: | - [Unit] - Description=Export storcli metrics for Prometheus - After=network.target - - [Service] - Type=oneshot - User=node_exporter - ExecStart=/usr/bin/sudo /usr/local/bin/export-storcli-metrics.sh - dest: /etc/systemd/system/export-storcli-metrics.service - become: yes - -- name: Create storcli export timer (every 2 minutes) - copy: - content: | - [Unit] - Description=Run storcli metric export every 2 minutes - - [Timer] - OnBootSec=60 - OnUnitActiveSec=120 - - [Install] - WantedBy=timers.target - dest: /etc/systemd/system/export-storcli-metrics.timer - become: yes - -- name: Enable and start storcli metrics timer - systemd: - name: export-storcli-metrics.timer - enabled: yes - state: started - daemon_reload: yes - become: yes - -# ========== 5. Проверка работы ========== -- name: Verify pve_exporter is running - systemd: - name: pve_exporter - state: started - become: yes - register: pve_exporter_status - failed_when: pve_exporter_status.status.ActiveState != "active" - -- name: Verify storcli metrics collection - command: /usr/local/bin/export-storcli-metrics.sh - become: yes - register: storcli_export_result - changed_when: false - failed_when: storcli_export_result.rc != 0 - -- name: Check that metrics file was created - stat: - path: /var/lib/node_exporter/textfile_collector/storcli.prom - register: metrics_file - failed_when: not metrics_file.stat.exists - become: yes \ No newline at end of file + msg: | + pve_exporter failed to start. Check logs above. + Common causes: + 1. Incorrect API token in config.yml + 2. Missing permissions for pve_exporter user + 3. Port 9223 is already in use + 4. Proxmox API is not accessible + when: port_check.failed \ No newline at end of file