Patch for roles/pve_monitoring/tasks/main.yml, restored to one diff line
per row. Indentation and whitespace-only lines were rebuilt from a collapsed
copy (removed blank lines inside block scalars are assumed to hold only
indentation), so check that it applies cleanly. The "index" header was
dropped because the post-image no longer matches its recorded blob id.

diff --git a/roles/pve_monitoring/tasks/main.yml b/roles/pve_monitoring/tasks/main.yml
--- a/roles/pve_monitoring/tasks/main.yml
+++ b/roles/pve_monitoring/tasks/main.yml
@@ -1,31 +1,14 @@
 ---
-# ========== 1. Установка storcli (если отсутствует) ==========
-- name: Check if storcli is already installed
-  stat:
-    path: /opt/MegaRAID/storcli/storcli64
-  register: storcli_installed
-
-- name: Download storcli
-  get_url:
-    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
-    dest: /tmp/storcli.deb
-    mode: '0644'
-  when: not storcli_installed.stat.exists
-
-- name: Install storcli
+# ========== 1. Preparation: install sudo and dependencies ==========
+# sudo cannot install itself: when facts show the login user is already
+# root, skip escalation so a host without sudo can still be bootstrapped.
+- name: Ensure sudo is installed (required for privilege escalation)
   apt:
-    deb: /tmp/storcli.deb
+    name: sudo
     state: present
-  when: not storcli_installed.stat.exists
+    update_cache: yes
+  become: "{{ ansible_facts.user_id | default('') != 'root' }}"
 
-- name: Verify storcli works
-  command: /opt/MegaRAID/storcli/storcli64 /c0 show
-  register: storcli_test
-  changed_when: false
-  failed_when: storcli_test.rc != 0
-
-
-# ========== 2. Настройка pve_exporter (Python) ==========
 - name: Install Python dependencies
   apt:
     name:
@@ -33,13 +16,45 @@
       - python3-pip
       - python3-venv
     state: present
+  become: yes
 
+# ========== 2. Install storcli (if missing) ==========
+- name: Check if storcli is already installed
+  stat:
+    path: /opt/MegaRAID/storcli/storcli64
+  register: storcli_installed
+  become: yes
+
+- name: Download storcli
+  get_url:
+    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
+    dest: /tmp/storcli.deb
+    mode: '0644'
+  when: not storcli_installed.stat.exists
+  become: yes
+
+- name: Install storcli
+  apt:
+    deb: /tmp/storcli.deb
+    state: present
+  when: not storcli_installed.stat.exists
+  become: yes
+
+- name: Verify storcli works
+  command: /opt/MegaRAID/storcli/storcli64 /c0 show
+  register: storcli_test
+  changed_when: false
+  failed_when: storcli_test.rc != 0
+  become: yes
+
+# ========== 3. Configure pve_exporter (Python) ==========
 - name: Create pve_exporter user
   user:
     name: pve_exporter
     system: yes
     shell: /usr/sbin/nologin
     create_home: no
+  become: yes
 
 - name: Create pve_exporter directories
   file:
@@ -51,25 +66,45 @@
   loop:
     - /opt/pve_exporter
     - /opt/pve_exporter/config
+  become: yes
 
-- name: Deploy pve_exporter venv and install package
-  command: |
-    python3 -m venv /opt/pve_exporter/venv &&
-    /opt/pve_exporter/venv/bin/pip install --upgrade pip &&
-    /opt/pve_exporter/venv/bin/pip install prometheus-pve
-  args:
-    chdir: /opt/pve_exporter
-    creates: /opt/pve_exporter/venv/bin/pve_exporter
+- name: Create Python virtual environment
+  command:
+    cmd: python3 -m venv /opt/pve_exporter/venv
+    creates: /opt/pve_exporter/venv/bin/python
   become: yes
   become_user: pve_exporter
 
-- name: Deploy pve_exporter config (with vault secrets)
+# pip prints "Successfully installed ..." only when it changed something,
+# so derive "changed" from that instead of reporting it on every run.
+- name: Upgrade pip in virtual environment
+  command:
+    cmd: /opt/pve_exporter/venv/bin/pip install --upgrade pip
+    chdir: /opt/pve_exporter
+  register: pip_upgrade
+  changed_when: "'Successfully installed' in pip_upgrade.stdout"
+  become: yes
+  become_user: pve_exporter
+
+# NOTE(review): the task this replaces expected venv/bin/pve_exporter to
+# appear; confirm "prometheus-pve" is the PyPI name that ships it.
+- name: Install prometheus-pve package
+  command:
+    cmd: /opt/pve_exporter/venv/bin/pip install prometheus-pve
+    chdir: /opt/pve_exporter
+  register: pip_install
+  changed_when: "'Successfully installed' in pip_install.stdout"
+  become: yes
+  become_user: pve_exporter
+
+- name: Deploy pve_exporter config
   template:
     src: pve_exporter_config.yml.j2
     dest: /opt/pve_exporter/config/config.yml
     owner: pve_exporter
     group: pve_exporter
     mode: '0600'
+  become: yes
 
 - name: Create pve_exporter systemd service
   copy:
@@ -77,7 +112,7 @@
       [Unit]
       Description=Proxmox VE Exporter
       After=network.target
-      
+
       [Service]
       Type=simple
       User=pve_exporter
@@ -88,11 +123,12 @@
         --config /opt/pve_exporter/config/config.yml
       Restart=always
       RestartSec=10
-      
+
       [Install]
       WantedBy=multi-user.target
     dest: /etc/systemd/system/pve_exporter.service
     mode: '0644'
+  become: yes
 
 - name: Enable and start pve_exporter
   systemd:
@@ -100,9 +136,9 @@
     enabled: yes
     state: started
     daemon_reload: yes
+  become: yes
 
-
-# ========== 3. RAID monitoring via storcli + node_exporter textfile ==========
+# ========== 4. RAID monitoring via storcli + node_exporter textfile ==========
 - name: Ensure node_exporter textfile dir exists
   file:
     path: /var/lib/node_exporter/textfile_collector
@@ -111,64 +147,123 @@
     group: node_exporter
     mode: '0755'
   ignore_errors: yes  # если node_exporter ещё не установлен — не падать
+  become: yes
 
 - name: Deploy storcli → Prometheus metrics script
   copy:
     content: |
       #!/bin/bash
-      OUT=/var/lib/node_exporter/textfile_collector/megaraid.prom
-      TMP=$(mktemp)
-      
-      echo "# HELP megaraid_disk_temperature_celsius Disk temperature from MegaRAID (°C)" > "$TMP"
-      echo "# TYPE megaraid_disk_temperature_celsius gauge" >> "$TMP"
-      echo "# HELP megaraid_disk_state 1=Online, 0=Offline/Failed" >> "$TMP"
-      echo "# TYPE megaraid_disk_state gauge" >> "$TMP"
-      echo "# HELP megaraid_disk_media_errors_total Media errors" >> "$TMP"
-      echo "# TYPE megaraid_disk_media_errors_total counter" >> "$TMP"
-      echo "# HELP megaraid_disk_other_errors_total Other errors" >> "$TMP"
-      echo "# TYPE megaraid_disk_other_errors_total counter" >> "$TMP"
-      echo "# HELP megaraid_disk_predictive_failures_total Predictive failure count" >> "$TMP"
-      echo "# TYPE megaraid_disk_predictive_failures_total counter" >> "$TMP"
-      
+      # Export per-drive health from storcli as Prometheus metrics for the
+      # node_exporter textfile collector.
+      OUT=/var/lib/node_exporter/textfile_collector/storcli.prom
+      # Create the temp file next to OUT so the final mv is a same-filesystem
+      # rename; its name does not end in .prom.
+      TMP=$(mktemp "${OUT}.XXXXXX") || exit 1
+      trap 'rm -f "$TMP"' EXIT
+
+      echo "# HELP storcli_disk_temp_celsius Disk temperature from storcli (°C)" > "$TMP"
+      echo "# TYPE storcli_disk_temp_celsius gauge" >> "$TMP"
+      echo "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild" >> "$TMP"
+      echo "# TYPE storcli_disk_state gauge" >> "$TMP"
+      echo "# HELP storcli_disk_media_errors_total Media errors" >> "$TMP"
+      echo "# TYPE storcli_disk_media_errors_total counter" >> "$TMP"
+      echo "# HELP storcli_disk_other_errors_total Other errors" >> "$TMP"
+      echo "# TYPE storcli_disk_other_errors_total counter" >> "$TMP"
+      echo "# HELP storcli_disk_predictive_failures_total Predictive failure count" >> "$TMP"
+      echo "# TYPE storcli_disk_predictive_failures_total counter" >> "$TMP"
+
       /opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \
       awk '
-      BEGIN { slot=""; did=""; model=""; temp=-1; state=""; me=0; oe=0; pf=0 }
-      /^Drive \// {
-        if (slot != "") {
-          sv = (state=="Onln")?1:0;
-          gsub(/[^a-zA-Z0-9._-]/,"_",model);
-          printf "megaraid_disk_temperature_celsius{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,temp;
-          printf "megaraid_disk_state{slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",slot,did,model,state,sv;
-          printf "megaraid_disk_media_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,me;
-          printf "megaraid_disk_other_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,oe;
-          printf "megaraid_disk_predictive_failures_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,pf;
-          temp=-1; me=0; oe=0; pf=0;
-        }
-        match($0, /\/c0\/e[0-9]+\/s([0-9]+)/, m); slot=m[1];
-      }
-      /DID/ && slot { did=$2 }
-      /Model Number/ && slot { model=$2 }
-      /Drive Temperature/ && slot { gsub(/[^0-9]/,"",$2); temp=($2==""?-1:$2) }
-      /Firmware state/ && slot { gsub(/.*: /,"",$0); state=$0 }
-      /Media Error Count/ && slot { me=$2 }
-      /Other Error Count/ && slot { oe=$2 }
-      /Predictive Failure Count/ && slot { pf=$2 }
-      END {
-        if (slot!="") {
-          sv=(state=="Onln")?1:0;
-          gsub(/[^a-zA-Z0-9._-]/,"_",model);
-          printf "megaraid_disk_temperature_celsius{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,temp;
-          printf "megaraid_disk_state{slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n",slot,did,model,state,sv;
-          printf "megaraid_disk_media_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,me;
-          printf "megaraid_disk_other_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,oe;
-          printf "megaraid_disk_predictive_failures_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n",slot,did,model,pf;
-        }
+      # Text after the first "=" or ":" on the line, trimmed.
+      function value_of(line) {
+        sub(/^[^=:]*[=:][ \t]*/, "", line);
+        sub(/[ \t]+$/, "", line);
+        return line;
+      }
+
+      # Print every series for the current drive, then clear all per-drive
+      # fields so nothing carries over to the next drive.
+      function flush_drive(    labels, state_num) {
+        if (slot == "") return;
+
+        gsub(/[^a-zA-Z0-9._-]/, "_", model);
+        labels = sprintf("controller=\"%s\",enclosure=\"%s\",slot=\"%s\",did=\"%s\",model=\"%s\"",
+                         controller, enclosure, slot, did, model);
+
+        if (temp != -1) {
+          print "storcli_disk_temp_celsius{" labels "} " temp;
+        }
+
+        # Map the state text to a number; anything unrecognised stays 0.
+        state_num = 0;
+        if (state ~ /Onln/) state_num = 2;
+        else if (state ~ /UGood/) state_num = 1;
+        else if (state ~ /Failed/) state_num = 3;
+        else if (state ~ /Rebuild|Rbld/) state_num = 4;
+
+        print "storcli_disk_state{" labels ",state=\"" state "\"} " state_num;
+        print "storcli_disk_media_errors_total{" labels "} " me;
+        print "storcli_disk_other_errors_total{" labels "} " oe;
+        print "storcli_disk_predictive_failures_total{" labels "} " pf;
+
+        slot = ""; did = ""; model = ""; state = "";
+        temp = -1; me = 0; oe = 0; pf = 0;
+      }
+
+      BEGIN {
+        path = ""; slot = ""; controller = ""; enclosure = "";
+        did = ""; model = ""; state = "";
+        temp = -1; me = 0; oe = 0; pf = 0;
+      }
+
+      # A drive section starts at "Drive /cX/eY/sZ ...". Begin a new drive only
+      # when that path changes, so repeated headings for the same drive cannot
+      # emit duplicate series. Controller, enclosure and slot come from the path.
+      /^Drive \// {
+        p = $2;
+        sub(/[^\/a-z0-9].*$/, "", p);
+        if (p != path) {
+          flush_drive();
+          path = p;
+          controller = ""; enclosure = "";
+          n = split(path, part, "/");
+          for (i = 1; i <= n; i++) {
+            if (part[i] ~ /^c[0-9]+$/) controller = substr(part[i], 2);
+            else if (part[i] ~ /^e[0-9]+$/) enclosure = substr(part[i], 2);
+            else if (part[i] ~ /^s[0-9]+$/) slot = substr(part[i], 2);
+          }
+        }
+      }
+
+      # Field rules test slot != "" explicitly: a bare "slot" can be false for
+      # slot 0 when awk treats the captured text as a number.
+      # NOTE(review): the first rule assumes a storcli summary row shaped like
+      # "EID:Slt DID State ..." - confirm against real controller output.
+      slot != "" && /^[ \t]*[0-9]*:[0-9]+[ \t]/ { did = $2; state = $3 }
+      slot != "" && /Device Id/ { did = value_of($0) }
+      slot != "" && /Model Number/ { model = value_of($0) }
+      slot != "" && /Drive Temperature/ {
+        t = value_of($0);
+        sub(/[^0-9].*$/, "", t);
+        temp = (t == "" ? -1 : t + 0);
+      }
+      slot != "" && /Firmware state/ { state = value_of($0) }
+      slot != "" && /Media Error Count/ { me = value_of($0) + 0 }
+      slot != "" && /Other Error Count/ { oe = value_of($0) + 0 }
+      slot != "" && /Predictive Failure Count/ { pf = value_of($0) + 0 }
+
+      END {
+        flush_drive();
       }' >> "$TMP"
-      
-      mv "$TMP" "$OUT"
-      chown node_exporter:node_exporter "$OUT" 2>/dev/null || true
-    dest: /usr/local/bin/export-megaraid-metrics.sh
+
+      # Fix ownership and mode before publishing; mv is last so its status is
+      # the exit status of the script.
+      chown node_exporter:node_exporter "$TMP" 2>/dev/null || true
+      chmod 644 "$TMP"
+      mv "$TMP" "$OUT"
+    dest: /usr/local/bin/export-storcli-metrics.sh
     mode: '0755'
+  become: yes
 
 - name: Allow node_exporter to run storcli
   copy:
@@ -176,35 +271,95 @@
     dest: /etc/sudoers.d/node_exporter_storcli
     validate: 'visudo -cf %s'
     mode: '0440'
+  become: yes
 
-- name: Create RAID export service
+# The units and script were renamed from export-megaraid-metrics.* to
+# export-storcli-metrics.*; retire the old ones so hosts provisioned by an
+# earlier run do not keep a second timer and a stale megaraid.prom.
+- name: Check for legacy megaraid metrics timer
+  stat:
+    path: /etc/systemd/system/export-megaraid-metrics.timer
+  register: legacy_megaraid_timer
+  become: yes
+
+- name: Stop and disable legacy megaraid metrics timer
+  systemd:
+    name: export-megaraid-metrics.timer
+    enabled: no
+    state: stopped
+  when: legacy_megaraid_timer.stat.exists
+  become: yes
+
+- name: Remove legacy megaraid metrics files
+  file:
+    path: "{{ item }}"
+    state: absent
+  loop:
+    - /etc/systemd/system/export-megaraid-metrics.timer
+    - /etc/systemd/system/export-megaraid-metrics.service
+    - /usr/local/bin/export-megaraid-metrics.sh
+    - /var/lib/node_exporter/textfile_collector/megaraid.prom
+  become: yes
+
+# NOTE(review): the sudoers rule above is not touched by this patch; confirm
+# it permits /usr/local/bin/export-storcli-metrics.sh (the renamed script).
+- name: Create storcli export service
   copy:
     content: |
       [Unit]
-      Description=Export MegaRAID metrics
+      Description=Export storcli metrics for Prometheus
       After=network.target
-      
+
       [Service]
       Type=oneshot
       User=node_exporter
-      ExecStart=/usr/bin/sudo /usr/local/bin/export-megaraid-metrics.sh
-    dest: /etc/systemd/system/export-megaraid-metrics.service
+      ExecStart=/usr/bin/sudo /usr/local/bin/export-storcli-metrics.sh
+    dest: /etc/systemd/system/export-storcli-metrics.service
+    mode: '0644'
+  become: yes
 
-- name: Create RAID export timer (every 5 min)
+- name: Create storcli export timer (every 2 minutes)
   copy:
     content: |
       [Unit]
-      Description=Run RAID metric export every 5 minutes
+      Description=Run storcli metric export every 2 minutes
+
       [Timer]
       OnBootSec=60
-      OnUnitActiveSec=300
+      OnUnitActiveSec=120
+
       [Install]
       WantedBy=timers.target
-    dest: /etc/systemd/system/export-megaraid-metrics.timer
+    dest: /etc/systemd/system/export-storcli-metrics.timer
+    mode: '0644'
+  become: yes
 
-- name: Enable RAID metrics timer
+- name: Enable and start storcli metrics timer
   systemd:
-    name: export-megaraid-metrics.timer
+    name: export-storcli-metrics.timer
     enabled: yes
     state: started
-    daemon_reload: yes
\ No newline at end of file
+    daemon_reload: yes
+  become: yes
+
+# ========== 5. Verification ==========
+- name: Verify pve_exporter is running
+  systemd:
+    name: pve_exporter
+    state: started
+  become: yes
+  register: pve_exporter_status
+  failed_when: pve_exporter_status.status.ActiveState != "active"
+
+- name: Verify storcli metrics collection
+  command: /usr/local/bin/export-storcli-metrics.sh
+  become: yes
+  register: storcli_export_result
+  changed_when: false
+  failed_when: storcli_export_result.rc != 0
+
+- name: Check that metrics file was created
+  stat:
+    path: /var/lib/node_exporter/textfile_collector/storcli.prom
+  register: metrics_file
+  failed_when: not metrics_file.stat.exists
+  become: yes