---
# ========== 1. Preparation: install sudo and dependencies ==========

# sudo must exist on the target before any later task can escalate with it.
- name: Ensure sudo is installed (required for privilege escalation)
  apt:
    name: sudo
    state: present
    update_cache: true
    # Refresh the package index only when it is older than one hour, so
    # repeated runs do not hit the mirrors every time.
    cache_valid_time: 3600
  become: true

# Interpreter, pip and the venv module used to build the exporter virtualenv.
- name: Install Python dependencies
  apt:
    name:
      - python3
      - python3-pip
      - python3-venv
    state: present
  become: true

# ========== 2. Install storcli (if missing) ==========

# The download and install tasks below are skipped when the binary exists.
- name: Check if storcli is already installed
  stat:
    path: /opt/MegaRAID/storcli/storcli64
  register: storcli_installed
  become: true

# NOTE(review): no checksum is pinned for this download; add `checksum:`
# once the expected digest of the package is known.
- name: Download storcli
  get_url:
    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
    dest: /tmp/storcli.deb
    mode: '0644'
  when: not storcli_installed.stat.exists
  become: true

- name: Install storcli
  apt:
    deb: /tmp/storcli.deb
    state: present
  when: not storcli_installed.stat.exists
  become: true

# Read-only smoke test against controller 0; never reports a change.
- name: Verify storcli works
  command: /opt/MegaRAID/storcli/storcli64 /c0 show
  register: storcli_test
  changed_when: false
  failed_when: storcli_test.rc != 0
  become: true
# ========== 3. pve_exporter setup (Python) ==========

# Dedicated system account without login shell or home directory.
- name: Create pve_exporter user
  user:
    name: pve_exporter
    system: true
    shell: /usr/sbin/nologin
    create_home: false
  become: true

- name: Create pve_exporter directories
  file:
    path: "{{ item }}"
    state: directory
    owner: pve_exporter
    group: pve_exporter
    mode: '0755'
  loop:
    - /opt/pve_exporter
    - /opt/pve_exporter/config
  become: true

# `creates` makes this a no-op once the virtualenv interpreter exists.
- name: Create Python virtual environment
  command:
    cmd: python3 -m venv /opt/pve_exporter/venv
    creates: /opt/pve_exporter/venv/bin/python
  become: true
  become_user: pve_exporter

# The account has no home directory, so HOME points at the install dir to
# give pip a writable location. "changed" is reported only when pip
# actually installed something.
- name: Upgrade pip in virtual environment
  command:
    cmd: /opt/pve_exporter/venv/bin/pip install --upgrade pip
    chdir: /opt/pve_exporter
  register: pve_exporter_pip_upgrade
  changed_when: "'Successfully installed' in pve_exporter_pip_upgrade.stdout"
  become: true
  become_user: pve_exporter
  environment:
    HOME: /opt/pve_exporter

- name: Install prometheus-pve-exporter package
  command:
    cmd: /opt/pve_exporter/venv/bin/pip install prometheus-pve-exporter
    chdir: /opt/pve_exporter
  register: pve_exporter_pkg
  changed_when: "'Successfully installed' in pve_exporter_pkg.stdout"
  become: true
  become_user: pve_exporter
  environment:
    HOME: /opt/pve_exporter

# Mode 0600: the rendered file is readable by the service account only.
- name: Deploy pve_exporter config
  template:
    src: pve_exporter_config.yml.j2
    dest: /opt/pve_exporter/config/config.yml
    owner: pve_exporter
    group: pve_exporter
    mode: '0600'
  register: pve_exporter_config
  become: true

# The package is installed unpinned, so the unit uses the option names of
# current prometheus-pve-exporter releases (--config.file and
# --web.listen-address). The listener stays on 0.0.0.0:9223.
- name: Create pve_exporter systemd service
  copy:
    content: |
      [Unit]
      Description=Proxmox VE Exporter
      After=network.target

      [Service]
      Type=simple
      User=pve_exporter
      WorkingDirectory=/opt/pve_exporter
      ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \
        --config.file /opt/pve_exporter/config/config.yml \
        --web.listen-address 0.0.0.0:9223
      Restart=always
      RestartSec=10

      [Install]
      WantedBy=multi-user.target
    dest: /etc/systemd/system/pve_exporter.service
    mode: '0644'
  register: pve_exporter_unit
  become: true

# Restart instead of a plain start whenever the package, config or unit
# file changed in this run; otherwise the old process would keep running
# with stale settings.
- name: Enable and start pve_exporter
  systemd:
    name: pve_exporter
    enabled: true
    state: >-
      {{ 'restarted' if (pve_exporter_pkg is changed
      or pve_exporter_config is changed
      or pve_exporter_unit is changed) else 'started' }}
    daemon_reload: true
  become: true
# ========== 4. RAID monitoring via storcli + node_exporter textfile ==========

- name: Ensure node_exporter textfile dir exists
  file:
    path: /var/lib/node_exporter/textfile_collector
    state: directory
    owner: node_exporter
    group: node_exporter
    mode: '0755'
  # Do not fail when node_exporter (and its user) is not installed yet.
  ignore_errors: true
  become: true

# The script parses `storcli64 /c0/eall/sall show all`:
#   * "Drive /cX/eY/sZ ..." headers repeat several times per drive, so a
#     drive is emitted only when the path changes (and once more at EOF);
#   * DID and state come from the "EID:Slt DID State ..." table row;
#   * counters, temperature and model come from "Key = Value" lines.
# The awk program sticks to POSIX features so it also runs under mawk.
# Caution: `content` is passed through Jinja, so the script must not contain
# a brace immediately followed by another brace, a percent sign or a hash.
- name: Deploy storcli → Prometheus metrics script
  copy:
    content: |
      #!/bin/bash
      # Export per-disk storcli data as node_exporter textfile metrics.
      set -euo pipefail

      STORCLI=/opt/MegaRAID/storcli/storcli64
      OUT=/var/lib/node_exporter/textfile_collector/storcli.prom

      # storcli needs root: call it directly as root, otherwise go through
      # the passwordless sudoers rule for this exact command line.
      if [ "$(id -u)" -eq 0 ]; then
        RUN=("$STORCLI")
      else
        RUN=(sudo -n "$STORCLI")
      fi

      # Build the file next to its destination so the final mv is a rename
      # on the same filesystem. The collector only reads names ending in
      # .prom, so the temporary file is ignored while it is being written.
      TMP=$(mktemp "${OUT}.XXXXXX")
      trap 'rm -f "$TMP"' EXIT

      cat > "$TMP" <<'EOF'
      # HELP storcli_disk_temp_celsius Disk temperature from storcli (°C)
      # TYPE storcli_disk_temp_celsius gauge
      # HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild
      # TYPE storcli_disk_state gauge
      # HELP storcli_disk_media_errors_total Media errors
      # TYPE storcli_disk_media_errors_total counter
      # HELP storcli_disk_other_errors_total Other errors
      # TYPE storcli_disk_other_errors_total counter
      # HELP storcli_disk_predictive_failures_total Predictive failure count
      # TYPE storcli_disk_predictive_failures_total counter
      EOF

      # stderr is left alone so failures show up in the journal; with
      # pipefail a storcli or sudo error aborts before the old file is
      # replaced.
      "${RUN[@]}" /c0/eall/sall show all | awk '
        function trim(s) {
          gsub(/^[ \t]+|[ \t]+$/, "", s)
          return s
        }

        # Print all metrics of the drive collected so far.
        function emit(    labels, num) {
          if (path == "") return
          gsub(/[^a-zA-Z0-9._-]/, "_", model)
          labels = "controller=\"" ctrl "\",enclosure=\"" encl "\""
          labels = labels ",slot=\"" slot "\",did=\"" did "\""
          labels = labels ",model=\"" model "\""
          if (temp != "") print "storcli_disk_temp_celsius{" labels "} " temp
          num = 0
          if (state ~ /Onln/) num = 2
          else if (state ~ /UGood/) num = 1
          else if (state ~ /Failed/) num = 3
          else if (state ~ /Rbld|Rebuild/) num = 4
          print "storcli_disk_state{" labels ",state=\"" state "\"} " num
          print "storcli_disk_media_errors_total{" labels "} " me
          print "storcli_disk_other_errors_total{" labels "} " oe
          print "storcli_disk_predictive_failures_total{" labels "} " pf
        }

        # Section header: a new path starts a new drive, repeats are skipped.
        $1 == "Drive" && match($2, /^\/c[0-9]+\/e[0-9]+\/s[0-9]+/) {
          drive = substr($2, RSTART, RLENGTH)
          if (drive != path) {
            emit()
            path = drive
            split(path, part, "/")
            ctrl = substr(part[2], 2)
            encl = substr(part[3], 2)
            slot = substr(part[4], 2)
            did = ""; model = ""; state = ""; temp = ""
            me = 0; oe = 0; pf = 0
          }
          next
        }

        # Ignore everything that precedes the first drive section.
        path == "" { next }

        # Summary table row, e.g. "252:0  8 Onln 0 3.637 TB SATA HDD ...".
        $1 ~ /^[0-9]+:[0-9]+$/ {
          did = $2
          state = $3
          next
        }

        # Detail lines have the form "Key = Value".
        {
          eq = index($0, "=")
          if (eq == 0) next
          key = trim(substr($0, 1, eq - 1))
          val = trim(substr($0, eq + 1))
          if (key == "Media Error Count") me = val + 0
          else if (key == "Other Error Count") oe = val + 0
          else if (key == "Predictive Failure Count") pf = val + 0
          else if (key == "Model Number") model = val
          else if (key == "Drive Temperature" && val ~ /^[0-9]+/) temp = val + 0
        }

        END { emit() }
      ' >> "$TMP"

      chmod 644 "$TMP"
      chown node_exporter:node_exporter "$TMP" 2>/dev/null || true
      mv "$TMP" "$OUT"
    dest: /usr/local/bin/export-storcli-metrics.sh
    mode: '0755'
  become: true

# Least privilege: only the read-only query issued by the script is allowed,
# and only as root. The argument list must match the script's call exactly.
- name: Allow node_exporter to run storcli
  copy:
    content: |
      node_exporter ALL=(root) NOPASSWD: /opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all
    dest: /etc/sudoers.d/node_exporter_storcli
    validate: 'visudo -cf %s'
    mode: '0440'
  become: true

# The unit runs the script as node_exporter; the script itself escalates
# for the single storcli call, which is what the sudoers rule permits.
- name: Create storcli export service
  copy:
    content: |
      [Unit]
      Description=Export storcli metrics for Prometheus
      After=network.target

      [Service]
      Type=oneshot
      User=node_exporter
      ExecStart=/usr/local/bin/export-storcli-metrics.sh
    dest: /etc/systemd/system/export-storcli-metrics.service
    mode: '0644'
  become: true

- name: Create storcli export timer (every 2 minutes)
  copy:
    content: |
      [Unit]
      Description=Run storcli metric export every 2 minutes

      [Timer]
      OnBootSec=60
      OnUnitActiveSec=120

      [Install]
      WantedBy=timers.target
    dest: /etc/systemd/system/export-storcli-metrics.timer
    mode: '0644'
  become: true

- name: Enable and start storcli metrics timer
  systemd:
    name: export-storcli-metrics.timer
    enabled: true
    state: started
    daemon_reload: true
  become: true
# ========== 5. Verification ==========

# NOTE(review): the module's returned `status` appears to be sampled before
# the module acts, so a unit that this very task starts would still look
# inactive on the first attempt. Polling with `until` lets a later attempt
# see the settled state; the task fails once the retries are used up.
- name: Verify pve_exporter is running
  systemd:
    name: pve_exporter
    state: started
  register: pve_exporter_status
  until: pve_exporter_status.status.ActiveState == "active"
  retries: 5
  delay: 3
  become: true

# A Type=simple unit counts as active as soon as the process is spawned, so
# also confirm that the exporter really accepts connections on its port.
- name: Verify pve_exporter is listening on port 9223
  wait_for:
    host: 127.0.0.1
    port: 9223
    timeout: 30

# Runs the export script once as root; never reports a change.
- name: Verify storcli metrics collection
  command: /usr/local/bin/export-storcli-metrics.sh
  register: storcli_export_result
  changed_when: false
  failed_when: storcli_export_result.rc != 0
  become: true

- name: Check that metrics file was created
  stat:
    path: /var/lib/node_exporter/textfile_collector/storcli.prom
  register: metrics_file
  failed_when: not metrics_file.stat.exists
  become: true