From 00570077d9aae8b02c84f42fcca052cc2633f232 Mon Sep 17 00:00:00 2001
From: Administrator
Date: Fri, 14 Nov 2025 05:04:02 +0000
Subject: [PATCH] Update 5 files

- /roles/pve_monitoring/tasks/main.yml
- /roles/pve_monitoring/templates/pve_exporter_config.yml.j2
- /inventories/hosts
- /olimp-deploy.yml
- /group_vars/all.yml
---
Review notes (text in this section is ignored by git am):
- group_vars/all.yml: the API token is referenced from ansible-vault instead
  of being stored in plain text. Define vault_pve_exporter_token_value in an
  encrypted vars file and revoke any token that was committed earlier.
- tasks/main.yml: the venv is built with the pip module (the command module
  runs no shell, so "&&" chains are not interpreted); the package installed is
  prometheus-pve-exporter, which provides venv/bin/pve_exporter.
- tasks/main.yml: the RAID export unit runs the script directly as
  node_exporter and the script elevates only storcli through sudo, matching
  the sudoers rule.
- tasks/main.yml: the awk parser reads the value after "=" / ":" rather than
  the second whitespace field and uses POSIX awk only (no gawk match() array).
- NOTE(review): the storcli text layout is assumed; check the script against
  real output on the target host.
- NOTE(review): index blob ids are those of the original commit, and context
  whitespace could not be recovered exactly; apply with --ignore-whitespace
  if needed.

 group_vars/all.yml                            |   5 +
 inventories/hosts                             |   4 +
 olimp-deploy.yml                              |   4 +
 roles/pve_monitoring/tasks/main.yml           | 235 ++++++++++++++++++
 .../templates/pve_exporter_config.yml.j2      |   5 +
 5 files changed, 253 insertions(+)
 create mode 100644 roles/pve_monitoring/tasks/main.yml
 create mode 100644 roles/pve_monitoring/templates/pve_exporter_config.yml.j2

diff --git a/group_vars/all.yml b/group_vars/all.yml
index 963c5dd..5d58103 100644
--- a/group_vars/all.yml
+++ b/group_vars/all.yml
@@ -37,6 +37,11 @@ custom_directories:
 cleanup_packages:
   - gparted
 
+pve_exporter_user: "pve_exporter@pve"
+pve_exporter_token_name: "grafana"
+# Secret: supply vault_pve_exporter_token_value from an ansible-vault file.
+pve_exporter_token_value: "{{ vault_pve_exporter_token_value }}"
+
 # ------------ gateway (192.168.1.201) ------------
 npm_base_dir: "/opt/npm"
 npm_data_dir: "/opt/npm/data"
diff --git a/inventories/hosts b/inventories/hosts
index 83d4554..6c3466f 100644
--- a/inventories/hosts
+++ b/inventories/hosts
@@ -1,4 +1,5 @@
 [infra]
+proxmox ansible_host=192.168.1.200 int_ip=192.168.1.200 ansible_python_interpreter=/usr/bin/python3
 gateway ansible_host=192.168.1.201 int_ip=192.168.1.201 ansible_python_interpreter=/usr/bin/python3
 data ansible_host=192.168.1.202 int_ip=192.168.1.202 ansible_python_interpreter=/usr/bin/python3
 media ansible_host=192.168.1.203 int_ip=192.168.1.203 ansible_python_interpreter=/usr/bin/python3
@@ -9,6 +10,9 @@ games ansible_host=192.168.1.207 int_ip=192.168.1.207 ansible_python_i
 manage ansible_host=192.168.1.228 int_ip=192.168.1.228 ansible_python_interpreter=/usr/bin/python3
 #gitlab ansible_host=192.168.1.229 int_ip=192.168.1.229 ansible_python_interpreter=/usr/bin/python3
 
+[pve-server]
+proxmox
+
 [gateway-server]
 gateway
 
diff --git a/olimp-deploy.yml b/olimp-deploy.yml
index 9893fa7..66b28f2 100644
--- a/olimp-deploy.yml
+++ b/olimp-deploy.yml
@@ -4,6 +4,10 @@
     - {role: base_setup, tags: deploy_base}
     - {role: system_cleanup, tags: deploy_cleanup}
 
+- hosts: pve-server
+  roles:
+    - { role: pve_monitoring, tags: deploy_pve_monitoring }
+
 - hosts: gateway-server
   roles:
     - { role: docker, tags: deploy_docker }
diff --git a/roles/pve_monitoring/tasks/main.yml b/roles/pve_monitoring/tasks/main.yml
new file mode 100644
index 0000000..0c3730f
--- /dev/null
+++ b/roles/pve_monitoring/tasks/main.yml
@@ -0,0 +1,235 @@
+---
+# ========== 1. Install storcli (if missing) ==========
+- name: Check if storcli is already installed
+  stat:
+    path: /opt/MegaRAID/storcli/storcli64
+  register: storcli_installed
+
+- name: Download storcli
+  get_url:
+    url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb
+    dest: /tmp/storcli.deb
+    mode: '0644'
+  when: not storcli_installed.stat.exists
+
+- name: Install storcli
+  apt:
+    deb: /tmp/storcli.deb
+    state: present
+  when: not storcli_installed.stat.exists
+
+- name: Verify storcli works
+  command: /opt/MegaRAID/storcli/storcli64 /c0 show
+  register: storcli_test
+  changed_when: false
+  failed_when: storcli_test.rc != 0
+
+
+# ========== 2. Set up pve_exporter (Python) ==========
+- name: Install Python dependencies
+  apt:
+    name:
+      - python3
+      - python3-pip
+      - python3-venv
+    state: present
+
+- name: Create pve_exporter user
+  user:
+    name: pve_exporter
+    system: true
+    shell: /usr/sbin/nologin
+    create_home: false
+
+- name: Create pve_exporter directories
+  file:
+    path: "{{ item }}"
+    state: directory
+    owner: pve_exporter
+    group: pve_exporter
+    mode: '0755'
+  loop:
+    - /opt/pve_exporter
+    - /opt/pve_exporter/config
+
+# The command module runs no shell, so operators such as && are not
+# interpreted; the pip module creates the venv and installs idempotently.
+- name: Deploy pve_exporter venv and install package
+  pip:
+    name: prometheus-pve-exporter
+    virtualenv: /opt/pve_exporter/venv
+    virtualenv_command: python3 -m venv
+  become: true
+  become_user: pve_exporter
+
+- name: Deploy pve_exporter config (with vault secrets)
+  template:
+    src: pve_exporter_config.yml.j2
+    dest: /opt/pve_exporter/config/config.yml
+    owner: pve_exporter
+    group: pve_exporter
+    mode: '0600'
+  no_log: true  # the rendered file contains the API token
+
+- name: Create pve_exporter systemd service
+  copy:
+    content: |
+      [Unit]
+      Description=Proxmox VE Exporter
+      After=network.target
+
+      [Service]
+      Type=simple
+      User=pve_exporter
+      WorkingDirectory=/opt/pve_exporter
+      ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \
+        --config.file /opt/pve_exporter/config/config.yml \
+        --web.listen-address 0.0.0.0:9223
+      Restart=always
+      RestartSec=10
+
+      [Install]
+      WantedBy=multi-user.target
+    dest: /etc/systemd/system/pve_exporter.service
+    mode: '0644'
+
+- name: Enable and start pve_exporter
+  systemd:
+    name: pve_exporter
+    enabled: true
+    state: started
+    daemon_reload: true
+
+
+# ========== 3. RAID monitoring via storcli + node_exporter textfile ==========
+- name: Ensure node_exporter textfile dir exists
+  file:
+    path: /var/lib/node_exporter/textfile_collector
+    state: directory
+    owner: node_exporter
+    group: node_exporter
+    mode: '0755'
+  ignore_errors: true  # do not fail if node_exporter is not installed yet
+
+- name: Deploy storcli → Prometheus metrics script
+  copy:
+    content: |
+      #!/bin/bash
+      # Export MegaRAID per-disk metrics for the node_exporter textfile collector.
+      # Runs unprivileged; only the storcli call is elevated through sudo.
+      set -euo pipefail
+
+      OUT=/var/lib/node_exporter/textfile_collector/megaraid.prom
+      # Temp file sits next to OUT so the final mv is a same-filesystem rename.
+      TMP=$(mktemp "${OUT}.XXXXXX")
+      trap 'rm -f "$TMP"' EXIT
+
+      echo "# HELP megaraid_disk_temperature_celsius Disk temperature from MegaRAID (°C)" > "$TMP"
+      echo "# TYPE megaraid_disk_temperature_celsius gauge" >> "$TMP"
+      echo "# HELP megaraid_disk_state 1=Online, 0=Offline/Failed" >> "$TMP"
+      echo "# TYPE megaraid_disk_state gauge" >> "$TMP"
+      echo "# HELP megaraid_disk_media_errors_total Media errors" >> "$TMP"
+      echo "# TYPE megaraid_disk_media_errors_total counter" >> "$TMP"
+      echo "# HELP megaraid_disk_other_errors_total Other errors" >> "$TMP"
+      echo "# TYPE megaraid_disk_other_errors_total counter" >> "$TMP"
+      echo "# HELP megaraid_disk_predictive_failures_total Predictive failure count" >> "$TMP"
+      echo "# TYPE megaraid_disk_predictive_failures_total counter" >> "$TMP"
+
+      # NOTE(review): the field layout parsed below is assumed from storcli text
+      # output; verify against real "storcli64 /c0/eall/sall show all" output.
+      sudo -n /opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all | \
+        awk '
+          # Text after the first "=" or ":" of a line, without surrounding blanks.
+          function field_value(line) {
+            sub(/^[^=:]*[=:][ \t]*/, "", line)
+            sub(/[ \t]+$/, "", line)
+            return line
+          }
+          function reset_drive() {
+            did = ""; model = ""; temp = -1; state = ""; me = 0; oe = 0; pf = 0
+          }
+          # Print the metrics of the drive collected so far, if any.
+          function emit_drive(    labels, online) {
+            if (slot == "") return
+            gsub(/[^a-zA-Z0-9._-]/, "_", model)
+            labels = "slot=\"" slot "\",did=\"" did "\",model=\"" model "\""
+            online = (state == "Onln" || state ~ /^Online/) ? 1 : 0
+            print "megaraid_disk_temperature_celsius{" labels "} " temp
+            print "megaraid_disk_state{" labels ",state=\"" state "\"} " online
+            print "megaraid_disk_media_errors_total{" labels "} " me
+            print "megaraid_disk_other_errors_total{" labels "} " oe
+            print "megaraid_disk_predictive_failures_total{" labels "} " pf
+          }
+          BEGIN { slot = ""; current = ""; reset_drive() }
+          /^Drive \// {
+            # A drive path can head several sections; only a new path starts a record.
+            if (match($0, /\/c[0-9]+(\/e[0-9]+)?\/s[0-9]+/)) {
+              path = substr($0, RSTART, RLENGTH)
+              if (path != current) {
+                emit_drive()
+                reset_drive()
+                current = path
+                slot = path
+                sub(/^.*\/s/, "", slot)
+              }
+            }
+            next
+          }
+          slot == "" { next }
+          # Summary table row: "EID:Slt DID State ..." (assumed column order).
+          /^[ \t]*[0-9]*:[0-9]+[ \t]/ { did = $2; state = $3; next }
+          /Model Number/ { model = field_value($0) }
+          /Drive Temperature/ { t = field_value($0); temp = (t ~ /^[0-9]+/) ? t + 0 : -1 }
+          /Firmware state/ { state = field_value($0) }
+          /Media Error Count/ { me = field_value($0) + 0 }
+          /Other Error Count/ { oe = field_value($0) + 0 }
+          /Predictive Failure Count/ { pf = field_value($0) + 0 }
+          END { emit_drive() }
+        ' >> "$TMP"
+
+      chmod 0644 "$TMP"
+      mv "$TMP" "$OUT"
+    dest: /usr/local/bin/export-megaraid-metrics.sh
+    mode: '0755'
+
+- name: Allow node_exporter to run storcli
+  copy:
+    content: |
+      node_exporter ALL=(root) NOPASSWD: /opt/MegaRAID/storcli/storcli64
+    dest: /etc/sudoers.d/node_exporter_storcli
+    validate: 'visudo -cf %s'
+    mode: '0440'
+
+- name: Create RAID export service
+  copy:
+    content: |
+      [Unit]
+      Description=Export MegaRAID metrics
+      After=network.target
+
+      [Service]
+      Type=oneshot
+      User=node_exporter
+      ExecStart=/usr/local/bin/export-megaraid-metrics.sh
+    dest: /etc/systemd/system/export-megaraid-metrics.service
+    mode: '0644'
+
+- name: Create RAID export timer (every 5 min)
+  copy:
+    content: |
+      [Unit]
+      Description=Run RAID metric export every 5 minutes
+      [Timer]
+      OnBootSec=60
+      OnUnitActiveSec=300
+      [Install]
+      WantedBy=timers.target
+    dest: /etc/systemd/system/export-megaraid-metrics.timer
+    mode: '0644'
+
+- name: Enable RAID metrics timer
+  systemd:
+    name: export-megaraid-metrics.timer
+    enabled: true
+    state: started
+    daemon_reload: true
diff --git a/roles/pve_monitoring/templates/pve_exporter_config.yml.j2 b/roles/pve_monitoring/templates/pve_exporter_config.yml.j2
new file mode 100644
index 0000000..64a1184
--- /dev/null
+++ b/roles/pve_monitoring/templates/pve_exporter_config.yml.j2
@@ -0,0 +1,5 @@
+default:
+  user: "{{ pve_exporter_user }}"
+  token_name: "{{ pve_exporter_token_name }}"
+  token_value: "{{ pve_exporter_token_value }}"
+  verify_ssl: false