Update 5 files
- /roles/pve_monitoring/tasks/main.yml - /roles/pve_monitoring/templates/pve_exporter_config.yml.j2 - /inventories/hosts - /olimp-deploy.yml - /group_vars/all.yml
This commit is contained in:
parent
6596c2263f
commit
00570077d9
@ -37,6 +37,10 @@ custom_directories:
|
|||||||
cleanup_packages:
|
cleanup_packages:
|
||||||
- gparted
|
- gparted
|
||||||
|
|
||||||
|
pve_exporter_user: "pve_exporter@pve"
|
||||||
|
pve_exporter_token_name: "grafana"
|
||||||
|
# NOTE(review): this Proxmox API token secret is committed in plaintext.
# Consider moving it to an encrypted store (e.g. Ansible Vault) and rotating
# the token, since it is now part of the repository history.
pve_exporter_token_value: "93f61884-7c2f-40b6-ae6c-ab36a4eba467"
|
||||||
|
|
||||||
# ------------ gateway (192.168.1.201) ------------
|
# ------------ gateway (192.168.1.201) ------------
|
||||||
npm_base_dir: "/opt/npm"
|
npm_base_dir: "/opt/npm"
|
||||||
npm_data_dir: "/opt/npm/data"
|
npm_data_dir: "/opt/npm/data"
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
[infra]
|
[infra]
|
||||||
|
proxmox ansible_host=192.168.1.200 int_ip=192.168.1.200 ansible_python_interpreter=/usr/bin/python3
|
||||||
gateway ansible_host=192.168.1.201 int_ip=192.168.1.201 ansible_python_interpreter=/usr/bin/python3
|
gateway ansible_host=192.168.1.201 int_ip=192.168.1.201 ansible_python_interpreter=/usr/bin/python3
|
||||||
data ansible_host=192.168.1.202 int_ip=192.168.1.202 ansible_python_interpreter=/usr/bin/python3
|
data ansible_host=192.168.1.202 int_ip=192.168.1.202 ansible_python_interpreter=/usr/bin/python3
|
||||||
media ansible_host=192.168.1.203 int_ip=192.168.1.203 ansible_python_interpreter=/usr/bin/python3
|
media ansible_host=192.168.1.203 int_ip=192.168.1.203 ansible_python_interpreter=/usr/bin/python3
|
||||||
@ -9,6 +10,9 @@ games ansible_host=192.168.1.207 int_ip=192.168.1.207 ansible_python_i
|
|||||||
manage ansible_host=192.168.1.228 int_ip=192.168.1.228 ansible_python_interpreter=/usr/bin/python3
|
manage ansible_host=192.168.1.228 int_ip=192.168.1.228 ansible_python_interpreter=/usr/bin/python3
|
||||||
#gitlab ansible_host=192.168.1.229 int_ip=192.168.1.229 ansible_python_interpreter=/usr/bin/python3
|
#gitlab ansible_host=192.168.1.229 int_ip=192.168.1.229 ansible_python_interpreter=/usr/bin/python3
|
||||||
|
|
||||||
|
[pve-server]
|
||||||
|
proxmox
|
||||||
|
|
||||||
[gateway-server]
|
[gateway-server]
|
||||||
gateway
|
gateway
|
||||||
|
|
||||||
|
|||||||
@ -4,6 +4,10 @@
|
|||||||
- {role: base_setup, tags: deploy_base}
|
- {role: base_setup, tags: deploy_base}
|
||||||
- {role: system_cleanup, tags: deploy_cleanup}
|
- {role: system_cleanup, tags: deploy_cleanup}
|
||||||
|
|
||||||
|
# Play for the Proxmox host group: applies the monitoring role.
# Selectable on its own via the deploy_pve_monitoring tag.
- hosts: pve-server
  roles:
    - role: pve_monitoring
      tags: deploy_pve_monitoring
|
||||||
|
|
||||||
- hosts: gateway-server
|
- hosts: gateway-server
|
||||||
roles:
|
roles:
|
||||||
- { role: docker, tags: deploy_docker }
|
- { role: docker, tags: deploy_docker }
|
||||||
|
|||||||
210
roles/pve_monitoring/tasks/main.yml
Normal file
210
roles/pve_monitoring/tasks/main.yml
Normal file
@ -0,0 +1,210 @@
|
|||||||
|
---
|
||||||
|
# ========== 1. Install storcli (if missing) ==========
|
||||||
|
# Probe for the storcli binary; the result is stored in storcli_installed
# and used by later tasks to decide whether download/install is needed.
- name: Check if storcli is already installed
  register: storcli_installed
  stat:
    path: "/opt/MegaRAID/storcli/storcli64"
|
||||||
|
|
||||||
|
# Fetch the storcli .deb into /tmp, but only when the binary is missing
# (storcli_installed comes from the stat probe on the binary path).
- name: Download storcli
  when: not storcli_installed.stat.exists
  get_url:
    dest: "/tmp/storcli.deb"
    mode: "0644"
    url: "https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb"
|
||||||
|
|
||||||
|
# Install the downloaded package; skipped when the binary already exists.
- name: Install storcli
  when: not storcli_installed.stat.exists
  apt:
    state: present
    deb: "/tmp/storcli.deb"
|
||||||
|
|
||||||
|
# Smoke test: ask storcli to show controller 0. The call is read-only, so it
# is never reported as a change; a non-zero exit code fails the play.
- name: Verify storcli works
  command:
    argv:
      - /opt/MegaRAID/storcli/storcli64
      - /c0
      - show
  register: storcli_test
  failed_when: storcli_test.rc != 0
  changed_when: false
|
||||||
|
|
||||||
|
|
||||||
|
# ========== 2. Configure pve_exporter (Python) ==========
|
||||||
|
# Interpreter, pip and the venv module are needed to build the exporter's
# virtualenv.
- name: Install Python dependencies
  apt:
    state: present
    name: [python3, python3-pip, python3-venv]
|
||||||
|
|
||||||
|
# Dedicated system account for the exporter: no login shell, no home directory.
- name: Create pve_exporter user
  user:
    name: pve_exporter
    shell: /usr/sbin/nologin
    system: true
    create_home: false
|
||||||
|
|
||||||
|
# Installation root and config directory, both owned by the exporter account.
- name: Create pve_exporter directories
  loop: [/opt/pve_exporter, /opt/pve_exporter/config]
  loop_control:
    loop_var: exporter_dir
  file:
    state: directory
    path: "{{ exporter_dir }}"
    mode: "0755"
    group: pve_exporter
    owner: pve_exporter
|
||||||
|
|
||||||
|
# Build the exporter's virtualenv and install the exporter into it.
#
# * `shell` is used instead of `command`: the three steps are chained with
#   `&&`, and `command` does not run through a shell, so the operators would be
#   handed to python3 as literal arguments.
# * The PyPI distribution that ships the `pve_exporter` console script is
#   `prometheus-pve-exporter`.
# * `creates` makes the task idempotent: it is skipped once the console script
#   exists, and re-run if an earlier attempt failed before pip finished.
# * Runs as the unprivileged pve_exporter account so the venv is owned by it.
- name: Deploy pve_exporter venv and install package
  shell: |
    python3 -m venv /opt/pve_exporter/venv &&
    /opt/pve_exporter/venv/bin/pip install --upgrade pip &&
    /opt/pve_exporter/venv/bin/pip install prometheus-pve-exporter
  args:
    chdir: /opt/pve_exporter
    creates: /opt/pve_exporter/venv/bin/pve_exporter
  become: true
  become_user: pve_exporter
|
||||||
|
|
||||||
|
# Render the exporter configuration from the role template. The file carries
# API credentials, so it is readable only by the exporter account (0600).
- name: Deploy pve_exporter config (with vault secrets)
  template:
    dest: "/opt/pve_exporter/config/config.yml"
    src: "pve_exporter_config.yml.j2"
    mode: "0600"
    group: pve_exporter
    owner: pve_exporter
|
||||||
|
|
||||||
|
# Install the systemd unit that runs the exporter as the pve_exporter account.
#
# The command line uses the option names of current prometheus-pve-exporter
# releases (the package is installed unpinned, so the latest release is what
# ends up in the venv): `--config.file` for the configuration path and
# `--web.listen-address` for the bind address and port. The previous
# `--server` / `--port` / `--config` spellings are not options of the exporter.
# Listening address and port (0.0.0.0:9223) are unchanged.
- name: Create pve_exporter systemd service
  copy:
    dest: /etc/systemd/system/pve_exporter.service
    mode: '0644'
    content: |
      [Unit]
      Description=Proxmox VE Exporter
      After=network.target

      [Service]
      Type=simple
      User=pve_exporter
      WorkingDirectory=/opt/pve_exporter
      ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \
        --config.file /opt/pve_exporter/config/config.yml \
        --web.listen-address 0.0.0.0:9223
      Restart=always
      RestartSec=10

      [Install]
      WantedBy=multi-user.target
|
||||||
|
|
||||||
|
# Reload unit files so systemd sees the unit, then enable it at boot and
# make sure it is running now.
- name: Enable and start pve_exporter
  systemd:
    daemon_reload: true
    name: pve_exporter
    state: started
    enabled: true
|
||||||
|
|
||||||
|
|
||||||
|
# ========== 3. RAID monitoring via storcli + node_exporter textfile ==========
|
||||||
|
# Directory the RAID metrics file is written into.
- name: Ensure node_exporter textfile dir exists
  file:
    state: directory
    path: "/var/lib/node_exporter/textfile_collector"
    mode: "0755"
    group: node_exporter
    owner: node_exporter
  ignore_errors: true  # do not fail if node_exporter is not installed yet
|
||||||
|
|
||||||
|
# Install the script that converts `storcli ... show all` output into a
# Prometheus textfile (megaraid.prom). Metric names and labels are unchanged.
#
# Fixes compared with the previous script:
# * No three-argument match(): that form is a gawk extension and fails on
#   other awk implementations (e.g. mawk); the slot is derived with sub().
# * Values are taken from the text after "=" instead of `$2`, which for a line
#   such as "Media Error Count = 0" is the word "Error", not the value.
# * A drive is emitted once: storcli prints several "Drive /cX/eY/sZ ..."
#   headings per drive, so series are flushed only when the drive path changes.
# * DID and state are read from the drive summary row ("EID:Slt DID State ...").
#   NOTE(review): parsing assumes the usual storcli text layout - confirm
#   against the output of the installed storcli version.
# * The temp file is created next to the target (same-filesystem rename), is
#   made world-readable before the move, and is removed if the script aborts.
- name: Deploy storcli → Prometheus metrics script
  copy:
    dest: /usr/local/bin/export-megaraid-metrics.sh
    mode: '0755'
    content: |
      #!/bin/bash
      # Export per-drive MegaRAID metrics for the node_exporter textfile collector.
      set -eu

      OUT=/var/lib/node_exporter/textfile_collector/megaraid.prom
      # Stage next to the target so the final mv is a rename on one filesystem.
      # The staged name does not end in .prom, so it is not picked up half-written.
      TMP=$(mktemp "${OUT}.XXXXXX")
      trap 'rm -f "$TMP"' EXIT

      {
        echo "# HELP megaraid_disk_temperature_celsius Disk temperature from MegaRAID (°C)"
        echo "# TYPE megaraid_disk_temperature_celsius gauge"
        echo "# HELP megaraid_disk_state 1=Online, 0=Offline/Failed"
        echo "# TYPE megaraid_disk_state gauge"
        echo "# HELP megaraid_disk_media_errors_total Media errors"
        echo "# TYPE megaraid_disk_media_errors_total counter"
        echo "# HELP megaraid_disk_other_errors_total Other errors"
        echo "# TYPE megaraid_disk_other_errors_total counter"
        echo "# HELP megaraid_disk_predictive_failures_total Predictive failure count"
        echo "# TYPE megaraid_disk_predictive_failures_total counter"
      } > "$TMP"

      /opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all 2>/dev/null | \
      awk '
        # Print every series for the drive collected so far (no-op before the first drive).
        function emit_drive() {
          if (drive == "") return
          sv = (state == "Onln") ? 1 : 0
          gsub(/[^a-zA-Z0-9._-]/, "_", model)
          printf "megaraid_disk_temperature_celsius{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", slot, did, model, temp
          printf "megaraid_disk_state{slot=\"%s\",did=\"%s\",model=\"%s\",state=\"%s\"} %s\n", slot, did, model, state, sv
          printf "megaraid_disk_media_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", slot, did, model, me
          printf "megaraid_disk_other_errors_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", slot, did, model, oe
          printf "megaraid_disk_predictive_failures_total{slot=\"%s\",did=\"%s\",model=\"%s\"} %s\n", slot, did, model, pf
        }

        # Forget the per-drive values before starting the next drive.
        function reset_drive() {
          did = ""; model = ""; state = ""; temp = -1; me = 0; oe = 0; pf = 0
        }

        # Return the trimmed text after the first "=" of a "Label = value" line.
        function field_value(line) {
          sub(/^[^=]*=[ \t]*/, "", line)
          sub(/[ \t]+$/, "", line)
          return line
        }

        BEGIN { drive = ""; slot = ""; reset_drive() }

        # Section heading such as "Drive /c0/e252/s3 :"; repeated several times per drive.
        /^Drive \/c[0-9]+\/e[0-9]+\/s[0-9]+/ {
          path = $2
          sub(/[^\/a-z0-9].*$/, "", path)
          if (path != drive) {
            emit_drive()
            reset_drive()
            drive = path
            slot = path
            sub(/^.*\/s/, "", slot)
          }
          next
        }

        drive == "" { next }

        # Summary row "EID:Slt DID State ...": second column is DID, third is state.
        $1 ~ /^[0-9]*:[0-9]+$/ { did = $2; state = $3; next }

        /^Model Number[ \t]*=/ { model = field_value($0) }
        /^Drive Temperature[ \t]*=/ {
          t = field_value($0)
          sub(/[^0-9].*$/, "", t)
          temp = (t == "" ? -1 : t)
        }
        /^Media Error Count[ \t]*=/ { me = field_value($0) }
        /^Other Error Count[ \t]*=/ { oe = field_value($0) }
        /^Predictive Failure Count[ \t]*=/ { pf = field_value($0) }

        END { emit_drive() }
      ' >> "$TMP"

      # mktemp creates the file 0600; make it readable before publishing it.
      chmod 0644 "$TMP"
      mv "$TMP" "$OUT"
      chown node_exporter:node_exporter "$OUT" 2>/dev/null || true
|
||||||
|
|
||||||
|
# Sudoers drop-in letting node_exporter run storcli without a password.
# * The runas spec is limited to root instead of ALL, since storcli only needs
#   root.
# * The content ends with a newline so the drop-in is a well-formed text file.
# * Owner/group are set explicitly and visudo is called by absolute path, so
#   validation does not depend on the PATH of the Ansible session.
- name: Allow node_exporter to run storcli
  copy:
    content: "node_exporter ALL=(root) NOPASSWD: /opt/MegaRAID/storcli/storcli64\n"
    dest: /etc/sudoers.d/node_exporter_storcli
    owner: root
    group: root
    mode: '0440'
    validate: '/usr/sbin/visudo -cf %s'
|
||||||
|
|
||||||
|
# One-shot unit that regenerates the MegaRAID metrics file.
#
# The unit previously ran `sudo <script>` as node_exporter, but the sudoers
# drop-in in this role only whitelists the storcli64 binary, so sudo would
# refuse to start the script. With no User= set, systemd runs the unit as root,
# which is what storcli needs; the script itself chowns the resulting file to
# node_exporter.
- name: Create RAID export service
  copy:
    dest: /etc/systemd/system/export-megaraid-metrics.service
    mode: '0644'
    content: |
      [Unit]
      Description=Export MegaRAID metrics
      After=network.target

      [Service]
      Type=oneshot
      ExecStart=/usr/local/bin/export-megaraid-metrics.sh
|
||||||
|
|
||||||
|
# Timer unit: first run 60 s after boot, then 300 s after each activation.
- name: Create RAID export timer (every 5 min)
  copy:
    dest: "/etc/systemd/system/export-megaraid-metrics.timer"
    content: |
      [Unit]
      Description=Run RAID metric export every 5 minutes
      [Timer]
      OnBootSec=60
      OnUnitActiveSec=300
      [Install]
      WantedBy=timers.target
|
||||||
|
|
||||||
|
# Reload unit files, then enable the timer at boot and start it now.
- name: Enable RAID metrics timer
  systemd:
    daemon_reload: true
    name: export-megaraid-metrics.timer
    state: started
    enabled: true
|
||||||
@ -0,0 +1,6 @@
|
|||||||
|
# prometheus-pve-exporter module definition.
# The keys of a module are the Proxmox API credentials, so the account goes in
# `user` (not `username`). The Proxmox host is not configured here.
# NOTE(review): the module is named "pve" rather than "default"; presumably the
# Prometheus scrape job has to select it with `module=pve` - confirm against
# the scrape configuration.
pve:
  user: "{{ pve_exporter_user }}"
  token_name: "{{ pve_exporter_token_name }}"
  token_value: "{{ pve_exporter_token_value }}"
  verify_ssl: false
|
||||||
Loading…
Reference in New Issue
Block a user