diff --git a/group_vars/all.yml b/group_vars/all.yml index 5d58103..66376c6 100644 --- a/group_vars/all.yml +++ b/group_vars/all.yml @@ -27,19 +27,26 @@ base_packages: - iftop - ntp - pv + - jq + - unzip system_scripts: [] custom_directories: - /opt/scripts - /etc/apt/keyrings +ssh_public_keys: + - "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHvRBW+2Xpck2tznhWJyls5J/4wUoVYdyFM6JTU7uogK ansible@olimp" + - "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCbvnGZxQEGYuScClONbkbfVn2+Uo1kYYztXqMf9ku1lHkw+7IZa00LOMwv7QGBRvrtBcw+TWqaMst5FZ3R6oWcQc+nkBEYoRXe4f3AuuFAl9C9F6sEYM8fX6mAHIlWQhFyVslazZtVTQwnfRV0rnbtCduCu9liywM3fShFqBVwq7Y4nBjG648Zq+VfCHpbBE9XkZaMDyeOXdtppmLetywnBS33mbXMDgH09PMlRz097xfZLkpFdSi8WtDOtKSBiEHtZ+H0EZ42Cda2xMnqlgVtPxWGUirvv6CvDyTmuMzrjALZoSKhl3iD6Szd1YOJcAw6bv9gbJKxPkZchrB65ZXT ZailonOlimp" + + # Удаляем мусорные пакеты везде cleanup_packages: - gparted pve_exporter_user: "pve_exporter@pve" pve_exporter_token_name: "grafana" -pve_exporter_token_value: "93f61884-7c2f-40b6-ae6c-ab36a4eba467" +pve_exporter_token_value: "ae683c34-c539-4b08-b539-6c9b7e570411" # ------------ gateway (192.168.1.201) ------------ npm_base_dir: "/opt/npm" diff --git a/olimp-deploy.yml b/olimp-deploy.yml index 084efff..685384b 100644 --- a/olimp-deploy.yml +++ b/olimp-deploy.yml @@ -1,13 +1,13 @@ --- - hosts: all:!pve-server roles: - - {role: base_setup, tags: deploy_base} - - {role: system_cleanup, tags: deploy_cleanup} + - {role: base_setup, tags: deploy_base} + - {role: system_cleanup, tags: deploy_cleanup} - hosts: pve-server roles: - - { role: proxmox_base_setup, tags: deploy_proxmox_base } - - { role: pve_monitoring, tags: deploy_pve_monitoring } + - { role: proxmox_base_setup, tags: deploy_proxmox_base } + - { role: proxmox_monitoring, tags: deploy_proxmox_monitoring } - hosts: gateway-server roles: diff --git a/roles/base_setup/handlers/main.yml b/roles/base_setup/handlers/main.yml index 833fb1d..d544a25 100644 --- a/roles/base_setup/handlers/main.yml +++ 
b/roles/base_setup/handlers/main.yml @@ -1,14 +1,13 @@ --- - name: restart ssh - service: + systemd: name: ssh state: restarted + become: yes - -- name: Reboot system - reboot: - msg: "Reboot triggered by base setup" - connect_timeout: 5 - reboot_timeout: 300 - pre_reboot_delay: 0 - post_reboot_delay: 30 \ No newline at end of file +- name: restart node_exporter + systemd: + name: node_exporter + state: restarted + daemon_reload: yes + become: yes \ No newline at end of file diff --git a/roles/base_setup/tasks/main.yml b/roles/base_setup/tasks/main.yml index 8311c56..dd5aaff 100644 --- a/roles/base_setup/tasks/main.yml +++ b/roles/base_setup/tasks/main.yml @@ -4,17 +4,20 @@ upgrade: full update_cache: yes cache_valid_time: 3600 + become: yes - name: Install base packages apt: name: "{{ base_packages }}" state: present update_cache: yes + become: yes - name: Remove unused packages apt: autoremove: yes autoclean: yes + become: yes - name: Disable IPv6 via sysctl sysctl: @@ -26,12 +29,14 @@ loop: - { name: 'net.ipv6.conf.all.disable_ipv6', value: '1' } - { name: 'net.ipv6.conf.default.disable_ipv6', value: '1' } + become: yes - name: Ensure /root/.bashrc exists file: path: /root/.bashrc state: touch mode: '0644' + become: yes - name: Add custom aliases and environment to ~/.bashrc blockinfile: @@ -75,15 +80,18 @@ export HISTTIMEFORMAT='%F %T ' owner: root mode: '0644' + become: yes - name: Configure timezone timezone: name: "{{ timezone }}" + become: yes - name: Configure locale locale_gen: name: "{{ system_locale }}" state: present + become: yes - name: Set default locale lineinfile: @@ -91,34 +99,37 @@ line: "LANG={{ system_locale }}" state: present create: yes + become: yes - name: Ensure required directories exist file: path: "{{ item }}" state: directory mode: '0755' - loop: - - /opt/scripts - - /etc/apt/keyrings + loop: "{{ custom_directories | default([]) }}" + become: yes - name: Install Python requests library (if needed) apt: name: python3-requests state: 
present when: ansible_connection != "local" + become: yes - name: Ensure SSH directory exists file: path: /root/.ssh state: directory mode: '0700' + become: yes - name: Add authorized key for root authorized_key: user: root state: present key: "{{ item }}" - loop: "{{ ssh_public_keys | default([]) }}" + loop: "{{ ssh_public_keys }}" + become: yes - name: Configure SSH security lineinfile: @@ -131,75 +142,154 @@ - { regexp: '^PermitRootLogin', line: 'PermitRootLogin prohibit-password' } - { regexp: '^PubkeyAuthentication', line: 'PubkeyAuthentication yes' } notify: restart ssh + become: yes - # ========== Node Exporter Installation ========== +# ========== Node Exporter Installation ========== - name: Create node_exporter system user - ansible.builtin.user: + user: name: node_exporter system: yes shell: /usr/sbin/nologin create_home: no + become: yes -- name: Download and extract node_exporter binary - ansible.builtin.unarchive: - src: "https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz" - dest: /tmp +- name: Download node_exporter + get_url: + url: "https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz" + dest: /tmp/node_exporter.tar.gz + mode: '0644' + timeout: 60 + become: yes + +- name: Create temporary extraction directory + file: + path: /tmp/node_exporter_temp + state: directory + mode: '0755' + become: yes + +- name: Extract node_exporter + unarchive: + src: /tmp/node_exporter.tar.gz + dest: /tmp/node_exporter_temp remote_src: yes - creates: /usr/local/bin/node_exporter + creates: /tmp/node_exporter_temp/node_exporter-1.8.2.linux-amd64/node_exporter + become: yes - name: Install node_exporter binary - ansible.builtin.copy: - src: /tmp/node_exporter-1.8.2.linux-amd64/node_exporter + copy: + src: /tmp/node_exporter_temp/node_exporter-1.8.2.linux-amd64/node_exporter dest: /usr/local/bin/node_exporter owner: root group: root mode: '0755' remote_src: yes 
+ become: yes + notify: restart node_exporter + +- name: Clean up temporary files + file: + path: "{{ item }}" + state: absent + loop: + - /tmp/node_exporter.tar.gz + - /tmp/node_exporter_temp + become: yes - name: Create textfile collector directory - ansible.builtin.file: + file: path: /var/lib/node_exporter/textfile_collector state: directory owner: node_exporter group: node_exporter mode: '0755' + become: yes - name: Deploy node_exporter systemd service - ansible.builtin.copy: + copy: content: | [Unit] - Description=Node Exporter + Description=Prometheus Node Exporter + Documentation=https://github.com/prometheus/node_exporter After=network.target [Service] Type=simple User=node_exporter + Group=node_exporter ExecStart=/usr/local/bin/node_exporter \ --collector.systemd \ --collector.processes \ + --collector.cpu \ + --collector.meminfo \ + --collector.diskstats \ + --collector.netdev \ + --collector.filesystem \ + --collector.loadavg \ + --collector.time \ --collector.textfile.directory=/var/lib/node_exporter/textfile_collector \ - --web.listen-address=:9100 + --web.listen-address=0.0.0.0:9100 \ + --web.telemetry-path=/metrics Restart=always RestartSec=5 + # Security settings + NoNewPrivileges=yes + ProtectSystem=strict + ProtectHome=yes + PrivateTmp=yes + ProtectControlGroups=yes + ProtectKernelModules=yes + ProtectKernelTunables=yes + LockPersonality=yes + RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX + [Install] WantedBy=multi-user.target dest: /etc/systemd/system/node_exporter.service owner: root group: root mode: '0644' + become: yes + notify: restart node_exporter -- name: Reload systemd and start node_exporter - ansible.builtin.systemd: +- name: Start and enable node_exporter + systemd: name: node_exporter state: started enabled: yes daemon_reload: yes + become: yes + +- name: Wait for node_exporter to start + wait_for: + host: localhost + port: 9100 + timeout: 30 + state: started + delay: 5 + become: yes + +- name: Verify node_exporter is responding 
+ uri: + url: http://localhost:9100/metrics + status_code: 200 + timeout: 10 + register: node_exporter_check + become: yes + +- name: Show node_exporter status + debug: + msg: "Node Exporter is running and responding on port 9100" + when: node_exporter_check.status == 200 - name: Allow port 9100 in ufw (if enabled) - ansible.builtin.ufw: + ufw: rule: allow port: 9100 proto: tcp comment: "Prometheus Node Exporter" - when: ansible_facts.services["ufw.service"] is defined and ansible_facts.services["ufw.service"]["state"] == "running" \ No newline at end of file + when: + - ansible_facts.services["ufw.service"] is defined + - ansible_facts.services["ufw.service"]["state"] == "running" + become: yes \ No newline at end of file diff --git a/roles/grafana/files/vmagent.yaml b/roles/grafana/files/vmagent.yaml index 7087436..cc5a4aa 100644 --- a/roles/grafana/files/vmagent.yaml +++ b/roles/grafana/files/vmagent.yaml @@ -1,9 +1,60 @@ global: scrape_interval: 30s + external_labels: + cluster: 'olimp' + environment: 'production' scrape_configs: + # ========== Proxmox серверы ========== + - job_name: 'proxmox' + static_configs: + - targets: ['192.168.1.200:9223'] + metrics_path: /pve + params: + module: [default] + scrape_interval: 30s + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: '192.168.1.200:9223' + + # ========== Node Exporter со всех серверов ========== - job_name: 'node' static_configs: -{%- for host in groups['all'] %} - - targets: ['{{ hostvars[host].int_ip }}:9100'] -{%- endfor %} \ No newline at end of file + - targets: + - '192.168.1.200:9100' # Proxmox + - '192.168.1.201:9100' # Gateway + - '192.168.1.202:9100' # Data + - '192.168.1.203:9100' # Media + - '192.168.1.204:9100' # Photo + - '192.168.1.205:9100' # Nextcloud + - '192.168.1.206:9100' # Talk + - '192.168.1.207:9100' # Games + - '192.168.1.228:9100' # Manage + scrape_interval: 30s + relabel_configs: + - source_labels: [__address__] + target_label: instance + 
regex: '(.*):9100' + replacement: '${1}' + + # ========== Self-monitoring vmagent ========== + - job_name: 'vmagent' + static_configs: + - targets: ['vmagent:8429'] + scrape_interval: 30s + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: 'vmagent' + + # ========== VictoriaMetrics self-monitoring ========== + - job_name: 'victoriametrics' + static_configs: + - targets: ['victoriametrics:8428'] + scrape_interval: 30s + metrics_path: /metrics + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: 'victoriametrics' \ No newline at end of file diff --git a/roles/grafana/templates/docker-compose.yml.j2 b/roles/grafana/templates/docker-compose.yml.j2 index 955ad0f..be5656d 100644 --- a/roles/grafana/templates/docker-compose.yml.j2 +++ b/roles/grafana/templates/docker-compose.yml.j2 @@ -24,9 +24,12 @@ services: - '-promscrape.config=/config/vmagent.yaml' - '-remoteWrite.tmpDataPath=/tmpData' - '-remoteWrite.url=http://victoriametrics:8428/api/v1/write' + - '-promscrape.suppressScrapeErrors=false' volumes: - {{ grafana_vmagent_config }}:/config/vmagent.yaml:ro - {{ grafana_vmagent_tmp_dir }}:/tmpData + ports: + - "8429:8429" # Для мониторинга самого vmagent depends_on: - victoriametrics networks: diff --git a/roles/proxmox_base_setup/tasks/main.yml b/roles/proxmox_base_setup/tasks/main.yml index ea1c8db..d3ee998 100644 --- a/roles/proxmox_base_setup/tasks/main.yml +++ b/roles/proxmox_base_setup/tasks/main.yml @@ -149,154 +149,4 @@ - { key: 'PermitRootLogin', value: 'prohibit-password' } - { key: 'PubkeyAuthentication', value: 'yes' } - { key: 'X11Forwarding', value: 'no' } - notify: restart ssh - -# ========== Node Exporter ========== -- name: Create node_exporter user - user: - name: node_exporter - system: yes - shell: /usr/sbin/nologin - create_home: no - -- name: Download node_exporter - get_url: - url: 
"https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz" - dest: /tmp/node_exporter-1.8.2.linux-amd64.tar.gz - checksum: "sha256:https://github.com/prometheus/node_exporter/releases/download/v1.8.2/sha256sums.txt" - mode: '0644' - timeout: 60 - -- name: Extract node_exporter - unarchive: - src: /tmp/node_exporter-1.8.2.linux-amd64.tar.gz - dest: /tmp/ - remote_src: yes - creates: /tmp/node_exporter-1.8.2.linux-amd64/node_exporter - -- name: Install node_exporter binary - copy: - src: /tmp/node_exporter-1.8.2.linux-amd64/node_exporter - dest: /usr/local/bin/node_exporter - owner: root - group: root - mode: '0755' - remote_src: yes - notify: restart node_exporter - -- name: Create textfile_collector directory - file: - path: /var/lib/node_exporter/textfile_collector - state: directory - owner: node_exporter - group: node_exporter - mode: '0755' - -- name: Deploy node_exporter systemd service - template: - src: node_exporter.service.j2 - dest: /etc/systemd/system/node_exporter.service - owner: root - group: root - mode: '0644' - notify: restart node_exporter - -# ========== storcli — проверка и сбор метрик (без установки) ========== -- name: Detect MegaRAID controller - command: "lspci -d 1000:" - register: lspci_megaraid - ignore_errors: yes - changed_when: false - -- name: Check storcli binary exists - stat: - path: /opt/MegaRAID/storcli/storcli64 - register: storcli_bin - when: lspci_megaraid.rc == 0 - -- name: Ensure storcli symlink in PATH - file: - src: /opt/MegaRAID/storcli/storcli64 - dest: /usr/local/bin/storcli - state: link - force: yes - when: - - lspci_megaraid.rc == 0 - - storcli_bin.stat.exists - -- name: Verify storcli functionality - command: storcli /call show - register: storcli_test - changed_when: false - failed_when: - - storcli_test.rc != 0 - - "'Controller' not in storcli_test.stdout" - when: - - lspci_megaraid.rc == 0 - - storcli_bin.stat.exists - -- name: Set fact — storcli is available - 
set_fact: - storcli_available: true - when: - - lspci_megaraid.rc == 0 - - storcli_bin.stat.exists - - storcli_test is success - -- name: Set fact — storcli NOT available - set_fact: - storcli_available: false - when: not (lspci_megaraid.rc == 0 and storcli_bin.stat.exists) - -# Сбор метрик ТОЛЬКО при наличии storcli -- name: Deploy storcli metrics script - template: - src: storcli_metrics.sh.j2 - dest: /opt/scripts/storcli_metrics.sh - owner: root - group: root - mode: '0755' - when: storcli_available | default(false) - -- name: Deploy storcli_metrics systemd units - block: - - name: Create storcli_metrics.service - copy: - content: | - [Unit] - Description=Collect RAID/disk metrics via storcli - After=network.target - [Service] - Type=oneshot - ExecStart=/opt/scripts/storcli_metrics.sh - User=root - StandardOutput=journal - StandardError=journal - dest: /etc/systemd/system/storcli_metrics.service - owner: root - mode: '0644' - - - name: Create storcli_metrics.timer (every 5 min) - copy: - content: | - [Unit] - Description=Run storcli metrics collector every 5 minutes - Requires=storcli_metrics.service - [Timer] - OnBootSec=60 - OnUnitActiveSec=5m - AccuracySec=1s - [Install] - WantedBy=timers.target - dest: /etc/systemd/system/storcli_metrics.timer - owner: root - mode: '0644' - - - name: Enable & start storcli_metrics.timer - systemd: - name: storcli_metrics.timer - state: started - enabled: yes - daemon_reload: yes - when: storcli_available | default(false) - notify: restart storcli_metrics \ No newline at end of file + notify: restart ssh \ No newline at end of file diff --git a/roles/proxmox_monitoring/handlers/main.yml b/roles/proxmox_monitoring/handlers/main.yml new file mode 100644 index 0000000..c776afc --- /dev/null +++ b/roles/proxmox_monitoring/handlers/main.yml @@ -0,0 +1,19 @@ +--- +- name: restart node_exporter + systemd: + name: node_exporter + state: restarted + daemon_reload: yes + enabled: yes + +- name: restart pve_exporter + systemd: + name: 
pve_exporter + state: restarted + daemon_reload: yes + +- name: restart storcli_metrics + systemd: + name: storcli_metrics.timer + state: restarted + daemon_reload: yes \ No newline at end of file diff --git a/roles/proxmox_monitoring/tasks/main.yml b/roles/proxmox_monitoring/tasks/main.yml new file mode 100644 index 0000000..e2d99f0 --- /dev/null +++ b/roles/proxmox_monitoring/tasks/main.yml @@ -0,0 +1,306 @@ +--- +- name: Update package cache + apt: + update_cache: yes + cache_valid_time: 86400 + become: yes + +- name: Install monitoring dependencies + apt: + name: + - python3 + - python3-pip + - python3-venv + - curl + - wget + - jq + - smartmontools + state: present + become: yes + +# ========== 1. Установка storcli с проверкой ========== +- name: Check if storcli is already installed + stat: + path: /opt/MegaRAID/storcli/storcli64 + register: storcli_installed + become: yes + +- name: Download storcli + get_url: + url: https://docs.broadcom.com/docs-and-downloads/raid-controllers/raid-controllers-common-files/storcli_1.24.02-1_all.deb + dest: /tmp/storcli.deb + mode: '0644' + when: not storcli_installed.stat.exists + become: yes + +- name: Install storcli + apt: + deb: /tmp/storcli.deb + state: present + when: not storcli_installed.stat.exists + become: yes + +- name: Create storcli symlink + file: + src: /opt/MegaRAID/storcli/storcli64 + dest: /usr/local/bin/storcli + state: link + force: yes + become: yes + +- name: Verify storcli works + command: /opt/MegaRAID/storcli/storcli64 /c0 show + register: storcli_test + changed_when: false + failed_when: false + become: yes + +- name: Set fact — storcli is available + set_fact: + storcli_available: true + when: storcli_test.rc == 0 + +# ========== 2. 
Node Exporter ========== +- name: Create node_exporter user + user: + name: node_exporter + system: yes + shell: /usr/sbin/nologin + create_home: no + become: yes + +- name: Download node_exporter + get_url: + url: "https://github.com/prometheus/node_exporter/releases/download/v1.8.2/node_exporter-1.8.2.linux-amd64.tar.gz" + dest: /tmp/node_exporter-1.8.2.linux-amd64.tar.gz + checksum: "sha256:https://github.com/prometheus/node_exporter/releases/download/v1.8.2/sha256sums.txt" + mode: '0644' + timeout: 60 + become: yes + +- name: Extract node_exporter + unarchive: + src: /tmp/node_exporter-1.8.2.linux-amd64.tar.gz + dest: /tmp/ + remote_src: yes + creates: /tmp/node_exporter-1.8.2.linux-amd64/node_exporter + become: yes + +- name: Install node_exporter binary + copy: + src: /tmp/node_exporter-1.8.2.linux-amd64/node_exporter + dest: /usr/local/bin/node_exporter + owner: root + group: root + mode: '0755' + remote_src: yes + become: yes + notify: restart node_exporter + +- name: Create textfile_collector directory + file: + path: /var/lib/node_exporter/textfile_collector + state: directory + owner: node_exporter + group: node_exporter + mode: '0755' + become: yes + +- name: Deploy node_exporter systemd service + template: + src: node_exporter.service.j2 + dest: /etc/systemd/system/node_exporter.service + owner: root + group: root + mode: '0644' + become: yes + notify: restart node_exporter + +# ========== 3. 
PVE Exporter ========== +- name: Create pve_exporter user + user: + name: pve_exporter + system: yes + shell: /usr/sbin/nologin + create_home: no + become: yes + +- name: Create pve_exporter directories + file: + path: "{{ item }}" + state: directory + owner: pve_exporter + group: pve_exporter + mode: '0755' + loop: + - /opt/pve_exporter + - /opt/pve_exporter/config + become: yes + +- name: Create Python virtual environment + command: + cmd: python3 -m venv /opt/pve_exporter/venv + creates: /opt/pve_exporter/venv/bin/python + become: yes + become_user: pve_exporter + environment: + HOME: /opt/pve_exporter + +- name: Upgrade pip in virtual environment + command: + cmd: /opt/pve_exporter/venv/bin/pip install --upgrade pip + chdir: /opt/pve_exporter + become: yes + become_user: pve_exporter + environment: + HOME: /opt/pve_exporter + +- name: Install prometheus-pve-exporter package + command: + cmd: /opt/pve_exporter/venv/bin/pip install prometheus-pve-exporter + chdir: /opt/pve_exporter + become: yes + become_user: pve_exporter + environment: + HOME: /opt/pve_exporter + +- name: Deploy pve_exporter config + template: + src: pve_exporter_config.yml.j2 + dest: /opt/pve_exporter/config/config.yml + owner: pve_exporter + group: pve_exporter + mode: '0600' + become: yes + +- name: Create pve_exporter systemd service + copy: + content: | + [Unit] + Description=Proxmox VE Exporter + After=network.target + + [Service] + Type=simple + User=pve_exporter + WorkingDirectory=/opt/pve_exporter + Environment="HOME=/opt/pve_exporter" + ExecStart=/opt/pve_exporter/venv/bin/pve_exporter \ + --web.listen-address=0.0.0.0:9223 \ + --config.file=/opt/pve_exporter/config/config.yml + Restart=always + RestartSec=10 + StandardOutput=journal + StandardError=journal + SyslogIdentifier=pve_exporter + + [Install] + WantedBy=multi-user.target + dest: /etc/systemd/system/pve_exporter.service + mode: '0644' + become: yes + notify: restart pve_exporter + +# ========== 4. 
StorCLI Metrics ========== +- name: Deploy storcli metrics script + template: + src: storcli_metrics.sh.j2 + dest: /opt/scripts/storcli_metrics.sh + owner: root + group: root + mode: '0755' + when: storcli_available | default(false) + become: yes + +- name: Deploy storcli_metrics systemd units + block: + - name: Create storcli_metrics.service + copy: + content: | + [Unit] + Description=Collect RAID/disk metrics via storcli + After=network.target + [Service] + Type=oneshot + ExecStart=/opt/scripts/storcli_metrics.sh + User=root + StandardOutput=journal + StandardError=journal + dest: /etc/systemd/system/storcli_metrics.service + owner: root + mode: '0644' + + - name: Create storcli_metrics.timer (every 5 min) + copy: + content: | + [Unit] + Description=Run storcli metrics collector every 5 minutes + Requires=storcli_metrics.service + [Timer] + OnBootSec=60 + OnUnitActiveSec=5m + AccuracySec=1s + [Install] + WantedBy=timers.target + dest: /etc/systemd/system/storcli_metrics.timer + owner: root + mode: '0644' + + - name: Enable & start storcli_metrics.timer + systemd: + name: storcli_metrics.timer + state: started + enabled: yes + daemon_reload: yes + when: storcli_available | default(false) + become: yes + notify: restart storcli_metrics + +# ========== 5. 
Запуск и проверка сервисов ========== +- name: Start and enable all services + systemd: + name: "{{ item }}" + state: started + enabled: yes + daemon_reload: yes + loop: + - node_exporter + - pve_exporter + become: yes + +- name: Wait for services to initialize + wait_for: + host: localhost + port: "{{ item.port }}" + timeout: 30 + state: started + delay: 5 + loop: + - { port: 9100, service: "node_exporter" } + - { port: 9223, service: "pve_exporter" } + become: yes + failed_when: false + +- name: Verify services are responding + uri: + url: "http://localhost:{{ item.port }}/metrics" + status_code: 200 + timeout: 10 + loop: + - { port: 9100, service: "node_exporter" } + - { port: 9223, service: "pve_exporter" } + register: service_checks + become: yes + +- name: Show service status + debug: + msg: "{{ item.item.service }} - {{ item.status }}" + loop: "{{ service_checks.results }}" + loop_control: + label: "{{ item.item.service }}" + +- name: Run initial storcli metrics collection + command: /opt/scripts/storcli_metrics.sh + when: storcli_available | default(false) + become: yes + changed_when: false \ No newline at end of file diff --git a/roles/proxmox_monitoring/templates/node_exporter.service.j2 b/roles/proxmox_monitoring/templates/node_exporter.service.j2 new file mode 100644 index 0000000..e354645 --- /dev/null +++ b/roles/proxmox_monitoring/templates/node_exporter.service.j2 @@ -0,0 +1,38 @@ +[Unit] +Description=Prometheus Node Exporter +Documentation=https://github.com/prometheus/node_exporter +After=network.target + +[Service] +Type=simple +User=node_exporter +Group=node_exporter +ExecStart=/usr/local/bin/node_exporter \ + --collector.systemd \ + --collector.processes \ + --collector.cpu \ + --collector.meminfo \ + --collector.diskstats \ + --collector.netdev \ + --collector.filesystem \ + --collector.loadavg \ + --collector.time \ + --collector.textfile.directory=/var/lib/node_exporter/textfile_collector \ + --web.listen-address=0.0.0.0:9100 \ + 
--web.telemetry-path=/metrics +Restart=always +RestartSec=5 + +# Безопасность (Proxmox-совместимо) +NoNewPrivileges=yes +ProtectSystem=strict +ProtectHome=yes +PrivateTmp=yes +ProtectControlGroups=yes +ProtectKernelModules=yes +ProtectKernelTunables=yes +LockPersonality=yes +RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/roles/proxmox_monitoring/templates/pve_exporter_config.yml.j2 b/roles/proxmox_monitoring/templates/pve_exporter_config.yml.j2 new file mode 100644 index 0000000..b375360 --- /dev/null +++ b/roles/proxmox_monitoring/templates/pve_exporter_config.yml.j2 @@ -0,0 +1,5 @@ +default: + username: "{{ pve_exporter_user }}" + token_name: "{{ pve_exporter_token_name }}" + token_value: "{{ pve_exporter_token_value }}" + verify_ssl: false \ No newline at end of file diff --git a/roles/proxmox_monitoring/templates/storcli_metrics.sh.j2 b/roles/proxmox_monitoring/templates/storcli_metrics.sh.j2 new file mode 100644 index 0000000..14630ee --- /dev/null +++ b/roles/proxmox_monitoring/templates/storcli_metrics.sh.j2 @@ -0,0 +1,78 @@ +#!/bin/bash + +OUT_FILE="/var/lib/node_exporter/textfile_collector/storcli.prom" + +# Очищаем файл +> "$OUT_FILE" + +# Заголовки метрик +echo "# HELP storcli_disk_temp Temperature of physical disk in Celsius" >> "$OUT_FILE" +echo "# TYPE storcli_disk_temp gauge" >> "$OUT_FILE" +echo "# HELP storcli_disk_state Disk state: 0=Offline,1=UGood,2=Online,3=Failed,4=Rebuild" >> "$OUT_FILE" +echo "# TYPE storcli_disk_state gauge" >> "$OUT_FILE" +echo "# HELP storcli_disk_media_error_count Media error count" >> "$OUT_FILE" +echo "# TYPE storcli_disk_media_error_count counter" >> "$OUT_FILE" +echo "# HELP storcli_disk_other_error_count Other error count" >> "$OUT_FILE" +echo "# TYPE storcli_disk_other_error_count counter" >> "$OUT_FILE" +echo "# HELP storcli_array_state Virtual drive state: 0=Offline,1=Online,2=Degraded,3=Failed" >> "$OUT_FILE" +echo "# TYPE 
storcli_array_state gauge" >> "$OUT_FILE" + +# Состояния дисков из общей таблицы +/opt/MegaRAID/storcli/storcli64 /c0/eall/sall show all | awk ' +/^252:[0-9]+\s+/ { + split($1, parts, ":"); + slot = parts[2]; + did = $2; + state = $3; + + state_num = 2; # по умолчанию Online + if (state == "Offln") state_num = 0; + else if (state == "UGood") state_num = 1; + else if (state == "Failed") state_num = 3; + else if (state == "Rebuild") state_num = 4; + + printf "storcli_disk_state{controller=\"0\",enclosure=\"252\",slot=\"%s\",did=\"%s\",model=\"ST8000NM0075\"} %s\n", slot, did, state_num +} +' >> "$OUT_FILE" + +# Температуры и ошибки для каждого диска +for slot in 0 1 2 3; do + echo "Processing disk slot $slot..." + disk_info=$(/opt/MegaRAID/storcli/storcli64 /c0/e252/s$slot show all) + + # Температура + temp=$(echo "$disk_info" | grep "Drive Temperature" | grep -oE '[0-9]+' | head -1) + if [ -n "$temp" ]; then + echo "storcli_disk_temp{controller=\"0\",enclosure=\"252\",slot=\"$slot\",model=\"ST8000NM0075\"} $temp" >> "$OUT_FILE" + echo " Temperature: $temp°C" + fi + + # Ошибки + media_line=$(echo "$disk_info" | grep "Media Error Count") + other_line=$(echo "$disk_info" | grep "Other Error Count") + + # Извлекаем числа после "=" + media_errors=$(echo "$media_line" | awk -F= '{print $2}' | awk '{print $1}') + other_errors=$(echo "$other_line" | awk -F= '{print $2}' | awk '{print $1}') + + # Записываем ошибки (0 по умолчанию) + echo "storcli_disk_media_error_count{controller=\"0\",enclosure=\"252\",slot=\"$slot\",model=\"ST8000NM0075\"} ${media_errors:-0}" >> "$OUT_FILE" + echo "storcli_disk_other_error_count{controller=\"0\",enclosure=\"252\",slot=\"$slot\",model=\"ST8000NM0075\"} ${other_errors:-0}" >> "$OUT_FILE" + + echo " Media errors: ${media_errors:-0}, Other errors: ${other_errors:-0}" +done + +# Состояние массива +array_line=$(/opt/MegaRAID/storcli/storcli64 /c0/vall show | grep "^0/0") +array_state=$(echo "$array_line" | awk '{print $3}') +state_num=1 # по 
умолчанию Optimal +[[ "$array_state" == "Dgrd" ]] && state_num=2 +[[ "$array_state" == "Offln" ]] && state_num=0 +[[ "$array_state" == "Pdgd" ]] && state_num=0 +[[ "$array_state" == "Ft" ]] && state_num=3 + +echo "storcli_array_state{controller=\"0\",vd=\"0\"} $state_num" >> "$OUT_FILE" +echo "Array state: $array_state ($state_num)" + +chmod 644 "$OUT_FILE" +echo "$(date -Iseconds) storcli metrics updated successfully"