Added automatic cluster restart via the Patroni API
This commit is contained in:
parent 8242af3182
commit d325e6019a
@@ -10,6 +10,7 @@
- Checking the cluster state after changes
- Notification when nodes need to be restarted (if required)
- Automatic management of the configuration file history
- Automatic cluster restart (if required), enabled in the role settings

### Requirements
- Ansible 2.9+
@@ -22,7 +23,9 @@

- config_dir (default: "/ansible/history") - directory for storing the configuration history
- config_file (default: "/ansible/patroni_config.yaml") - path to the file with configuration changes
- patroni_host (default: "10.14.0.180") - Patroni cluster host
- patroni_host (default: "127.0.0.1") - Patroni cluster host
- patroni_api_port (default: "8008") - Patroni cluster API port
- autorestart (default: "false") - enables automatic cluster restart when the changed settings require one (see the example below)

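For reference, these variables can be overridden in one place. A minimal sketch of a vars file that enables the automatic restart; the `group_vars/all.yaml` location is an assumption, only the variable names and defaults come from this README:

```yaml
# group_vars/all.yaml - hypothetical location; variable names are from the role
config_dir: "/ansible/history"
config_file: "/ansible/patroni_config.yaml"
patroni_host: "127.0.0.1"
patroni_api_port: "8008"
autorestart: true  # restart the cluster automatically when changed settings require it
```
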
### How do I change the cluster config?
1. First, create a new branch named after the cluster.

@@ -1,5 +1,82 @@
#loop_wait: 10
#master_start_timeout: 300
#maximum_lag_on_failover: 1048576
postgresql:
  parameters:
    max_connections: 300
    shared_buffers: "12GB"
  use_pg_rewind: true
  parameters:
# archive_command: pgbackrest --stanza=sandbox-postgres-cluster archive-push %p || cd .
# archive_mode: true
# archive_timeout: 1800s
# auto_explain.log_analyze: true
# auto_explain.log_buffers: true
# auto_explain.log_min_duration: 10s
# auto_explain.log_nested_statements: true
# auto_explain.log_timing: false
# auto_explain.log_triggers: true
# auto_explain.log_verbose: true
# autovacuum: true
# autovacuum_analyze_scale_factor: 0.02
# autovacuum_max_workers: 8
# autovacuum_naptime: 1s
# autovacuum_vacuum_cost_delay: 20
# autovacuum_vacuum_cost_limit: 200
# autovacuum_vacuum_scale_factor: 0.01
# checkpoint_completion_target: 0.9
# checkpoint_timeout: 15min
# default_statistics_target: 100
# effective_cache_size: 12GB
# effective_io_concurrency: 200
# hot_standby: true
# huge_pages: try
# log_checkpoints: true
# log_directory: /var/log/postgresql
# log_filename: postgresql-%a.log
# log_line_prefix: '%t [%p-%l] %r %q%u@%d '
# log_lock_waits: true
# log_rotation_age: 1d
# log_rotation_size: 0
# log_temp_files: 0
# log_timezone: Europe/Moscow
# log_truncate_on_rotation: true
# logging_collector: true
# maintenance_work_mem: 1GB
max_connections: 100
# max_files_per_process: 4096
# max_locks_per_transaction: 64
# max_parallel_maintenance_workers: 4
# max_parallel_workers: 8
# max_parallel_workers_per_gather: 4
# max_prepared_transactions: 0
# max_replication_slots: 10
# max_wal_senders: 10
# max_wal_size: 8GB
# max_worker_processes: 8
# min_wal_size: 2GB
# pg_stat_statements.max: 10000
# pg_stat_statements.save: false
# pg_stat_statements.track: all
# random_page_cost: 1.1
# seq_page_cost: 1
# shared_buffers: 4GB
# shared_preload_libraries: pg_stat_statements,auto_explain,pg_cron,pg_partman_bgw,redis_fdw
# superuser_reserved_connections: 5
# synchronous_commit: true
# timezone: Europe/Moscow
# track_activities: true
# track_counts: true
# track_functions: all
# track_io_timing: true
# wal_buffers: 16MB
# wal_keep_segments: 1000
# wal_level: replica
# wal_log_hints: true
# work_mem: 10485kB
# recovery_conf:
# restore_command: pgbackrest --stanza=sandbox-postgres-cluster archive-get %f %p
# use_pg_rewind: true
#retry_timeout: 10
#synchronous_mode: true
#synchronous_mode_strict: true
#synchronous_node_count: 1
#ttl: 100

@@ -1,4 +1,4 @@
---
collections:
  - name: maxhoesel.proxmox
    version: 5.0.1
#  - name: maxhoesel.proxmox
#    version: 5.0.1

@@ -1,4 +1,21 @@
---
- name: Log cleanup results
# handlers/main.yml
- name: Verify config application
  ansible.builtin.uri:
    url: "http://{{ patroni_host }}:{{ patroni_api_port }}/config"
    method: GET
    return_content: yes
    status_code: 200
  register: config_verification
  delegate_to: localhost
  listen: "config applied"

- name: Log config changes
  ansible.builtin.debug:
    msg: "Removed {{ (old_configs.files | sort(attribute='mtime'))[:-10] | length }} old config files"
    msg: "Configuration applied successfully. New parameters: {{ config_verification.json | to_nice_json }}"
  listen: "config applied"

- name: Log cleanup
  ansible.builtin.debug:
    msg: "Old configuration files removed; the last 10 versions were kept"
  listen: "Log cleanup"

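The `Verify config application` handler above reads back Patroni's dynamic configuration. As orientation, a trimmed sketch of a `GET /config` response (shown as YAML; the API returns JSON, and the values here are illustrative, mirroring the sample config in this commit):

```yaml
# Illustrative GET /config response body
loop_wait: 10
retry_timeout: 10
postgresql:
  use_pg_rewind: true
  parameters:
    max_connections: 300
    shared_buffers: 12GB
```
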
@@ -1,7 +1,8 @@
---
- name: Apply new configuration
# tasks/main.yml
- name: Apply new Patroni configuration
  ansible.builtin.uri:
    url: "http://{{ patroni_host }}:8008/config"
    url: "http://{{ patroni_host }}:{{ patroni_api_port }}/config"
    method: PATCH
    body: "{{ new_config | to_json }}"
    body_format: json
@@ -10,103 +11,196 @@
      Content-Type: "application/json"
  register: apply_result
  changed_when: apply_result.status == 200
  notify: "config applied"

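The PATCH body is a partial dynamic-configuration document: only the keys being changed need to be present. A minimal sketch of what `new_config` (loaded from `patroni_config.yaml`) might expand to, with values borrowed from the sample config in this commit:

```yaml
# Hypothetical new_config payload, serialized to JSON for PATCH /config
postgresql:
  parameters:
    max_connections: 300
    shared_buffers: "12GB"
```
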
- name: Force wait for config to apply # noqa: no-handler
- name: Wait for config propagation # noqa: no-handler
  ansible.builtin.wait_for:
    timeout: 30
    delay: 5
  when: apply_result is changed

- name: Get verified cluster status # noqa: no-handler
  ansible.builtin.uri:
    url: "http://{{ patroni_host }}:8008/cluster"
    method: GET
    return_content: yes
    status_code: 200
  register: verified_cluster_status
  delegate_to: localhost
  connection: local
- name: Check for pending restarts # noqa: no-handler
  when: apply_result is changed

- name: Display confirmed cluster status
  ansible.builtin.debug:
    msg: |
      === CONFIRMED CLUSTER STATUS ===
      Leader: {{ (verified_cluster_status.json.members | selectattr('role', 'equalto', 'leader') | map(attribute='name') | first) | default('UNKNOWN') }}
      Members:
      {% for member in verified_cluster_status.json.members %}
      - {{ member.name }} [{{ member.role | upper }}]
        State: {{ member.state | default('UNKNOWN') }}
        Lag: {{ member.lag | default(0) }}MB
        Timeline: {{ member.timeline | default('N/A') }}
        Pending restart: {{ member.pending_restart | default(false) | ternary('YES', 'NO') }}
      {% endfor %}
      Config Applied: {{ apply_result is changed | ternary('YES', 'NO') }}
      ================================
  delegate_to: localhost
  connection: local
  run_once: true
  block:
    - name: Get cluster status with retry
      ansible.builtin.uri:
        url: "http://{{ patroni_host }}:{{ patroni_api_port }}/cluster"
        method: GET
        return_content: yes
        status_code: 200
      register: cluster_status
      until: cluster_status.json is defined
      retries: 3
      delay: 2
      delegate_to: localhost
      run_once: true

    - name: Refresh cluster status
      ansible.builtin.uri:
        url: "http://{{ patroni_host }}:8008/cluster"
        method: GET
        return_content: yes
        status_code: 200
      register: refreshed_cluster_status
      delegate_to: localhost
      run_once: true
      when: verified_cluster_status is defined
    - name: Check restart flags
      ansible.builtin.set_fact:
        needs_restart: >-
          {{
            (cluster_status.json.members |
             map(attribute='pending_restart', default=false) |
             select('equalto', true) | list | length > 0) or
            (cluster_status.json.members |
             map(attribute='tags.pending_restart', default=false) |
             select('equalto', true) | list | length > 0)
          }}
        node_names: "{{ cluster_status.json.members | map(attribute='name') | list }}"
        node_info: >-
          {% set info = {} %}
          {% for member in cluster_status.json.members %}
          {% set _ = info.update({member.name: {'role': member.role}}) %}
          {% endfor %}
          {{ info }}
      run_once: true
  rescue:
    - name: Set no restart needed
      ansible.builtin.set_fact:
        needs_restart: false
      run_once: true

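Both pending-restart checks walk the member list returned by Patroni's `GET /cluster`. A trimmed, illustrative response showing the two places the filters look (`pending_restart` on the member itself and under `tags`); node names and values are made up:

```yaml
# Illustrative /cluster response (as YAML; the API returns JSON)
members:
  - name: node1
    role: leader
    state: running
    pending_restart: true   # set by Patroni when a changed parameter needs a restart
  - name: node2
    role: replica
    state: streaming
    tags:
      pending_restart: true
```
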
- name: Safe check for pending restarts
  ansible.builtin.set_fact:
    needs_restart: >-
      {{
        (refreshed_cluster_status.json.members |
         map(attribute='pending_restart', default=false) |
         select('equalto', true) | list | count > 0) or
        (refreshed_cluster_status.json.members |
         map(attribute='tags.pending_restart', default=false) |
         select('equalto', true) | list | count > 0)
      }}
    node_names: >-
      {{
        refreshed_cluster_status.json.members |
        map(attribute='name') |
        list
      }}
  when:
    - refreshed_cluster_status.json is defined
    - refreshed_cluster_status.json.members is defined
  run_once: true
  delegate_to: localhost

- name: Show restart warning if needed
- name: Display restart warning
  ansible.builtin.debug:
    msg: |
      {% if needs_restart %}
      ==================================
      WARNING: RESTART REQUIRED
      ==================================
      {% if autorestart %}
      ================================================
      WARNING: AUTOMATIC CLUSTER RESTART
      ================================================
      The following nodes will be restarted:
      {% for node in node_names %}
      - {{ node }} ({{ node_info[node].role | default('UNKNOWN') }})
      {% endfor %}

      Well, I could do this from the role myself, but what if the cluster is in production or somewhere like that!!!
      So it is safer to run the following command on one of the cluster nodes:
      To cancel, press Ctrl+C within 10 seconds
      {% else %}
      ============================================
      WARNING: MANUAL CLUSTER RESTART REQUIRED
      ============================================
      Run on one of the nodes:

      patronictl restart {{ node_names | join(' ') }}
      patronictl restart -c /etc/patroni.yml {{ node_names | join(' ') }}

      Affected nodes:
      {% for node in node_names %}
      - {{ node }}
      {% endfor %}
      Nodes to restart:
      {% for node in node_names %}
      - {{ node }} ({{ node_info[node].role | default('UNKNOWN') }})
      {% endfor %}
      {% endif %}
      {% else %}
      ==================================
      STATUS: No restart required
      ==================================
      ================================
      NO RESTART REQUIRED
      ================================
      {% endif %}
  delegate_to: localhost
  run_once: true
  when:
    - needs_restart is defined
    - node_names is defined
    - node_info is defined

- name: Archive old configurations
- name: Confirm automatic restart
  ansible.builtin.pause:
    prompt: "Confirm automatic cluster restart (Enter to continue, Ctrl+C to cancel)"
    seconds: 10
  when:
    - needs_restart | default(false)
    - autorestart | default(false)
  delegate_to: localhost
  run_once: true

- name: Execute cluster restart
  when:
    - needs_restart | default(false)
    - autorestart | bool
    - cluster_status is defined
    - cluster_status.json is defined
    - cluster_status.json.members is defined
  run_once: true
  block:
    - name: Find nodes needing restart
      ansible.builtin.set_fact:
        nodes_to_restart: >-
          {% set nodes = [] %}{% for member in cluster_status.json.members %}{% if member.pending_restart is defined and member.pending_restart or member.tags.pending_restart is defined and member.tags.pending_restart %}{% set _ = nodes.append(member) %}{% endif %}{% endfor %}{{ nodes }}

    - name: Restart nodes via API
      ansible.builtin.uri:
        url: "http://{{ item.host }}:{{ patroni_api_port }}/restart"
        method: POST
        body_format: json
        body:
          restart_pending: true
        timeout: 60
        status_code: [200, 503]
      loop: "{{ nodes_to_restart | default([]) }}"
      loop_control:
        label: "{{ item.name }}"
      register: restart_results
      ignore_errors: yes
      changed_when: >
        restart_results.status == 200 or
        restart_results.status == 503

    - name: Wait for cluster stabilization
      block:
        - name: Check cluster status until stable
          ansible.builtin.uri:
            url: "http://{{ patroni_host }}:{{ patroni_api_port }}/cluster"
            method: GET
            return_content: yes
            status_code: 200
          register: cluster_health
          until: >
            cluster_health.json.members |
            selectattr('state', 'match', '^(running|streaming)$') |
            list | length == cluster_health.json.members | length
          retries: 12
          delay: 10
          delegate_to: localhost
          run_once: true

    - name: Show restart results
      ansible.builtin.debug:
        msg: |
          ========================
          RESTART RESULTS
          ========================
          Node: {{ item.item.name }}
          Role: {{ item.item.role }}
          Status: {% if item.status == 200 %}Restarted successfully{% elif item.status == 503 %}Restart in progress{% else %}Error (code {{ item.status }}){% endif %}
          Elapsed: {{ item.elapsed }} s
          {% if item.item.pending_restart_reason is defined %}
          Restart reason:
          {% for param, values in item.item.pending_restart_reason.items() %}
          - {{ param }}: was {{ values.old_value }}, now {{ values.new_value }}
          {% endfor %}
          {% endif %}
          ------------------------
      loop: "{{ restart_results.results | default([]) }}"
      loop_control:
        label: ""
      run_once: true

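For context, each entry of `restart_results.results` consumed by `Show restart results` above combines the `uri` response fields with the looped member in `item`. An illustrative entry; the values are made up, and `pending_restart_reason` follows the old/new-value shape the template expects:

```yaml
# Hypothetical restart_results.results entry
- status: 200   # HTTP code from POST /restart
  elapsed: 4    # seconds the request took
  item:         # the cluster member processed in this loop iteration
    name: node1
    role: leader
    pending_restart_reason:
      shared_buffers:
        old_value: "4GB"
        new_value: "12GB"
```
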
- name: Archive old configurations # noqa: no-handler
  when: apply_result is changed
  run_once: true
  block:
    - name: Find old config files
      ansible.builtin.find:
@@ -117,14 +211,12 @@
      delegate_to: localhost
      connection: local

    - name: Remove excess configs (keep last 10)
    - name: Remove excess configs
      ansible.builtin.file:
        path: "{{ item.path }}"
        state: absent
      loop: "{{ (old_configs.files | sort(attribute='mtime'))[:-10] }}"
      when:
        - old_configs.matched > 10
        - apply_result is changed
      when: old_configs.matched > 10
      delegate_to: localhost
      connection: local
      notify: Log cleanup results
      notify: "Log cleanup"

@@ -1,5 +1,6 @@
---
- name: Prepare for cluster configuration changes
  import_playbook: prepare/deploy.yaml

- name: Apply cluster configuration changes
  import_playbook: apply/deploy.yaml

@@ -1,10 +1,13 @@
$ANSIBLE_VAULT;1.1;AES256
37376136623761343135636239653137353661303631663536613265366431333339663866643265
3033653765613632313661393166363238643137346330620a643233623433633963333035646466
34633366623262643165326331333937623064356131306663623362663663343861383735616365
3363646132393166310a353965346531616330396666383732656430633630323438326161323965
64323865636265303331663166393232376138663965613361623361303663353737623238373435
30316161616234356264643762653036626132613664316137646665323335663232393535353131
37636331646364313839653438323461353638363936623131626161353936303839393533326162
31623833313834646233303961656633633933386135396439373463623362316561313138643631
6663
37613833393263643830623437366465373832623162373161383334336162326635663538326537
3335386563373734636232356164636530393236353466610a366432353562343063376132643331
30656666326633616639383966386439663264306536396533343861656566343539376130343930
3932346663303035350a376233326363613763383139646262313531396466616635393166616435
30643637336364656432376436373161623438316165353534643135313831636565353638363734
61663964653362363533633664626435613738613538633761393231353435646463633661643839
61616239386133353964656133316463343036666234636132316334323865653937323830313065
34646633613736663362363631363131393439623137633162383235663938633237386439623562
61646233393030663464353864656362356138643635383561653063333839353139666432323765
37396633303231396631336264393032386561666534376635383962366365333934313734323632
62346631396162383438303434383031333662386132393434353832323631653533346363333534
36616639623533633639