Added automatic cluster restart via the Patroni API
This commit is contained in:
parent 8242af3182
commit d325e6019a
@@ -10,6 +10,7 @@
- Checking the cluster state after changes
- Notification when nodes need to be restarted (if required)
- Automatic management of the configuration file history
- Automatic cluster restart (if required), enabled in the role settings

### Requirements
- Ansible 2.9+
@@ -22,7 +23,9 @@

- config_dir (default: "/ansible/history") - directory for storing the configuration history
- config_file (default: "/ansible/patroni_config.yaml") - path to the file with configuration changes
- patroni_host (default: "10.14.0.180") - Patroni cluster host
- patroni_host (default: "127.0.0.1") - Patroni cluster host
- patroni_api_port (default: "8008") - Patroni cluster API port
- autorestart (default: "false") - enables automatic cluster restart when the changed settings require one (see the example below)

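For reference, these variables can be overridden in one place. A minimal sketch of a vars file that enables the automatic restart; the `group_vars/all.yaml` location is an assumption, only the variable names and defaults come from this README:

```yaml
# group_vars/all.yaml - hypothetical location; variable names are from the role
config_dir: "/ansible/history"
config_file: "/ansible/patroni_config.yaml"
patroni_host: "127.0.0.1"
patroni_api_port: "8008"
autorestart: true  # restart the cluster automatically when changed settings require it
```
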
### How do I change the cluster config?
1. First, create a new branch named after the cluster.

@@ -1,5 +1,82 @@
#loop_wait: 10
#master_start_timeout: 300
#maximum_lag_on_failover: 1048576
postgresql:
  parameters:
    max_connections: 300
    shared_buffers: "12GB"
  use_pg_rewind: true
  parameters:
# archive_command: pgbackrest --stanza=sandbox-postgres-cluster archive-push %p || cd .
# archive_mode: true
# archive_timeout: 1800s
# auto_explain.log_analyze: true
# auto_explain.log_buffers: true
# auto_explain.log_min_duration: 10s
# auto_explain.log_nested_statements: true
# auto_explain.log_timing: false
# auto_explain.log_triggers: true
# auto_explain.log_verbose: true
# autovacuum: true
# autovacuum_analyze_scale_factor: 0.02
# autovacuum_max_workers: 8
# autovacuum_naptime: 1s
# autovacuum_vacuum_cost_delay: 20
# autovacuum_vacuum_cost_limit: 200
# autovacuum_vacuum_scale_factor: 0.01
# checkpoint_completion_target: 0.9
# checkpoint_timeout: 15min
# default_statistics_target: 100
# effective_cache_size: 12GB
# effective_io_concurrency: 200
# hot_standby: true
# huge_pages: try
# log_checkpoints: true
# log_directory: /var/log/postgresql
# log_filename: postgresql-%a.log
# log_line_prefix: '%t [%p-%l] %r %q%u@%d '
# log_lock_waits: true
# log_rotation_age: 1d
# log_rotation_size: 0
# log_temp_files: 0
# log_timezone: Europe/Moscow
# log_truncate_on_rotation: true
# logging_collector: true
# maintenance_work_mem: 1GB
max_connections: 100
# max_files_per_process: 4096
# max_locks_per_transaction: 64
# max_parallel_maintenance_workers: 4
# max_parallel_workers: 8
# max_parallel_workers_per_gather: 4
# max_prepared_transactions: 0
# max_replication_slots: 10
# max_wal_senders: 10
# max_wal_size: 8GB
# max_worker_processes: 8
# min_wal_size: 2GB
# pg_stat_statements.max: 10000
# pg_stat_statements.save: false
# pg_stat_statements.track: all
# random_page_cost: 1.1
# seq_page_cost: 1
# shared_buffers: 4GB
# shared_preload_libraries: pg_stat_statements,auto_explain,pg_cron,pg_partman_bgw,redis_fdw
# superuser_reserved_connections: 5
# synchronous_commit: true
# timezone: Europe/Moscow
# track_activities: true
# track_counts: true
# track_functions: all
# track_io_timing: true
# wal_buffers: 16MB
# wal_keep_segments: 1000
# wal_level: replica
# wal_log_hints: true
# work_mem: 10485kB
# recovery_conf:
# restore_command: pgbackrest --stanza=sandbox-postgres-cluster archive-get %f %p
# use_pg_rewind: true
#retry_timeout: 10
#synchronous_mode: true
#synchronous_mode_strict: true
#synchronous_node_count: 1
#ttl: 100

@@ -1,4 +1,4 @@
---
collections:
  - name: maxhoesel.proxmox
    version: 5.0.1
#  - name: maxhoesel.proxmox
#    version: 5.0.1

@@ -1,4 +1,21 @@
---
- name: Log cleanup results
# handlers/main.yml
- name: Verify config application
  ansible.builtin.uri:
    url: "http://{{ patroni_host }}:{{ patroni_api_port }}/config"
    method: GET
    return_content: yes
    status_code: 200
  register: config_verification
  delegate_to: localhost
  listen: "config applied"

- name: Log config changes
  ansible.builtin.debug:
    msg: "Removed {{ (old_configs.files | sort(attribute='mtime'))[:-10] | length }} old config files"
    msg: "Configuration applied successfully. New parameters: {{ config_verification.json | to_nice_json }}"
  listen: "config applied"

- name: Log cleanup
  ansible.builtin.debug:
    msg: "Old configuration files removed; the last 10 versions were kept"
  listen: "Log cleanup"

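The `Verify config application` handler above reads back Patroni's dynamic configuration. As orientation, a trimmed sketch of a `GET /config` response (shown as YAML; the API returns JSON, and the values here are illustrative, mirroring the sample config in this commit):

```yaml
# Illustrative GET /config response body
loop_wait: 10
retry_timeout: 10
postgresql:
  use_pg_rewind: true
  parameters:
    max_connections: 300
    shared_buffers: 12GB
```
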
@@ -1,7 +1,8 @@
---
- name: Apply new configuration
# tasks/main.yml
- name: Apply new Patroni configuration
  ansible.builtin.uri:
    url: "http://{{ patroni_host }}:8008/config"
    url: "http://{{ patroni_host }}:{{ patroni_api_port }}/config"
    method: PATCH
    body: "{{ new_config | to_json }}"
    body_format: json
@@ -10,103 +11,196 @@
      Content-Type: "application/json"
  register: apply_result
  changed_when: apply_result.status == 200
  notify: "config applied"

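The PATCH body is a partial dynamic-configuration document: only the keys being changed need to be present. A minimal sketch of what `new_config` (loaded from `patroni_config.yaml`) might expand to, with values borrowed from the sample config in this commit:

```yaml
# Hypothetical new_config payload, serialized to JSON for PATCH /config
postgresql:
  parameters:
    max_connections: 300
    shared_buffers: "12GB"
```
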
- name: Force wait for config to apply # noqa: no-handler
- name: Wait for config propagation # noqa: no-handler
  ansible.builtin.wait_for:
    timeout: 30
    delay: 5
  when: apply_result is changed

- name: Get verified cluster status # noqa: no-handler
  ansible.builtin.uri:
    url: "http://{{ patroni_host }}:8008/cluster"
    method: GET
    return_content: yes
    status_code: 200
  register: verified_cluster_status
  delegate_to: localhost
  connection: local
- name: Check for pending restarts # noqa: no-handler
  when: apply_result is changed

- name: Display confirmed cluster status
  ansible.builtin.debug:
    msg: |
      === CONFIRMED CLUSTER STATUS ===
      Leader: {{ (verified_cluster_status.json.members | selectattr('role', 'equalto', 'leader') | map(attribute='name') | first) | default('UNKNOWN') }}
      Members:
      {% for member in verified_cluster_status.json.members %}
      - {{ member.name }} [{{ member.role | upper }}]
        State: {{ member.state | default('UNKNOWN') }}
        Lag: {{ member.lag | default(0) }}MB
        Timeline: {{ member.timeline | default('N/A') }}
        Pending restart: {{ member.pending_restart | default(false) | ternary('YES', 'NO') }}
      {% endfor %}
      Config Applied: {{ apply_result is changed | ternary('YES', 'NO') }}
      ================================
  delegate_to: localhost
  connection: local
  run_once: true
  block:
    - name: Get cluster status with retry
      ansible.builtin.uri:
        url: "http://{{ patroni_host }}:{{ patroni_api_port }}/cluster"
        method: GET
        return_content: yes
        status_code: 200
      register: cluster_status
      until: cluster_status.json is defined
      retries: 3
      delay: 2
      delegate_to: localhost
      run_once: true

    - name: Refresh cluster status
      ansible.builtin.uri:
        url: "http://{{ patroni_host }}:8008/cluster"
        method: GET
        return_content: yes
        status_code: 200
      register: refreshed_cluster_status
      delegate_to: localhost
      run_once: true
      when: verified_cluster_status is defined
    - name: Check restart flags
      ansible.builtin.set_fact:
        needs_restart: >-
          {{
            (cluster_status.json.members |
             map(attribute='pending_restart', default=false) |
             select('equalto', true) | list | length > 0) or
            (cluster_status.json.members |
             map(attribute='tags.pending_restart', default=false) |
             select('equalto', true) | list | length > 0)
          }}
        node_names: "{{ cluster_status.json.members | map(attribute='name') | list }}"
        node_info: >-
          {% set info = {} %}
          {% for member in cluster_status.json.members %}
          {% set _ = info.update({member.name: {'role': member.role}}) %}
          {% endfor %}
          {{ info }}
      run_once: true
  rescue:
    - name: Set no restart needed
      ansible.builtin.set_fact:
        needs_restart: false
      run_once: true

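Both pending-restart checks walk the member list returned by Patroni's `GET /cluster`. A trimmed, illustrative response showing the two places the filters look (`pending_restart` on the member itself and under `tags`); node names and values are made up:

```yaml
# Illustrative /cluster response (as YAML; the API returns JSON)
members:
  - name: node1
    role: leader
    state: running
    pending_restart: true   # set by Patroni when a changed parameter needs a restart
  - name: node2
    role: replica
    state: streaming
    tags:
      pending_restart: true
```
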
- name: Safe check for pending restarts
  ansible.builtin.set_fact:
    needs_restart: >-
      {{
        (refreshed_cluster_status.json.members |
         map(attribute='pending_restart', default=false) |
         select('equalto', true) | list | count > 0) or
        (refreshed_cluster_status.json.members |
         map(attribute='tags.pending_restart', default=false) |
         select('equalto', true) | list | count > 0)
      }}
    node_names: >-
      {{
        refreshed_cluster_status.json.members |
        map(attribute='name') |
        list
      }}
  when:
    - refreshed_cluster_status.json is defined
    - refreshed_cluster_status.json.members is defined
  run_once: true
  delegate_to: localhost

- name: Show restart warning if needed
- name: Display restart warning
  ansible.builtin.debug:
    msg: |
      {% if needs_restart %}
      ==================================
      WARNING: RESTART REQUIRED
      ==================================
      {% if autorestart %}
      ================================================
      WARNING: AUTOMATIC CLUSTER RESTART
      ================================================
      The following nodes will be restarted:
      {% for node in node_names %}
      - {{ node }} ({{ node_info[node].role | default('UNKNOWN') }})
      {% endfor %}

      Well, I could do this from the role myself, but what if the cluster is in production or somewhere like that!!!
      So it is safer to run the following command on one of the cluster nodes:
      To cancel, press Ctrl+C within 10 seconds
      {% else %}
      ============================================
      WARNING: MANUAL CLUSTER RESTART REQUIRED
      ============================================
      Run on one of the nodes:

      patronictl restart {{ node_names | join(' ') }}
      patronictl restart -c /etc/patroni.yml {{ node_names | join(' ') }}

      Affected nodes:
      {% for node in node_names %}
      - {{ node }}
      {% endfor %}
      Nodes to restart:
      {% for node in node_names %}
      - {{ node }} ({{ node_info[node].role | default('UNKNOWN') }})
      {% endfor %}
      {% endif %}
      {% else %}
      ==================================
      STATUS: No restart required
      ==================================
      ================================
      NO RESTART REQUIRED
      ================================
      {% endif %}
  delegate_to: localhost
  run_once: true
  when:
    - needs_restart is defined
    - node_names is defined
    - node_info is defined

- name: Archive old configurations
- name: Confirm automatic restart
  ansible.builtin.pause:
    prompt: "Confirm automatic cluster restart (Enter to continue, Ctrl+C to cancel)"
    seconds: 10
  when:
    - needs_restart | default(false)
    - autorestart | default(false)
  delegate_to: localhost
  run_once: true

- name: Execute cluster restart
  when:
    - needs_restart | default(false)
    - autorestart | bool
    - cluster_status is defined
    - cluster_status.json is defined
    - cluster_status.json.members is defined
  run_once: true
  block:
    - name: Find nodes needing restart
      ansible.builtin.set_fact:
        nodes_to_restart: >-
          {% set nodes = [] %}{% for member in cluster_status.json.members %}{% if member.pending_restart is defined and member.pending_restart or member.tags.pending_restart is defined and member.tags.pending_restart %}{% set _ = nodes.append(member) %}{% endif %}{% endfor %}{{ nodes }}

    - name: Restart nodes via API
      ansible.builtin.uri:
        url: "http://{{ item.host }}:{{ patroni_api_port }}/restart"
        method: POST
        body_format: json
        body:
          restart_pending: true
        timeout: 60
        status_code: [200, 503]
      loop: "{{ nodes_to_restart | default([]) }}"
      loop_control:
        label: "{{ item.name }}"
      register: restart_results
      ignore_errors: yes
      changed_when: >
        restart_results.status == 200 or
        restart_results.status == 503

    - name: Wait for cluster stabilization
      block:
        - name: Check cluster status until stable
          ansible.builtin.uri:
            url: "http://{{ patroni_host }}:{{ patroni_api_port }}/cluster"
            method: GET
            return_content: yes
            status_code: 200
          register: cluster_health
          until: >
            cluster_health.json.members |
            selectattr('state', 'match', '^(running|streaming)$') |
            list | length == cluster_health.json.members | length
          retries: 12
          delay: 10
          delegate_to: localhost
          run_once: true

    - name: Show restart results
      ansible.builtin.debug:
        msg: |
          ========================
          RESTART RESULTS
          ========================
          Node: {{ item.item.name }}
          Role: {{ item.item.role }}
          Status: {% if item.status == 200 %}Restarted successfully{% elif item.status == 503 %}Restart in progress{% else %}Error (code {{ item.status }}){% endif %}
          Elapsed: {{ item.elapsed }} s
          {% if item.item.pending_restart_reason is defined %}
          Restart reason:
          {% for param, values in item.item.pending_restart_reason.items() %}
          - {{ param }}: was {{ values.old_value }}, now {{ values.new_value }}
          {% endfor %}
          {% endif %}
          ------------------------
      loop: "{{ restart_results.results | default([]) }}"
      loop_control:
        label: ""
      run_once: true

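For context, each entry of `restart_results.results` consumed by `Show restart results` above combines the `uri` response fields with the looped member in `item`. An illustrative entry; the values are made up, and `pending_restart_reason` follows the old/new-value shape the template expects:

```yaml
# Hypothetical restart_results.results entry
- status: 200   # HTTP code from POST /restart
  elapsed: 4    # seconds the request took
  item:         # the cluster member processed in this loop iteration
    name: node1
    role: leader
    pending_restart_reason:
      shared_buffers:
        old_value: "4GB"
        new_value: "12GB"
```
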
- name: Archive old configurations # noqa: no-handler
  when: apply_result is changed
  run_once: true
  block:
    - name: Find old config files
      ansible.builtin.find:
@@ -117,14 +211,12 @@
      delegate_to: localhost
      connection: local

    - name: Remove excess configs (keep last 10)
    - name: Remove excess configs
      ansible.builtin.file:
        path: "{{ item.path }}"
        state: absent
      loop: "{{ (old_configs.files | sort(attribute='mtime'))[:-10] }}"
      when:
        - old_configs.matched > 10
        - apply_result is changed
      when: old_configs.matched > 10
      delegate_to: localhost
      connection: local
      notify: Log cleanup results
      notify: "Log cleanup"

@@ -1,5 +1,6 @@
---
- name: Prepare for cluster configuration changes
  import_playbook: prepare/deploy.yaml

- name: Apply cluster configuration changes
  import_playbook: apply/deploy.yaml

@@ -1,10 +1,13 @@
$ANSIBLE_VAULT;1.1;AES256
37376136623761343135636239653137353661303631663536613265366431333339663866643265
3033653765613632313661393166363238643137346330620a643233623433633963333035646466
34633366623262643165326331333937623064356131306663623362663663343861383735616365
3363646132393166310a353965346531616330396666383732656430633630323438326161323965
64323865636265303331663166393232376138663965613361623361303663353737623238373435
30316161616234356264643762653036626132613664316137646665323335663232393535353131
37636331646364313839653438323461353638363936623131626161353936303839393533326162
31623833313834646233303961656633633933386135396439373463623362316561313138643631
6663
37613833393263643830623437366465373832623162373161383334336162326635663538326537
3335386563373734636232356164636530393236353466610a366432353562343063376132643331
30656666326633616639383966386439663264306536396533343861656566343539376130343930
3932346663303035350a376233326363613763383139646262313531396466616635393166616435
30643637336364656432376436373161623438316165353534643135313831636565353638363734
61663964653362363533633664626435613738613538633761393231353435646463633661643839
61616239386133353964656133316463343036666234636132316334323865653937323830313065
34646633613736663362363631363131393439623137633162383235663938633237386439623562
61646233393030663464353864656362356138643635383561653063333839353139666432323765
37396633303231396631336264393032386561666534376635383962366365333934313734323632
62346631396162383438303434383031333662386132393434353832323631653533346363333534
36616639623533633639