Add playbook for offlining a cluster host safely
This commit is contained in:
parent
622481e231
commit
8f2ffd6619
131
playbooks/node-down.yaml
Normal file
131
playbooks/node-down.yaml
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
---
# Play 1: validate the operator's node selection and record, on every cluster
# host, the facts the "Offline node" play needs. Nothing is modified here; any
# failure aborts the whole run (any_errors_fatal).
- name: Check cluster state
  hosts: cluster
  any_errors_fatal: true
  vars_prompt:
    - name: skylab_node_down
      prompt: "Select node to offline (one of: {{ groups.cluster | join(', ') }})"
      private: false
  tasks:
    - name: Validate user input
      ansible.builtin.assert:
        that:
          - skylab_node_down in groups.cluster
        fail_msg: >-
          ERROR: Host '{{ skylab_node_down }}' is not a valid cluster node (one
          of: {{ groups.cluster | join(', ') }})

    # !unsafe stops Ansible from templating the Go template passed to docker
    - name: Fetch node swarm ID
      ansible.builtin.command:
        cmd: !unsafe docker info --format '{{ .Swarm.NodeID}}'
      changed_when: false
      register: _docker_node_id_raw

    # The node ID must be templated here, so the Go template is emitted as a
    # quoted Jinja string literal instead of using !unsafe
    - name: Fetch swarm node availability
      ansible.builtin.command:
        cmd: >-
          docker node inspect {{ _docker_node_id_raw.stdout.strip() }}
          --format '{{ '{{ .Spec.Availability}}' }}'
      changed_when: false
      register: _docker_node_availability_raw

    - name: Set common facts
      ansible.builtin.set_fact:
        _target_node: "{{ skylab_node_down }}"
        _docker_node_id: "{{ _docker_node_id_raw.stdout.strip() }}"
        _docker_node_availability: "{{ _docker_node_availability_raw.stdout.strip() }}"
        # Use the next host in the group; the modulo wraps back to the first
        # host when the target is the last member of the group
        _target_alt: >-
          {{ groups.cluster[
            (groups.cluster.index(skylab_node_down) + 1) % (groups.cluster | length)
          ] }}

    # Primary IPv4 address of the cluster interface followed by every secondary
    # address. Built in a single expression so the fact is always defined, even
    # when the interface reports no secondary addresses (a loop over an empty
    # list would never run set_fact at all).
    - name: Set common fact for node addresses
      vars:
        _cluster_interface_facts: "{{ lookup('vars', 'ansible_' + skylab_cluster.interface) }}"
      ansible.builtin.set_fact:
        _node_addresses: >-
          {{
            [_cluster_interface_facts.ipv4.address]
            + (_cluster_interface_facts.ipv4_secondaries
               | default([])
               | map(attribute='address')
               | list)
          }}

    # Only the node being offlined needs this flag; it is read by the
    # "Offline node" play to decide whether services must be drained
    - name: Set facts for target node
      when: inventory_hostname == _target_node
      ansible.builtin.set_fact:
        _needs_docker_migration: "{{ (_docker_node_availability | lower != 'drain') | bool }}"

    # Every node other than the target must hold its configured cluster address
    # and be an active swarm member.
    # NOTE(review): fail_msg only describes the availability condition; a
    # missing address produces the same message - consider splitting the checks.
    - name: Check cluster settings
      when: inventory_hostname != _target_node
      ansible.builtin.assert:
        that:
          - skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
          - _docker_node_availability | lower == 'active'
        fail_msg: >-
          ERROR: Node '{{ inventory_hostname }}' is already marked as unavailable. All cluster
          nodes must be available before a new node can be moved to unavailable status.
|
||||||
|
|
||||||
|
# Play 2: drain the selected node and move its cluster address to the alternate
# node. Relies on facts recorded by the "Check cluster state" play
# (_target_node, _needs_docker_migration, _docker_node_id, _node_addresses,
# _target_alt).
- name: Offline node
  # vars_prompt answers are scoped to the play that asked for them, so when
  # skylab_node_down was not supplied as an extra var fall back to the
  # _target_node fact that the previous play stored on the cluster hosts.
  hosts: >-
    {{ skylab_node_down | default(hostvars[groups.cluster | first]._target_node) }}
  tasks:
    - name: Migrate services off target node
      when: _needs_docker_migration
      block:
        # Snapshot of the service list; the wait task below polls until the
        # output matches this again
        - name: Fetch current cluster service state
          ansible.builtin.command:
            cmd: !unsafe docker service ls --format '{{json .}}'
          changed_when: false
          register: _cluster_service_prestate

        # Not saved: the rule is only disabled in the running config for the
        # duration of the migration
        - name: Disable NAT rule {{ _skylab_adguard_nat_rule }}
          delegate_to: core
          connection: ansible.netcommon.network_cli
          community.network.edgeos_config:
            lines:
              - set service nat rule {{ _skylab_adguard_nat_rule }} disable

        # Everything that runs while the NAT rule is disabled lives in this
        # inner block so the rule is re-enabled even if draining or waiting
        # fails; the failure itself still propagates after 'always' runs
        - name: Drain node while NAT rule is disabled
          block:
            - name: Update node availability
              vars:
                ansible_python_interpreter: "{{ skylab_state_dir }}/ansible-runtime/bin/python"
              community.docker.docker_node:
                availability: drain
                hostname: "{{ _docker_node_id }}"
              register: _node_availability_status

            - name: Wait for services to shutdown
              ansible.builtin.pause:
                seconds: 10

            # Polls every 5 seconds, up to 120 times, until the service list
            # output is identical to the pre-drain snapshot
            - name: Wait for services to migrate
              ansible.builtin.command:
                cmd: !unsafe docker service ls --format '{{json .}}'
              changed_when: false
              register: _cluster_service_poststate
              until: _cluster_service_poststate.stdout == _cluster_service_prestate.stdout
              retries: 120
              delay: 5

          always:
            - name: Enable NAT rule {{ _skylab_adguard_nat_rule }}
              delegate_to: core
              connection: ansible.netcommon.network_cli
              community.network.edgeos_config:
                lines:
                  - delete service nat rule {{ _skylab_adguard_nat_rule }} disable
                save: true

    # Skipped when the address was not present on the node during the checks
    - name: Delete address from node
      become: true
      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
      ansible.builtin.command:
        cmd: >-
          ip address delete
          {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }}
          dev {{ skylab_cluster.interface }}
      changed_when: true

    # Runs on the alternate node via delegation; skylab_cluster.address is still
    # the target node's address, while the interface name is the alternate's
    - name: Assign address to alt node
      delegate_to: "{{ _target_alt }}"
      become: true
      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') not in hostvars[_target_alt]._node_addresses
      ansible.builtin.command:
        cmd: >-
          ip address add
          {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }}
          dev {{ hostvars[_target_alt].skylab_cluster.interface }}
      changed_when: true
|
Reference in New Issue
Block a user