Add playbook for offlining a cluster host safely

This commit is contained in:
Ethan Paul 2021-10-28 00:26:34 -04:00
parent 622481e231
commit 8f2ffd6619
No known key found for this signature in database
GPG Key ID: 6A337337DF6B5B1A

131
playbooks/node-down.yaml Normal file
View File

@ -0,0 +1,131 @@
---
# Play 1: prompt the operator for the cluster node to take offline, record the
# per-host facts (swarm node ID and availability, interface addresses, target and
# alternate node names) and verify that every other cluster node is in a good
# state before anything is changed.
- name: Check cluster state
  hosts: cluster
  # A failure on any single host stops the play for every host
  any_errors_fatal: true
  vars_prompt:
    - name: skylab_node_down
      prompt: "Select node to offline (one of: {{ groups.cluster | join(', ') }})"
      private: false
  tasks:
    - name: Validate user input
      ansible.builtin.assert:
        that:
          - skylab_node_down in groups.cluster
        fail_msg: >-
          ERROR: Host '{{ skylab_node_down }}' is not a valid cluster node (one
          of: {{ groups.cluster | join(', ') }})

    # !unsafe stops Ansible from templating the Go-template braces meant for docker
    - name: Fetch node swarm ID
      ansible.builtin.command:
        cmd: !unsafe docker info --format '{{ .Swarm.NodeID}}'
      changed_when: false
      register: _docker_node_id_raw

    # The Go template is wrapped in a Jinja string literal so that it reaches docker
    # verbatim while the node ID on the same line is still templated by Ansible
    - name: Fetch swarm node availability
      ansible.builtin.command:
        cmd: docker node inspect {{ _docker_node_id_raw.stdout.strip() }} --format '{{ '{{ .Spec.Availability}}' }}'
      changed_when: false
      register: _docker_node_availability_raw

    - name: Set common facts
      ansible.builtin.set_fact:
        _target_node: "{{ skylab_node_down }}"
        _docker_node_id: "{{ _docker_node_id_raw.stdout.strip() }}"
        _docker_node_availability: "{{ _docker_node_availability_raw.stdout.strip() }}"
        # Use the next host in the group, wrapping around to the first host when the
        # target is the last host in the group. The input was validated above, so
        # index() always finds the target.
        _target_alt: "{{ groups.cluster[(groups.cluster.index(skylab_node_down) + 1) % (groups.cluster | length)] }}"

    # Collect the primary address plus every secondary address of the cluster
    # interface. The ipv4_secondaries fact is absent when the interface has no
    # secondary addresses, so it defaults to an empty list; the fact is therefore
    # always defined, even for a host with a single address.
    - name: Set common fact for node addresses
      vars:
        _cluster_interface_facts: "{{ lookup('vars', 'ansible_' + skylab_cluster.interface) }}"
      ansible.builtin.set_fact:
        _node_addresses: >-
          {{ [_cluster_interface_facts.ipv4.address]
          + (_cluster_interface_facts.ipv4_secondaries | default([]) | map(attribute='address') | list) }}

    # Only the target node needs to know whether it still has to be drained
    - name: Set facts for target node
      when: inventory_hostname == _target_node
      ansible.builtin.set_fact:
        _needs_docker_migration: "{{ (_docker_node_availability | lower != 'drain') | bool }}"

    # Every node other than the target must hold its cluster address and be an
    # active swarm node.
    # NOTE(review): the failure message only describes the availability condition;
    # it is also shown when the address condition fails.
    - name: Check cluster settings
      when: inventory_hostname != _target_node
      ansible.builtin.assert:
        that:
          - skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
          - _docker_node_availability | lower == 'active'
        fail_msg: >-
          ERROR: Node '{{ inventory_hostname }}' is already marked as unavailable. All cluster
          nodes must be available before a new node can be moved to unavailable status.
# Play 2: drain the selected node in the swarm (when it is not drained already)
# and move its cluster address over to the alternate node.
- name: Offline node
  # The vars_prompt answer from the previous play is scoped to that play, so the
  # target is read from the _target_node fact that play stored on every cluster
  # host. This also works when skylab_node_down is supplied with --extra-vars.
  hosts: "{{ hostvars[groups.cluster[0]]._target_node }}"
  tasks:
    - name: Migrate services off target node
      when: _needs_docker_migration
      block:
        # Snapshot of the service list taken before the drain; used below as the
        # state the cluster has to return to
        - name: Fetch current cluster service state
          ansible.builtin.command:
            cmd: !unsafe docker service ls --format '{{json .}}'
          changed_when: false
          register: _cluster_service_prestate

        # Not saved: the rule is only disabled in the router's running configuration
        - name: Disable NAT rule {{ _skylab_adguard_nat_rule }}
          delegate_to: core
          connection: ansible.netcommon.network_cli
          community.network.edgeos_config:
            lines:
              - set service nat rule {{ _skylab_adguard_nat_rule }} disable

        - name: Update node availability
          vars:
            # Run the docker module with the interpreter from the ansible-runtime
            # environment under skylab_state_dir
            ansible_python_interpreter: "{{ skylab_state_dir }}/ansible-runtime/bin/python"
          community.docker.docker_node:
            availability: drain
            hostname: "{{ _docker_node_id }}"
          register: _node_availability_status

        - name: Wait for services to shutdown
          ansible.builtin.pause:
            seconds: 10

        # Poll every 5 seconds, up to 120 times, until the service list output is
        # identical to the snapshot taken before the drain
        - name: Wait for services to migrate
          ansible.builtin.command:
            cmd: !unsafe docker service ls --format '{{json .}}'
          changed_when: false
          register: _cluster_service_poststate
          until: _cluster_service_poststate.stdout == _cluster_service_prestate.stdout
          retries: 120
          delay: 5

        # Remove the disable flag again and save the router configuration
        - name: Enable NAT rule {{ _skylab_adguard_nat_rule }}
          delegate_to: core
          connection: ansible.netcommon.network_cli
          community.network.edgeos_config:
            lines:
              - delete service nat rule {{ _skylab_adguard_nat_rule }} disable
            save: true

    # Skipped when the address was not found on the node during the first play
    - name: Delete address from node
      become: true
      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') in _node_addresses
      ansible.builtin.command:
        cmd: ip address delete {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }} dev {{ skylab_cluster.interface }}
      changed_when: true

    # Runs on the alternate node but is templated with the target node's variables,
    # so the address being added is the target's; only the interface name is taken
    # from the alternate node
    - name: Assign address to alt node
      delegate_to: "{{ _target_alt }}"
      become: true
      when: skylab_cluster.address | ansible.netcommon.ipaddr('address') not in hostvars[_target_alt]._node_addresses
      ansible.builtin.command:
        cmd: ip address add {{ skylab_cluster.address | ansible.netcommon.ipaddr('host/prefix') }} dev {{ hostvars[_target_alt].skylab_cluster.interface }}
      changed_when: true