Skip to content

Commit 554e2ac

Browse files
authored
feat: handle node drain failure without manual intervention by uncordoning node (#35)
1 parent 38815bb commit 554e2ac

File tree

6 files changed

+53
-20
lines changed

6 files changed

+53
-20
lines changed

README.md

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,5 @@ provider "kubernetes" {
110110
- [ ] Explore Keepalived `max_auto_priority` option
111111
- [ ] Assert podman version
112112
- [ ] Token rotation
113-
- [ ] Stop Zicanati at start of playbook and start at the end
113+
- [ ] Stop Zincati at start of playbook and start at the end
114114
- [ ] Knownhost management
115-
- [ ] Variables for fine-grained node draining (`--disable-eviction`, timeout, etc)
116-
- [ ] Handle node drain failure without manual intervention

ansible/playbook.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@
136136
__k3s_files: "{{ k3s_files }}"
137137
__k3s_kubelet_configs: "{{ k3s_kubelet_configs }}"
138138
__k3s_registries_config: "{{ k3s_registries_config }}"
139+
__k3s_drain_options: "{{ k3s_drain_options }}"
139140
- name: Configure existing k3s servers
140141
hosts: servers:!new_servers
141142
gather_facts: false
@@ -159,6 +160,7 @@
159160
__k3s_files: "{{ k3s_files }}"
160161
__k3s_kubelet_configs: "{{ k3s_kubelet_configs }}"
161162
__k3s_registries_config: "{{ k3s_registries_config }}"
163+
__k3s_drain_options: "{{ k3s_drain_options }}"
162164
- name: Configure new k3s agents
163165
hosts: new_agents
164166
gather_facts: false
@@ -181,6 +183,7 @@
181183
__k3s_files: "{{ k3s_files }}"
182184
__k3s_kubelet_configs: "{{ k3s_kubelet_configs }}"
183185
__k3s_registries_config: "{{ k3s_registries_config }}"
186+
__k3s_drain_options: "{{ k3s_drain_options }}"
184187
- name: Configure existing k3s agents
185188
hosts: agents:!new_agents
186189
gather_facts: false
@@ -204,6 +207,7 @@
204207
__k3s_files: "{{ k3s_files }}"
205208
__k3s_kubelet_configs: "{{ k3s_kubelet_configs }}"
206209
__k3s_registries_config: "{{ k3s_registries_config }}"
210+
__k3s_drain_options: "{{ k3s_drain_options }}"
207211
- name: Cleanup machines
208212
hosts: servers:agents
209213
gather_facts: false

ansible/roles/k3s/defaults/main.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,5 +26,9 @@ __k3s_selinux_download_baseurl: "https://github.com/k3s-io/k3s-selinux/releases/
2626
__k3s_selinux_download_package: "k3s-selinux-{{ __k3s_selinux_major_minor }}-{{ __k3s_selinux_patch }}.coreos.noarch.rpm"
2727
__k3s_selinux_download_checksum: sha256sum-coreos-noarch.txt
2828
__k3s_selinux_rpm_ostree_package: "k3s-selinux-{{ __k3s_selinux_major_minor }}-{{ __k3s_selinux_patch }}.coreos.noarch"
29-
__k3s_drain_opts: --ignore-daemonsets --delete-emptydir-data --force --timeout 10m
29+
__k3s_drain_options:
30+
deletion_fallback: false
31+
eviction_timeout: "30m"
32+
deletion_timeout: "10m"
33+
__k3s_drain_cli_options: --ignore-daemonsets --delete-emptydir-data --force
3034
__k3s_https_listen_port: 8443

ansible/roles/k3s/tasks/drain.yaml

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,23 @@
44
changed_when: true
55

66
- name: Drain node
7-
ansible.builtin.command:
8-
cmd: "{{ __k3s_binary_dir }}/k3s kubectl drain {{ __k3s_drain_opts }} {{ __k3s_node_config['node-name'] }}"
9-
register: __k3s_drain
10-
failed_when: __k3s_drain.rc not in [0, 1]
11-
changed_when: true
12-
13-
- name: Force drain node
14-
ansible.builtin.command:
15-
cmd: "{{ __k3s_binary_dir }}/k3s kubectl drain {{ __k3s_drain_opts }} --disable-eviction {{ __k3s_node_config['node-name'] }}"
16-
when: __k3s_drain.rc == 1
17-
changed_when: true
7+
block:
8+
- name: Drain node (evict)
9+
ansible.builtin.command:
10+
cmd: "{{ __k3s_binary_dir }}/k3s kubectl drain {{ __k3s_drain_cli_options }} --timeout {{ __k3s_drain_options.eviction_timeout }} {{ __k3s_node_config['node-name'] }}"
11+
register: __k3s_drain
12+
failed_when: __k3s_drain.rc != 0 and not __k3s_drain_options.deletion_fallback
13+
changed_when: true
14+
- name: Drain node (delete)
15+
ansible.builtin.command:
16+
cmd: "{{ __k3s_binary_dir }}/k3s kubectl drain {{ __k3s_drain_cli_options }} --timeout {{ __k3s_drain_options.deletion_timeout }} --disable-eviction {{ __k3s_node_config['node-name'] }}"
17+
when: __k3s_drain.rc != 0 and __k3s_drain_options.deletion_fallback
18+
changed_when: true
19+
rescue:
20+
- name: Uncordon node
21+
ansible.builtin.command:
22+
cmd: "{{ __k3s_binary_dir }}/k3s kubectl uncordon {{ __k3s_node_config['node-name'] }}"
23+
changed_when: true
24+
- name: Drain failed
25+
ansible.builtin.fail:
26+
msg: "Draining node {{ __k3s_node_config['node-name'] }} failed."

main.tf

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ locals {
2222
k3s_kubelet_configs = var.kubelet_configs
2323
k3s_registries_config = var.registries_config
2424
k3s_cleanup = var.cleanup
25+
k3s_drain_options = var.drain_options
2526
system_upgrade_trigger = var.system_upgrade_trigger
2627
haproxy_container_image = var.haproxy_container_image
2728
haproxy_container_image_tag = var.haproxy_container_image_tag
@@ -117,9 +118,9 @@ resource "ansible_navigator_run" "this" {
117118
}
118119
}
119120
timeouts = {
120-
create = "60m"
121-
update = "60m"
122-
delete = "60m"
121+
create = "2h"
122+
update = "4h"
123+
delete = "1h"
123124
}
124125
run_on_destroy = var.reset_on_destroy
125126
destroy_playbook = file("${path.module}/ansible/destroy_playbook.yaml")

variables.tf

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@ variable "server_machines" {
6565
config = optional(object({
6666
cluster_init = optional(bool, false)
6767
node_name = optional(string)
68-
with_node_id = optional(bool, false)
6968
node_ip = optional(string)
7069
node_external_ip = optional(string)
7170
node_label = optional(map(string), {})
@@ -102,7 +101,6 @@ variable "agent_machine_groups" {
102101
})
103102
config = optional(object({
104103
node_name = optional(string)
105-
with_node_id = optional(bool, false)
106104
node_ip = optional(string)
107105
node_external_ip = optional(string)
108106
node_label = optional(map(string), {})
@@ -248,6 +246,25 @@ variable "registries_config" {
248246
description = "Registry configuration to be used by k3s when generating the containerd configuration."
249247
}
250248

249+
variable "drain_options" {
250+
type = object({
251+
deletion_fallback = optional(bool, false)
252+
eviction_timeout = optional(string, "30m")
253+
deletion_timeout = optional(string, "10m")
254+
})
255+
nullable = false
256+
default = {}
257+
description = "Node drain options."
258+
validation {
259+
condition = can(timeadd(timestamp(), var.drain_options.eviction_timeout))
260+
error_message = "The eviction_timeout must be a valid duration (e.g., '10m', '30s', '1h30m')."
261+
}
262+
validation {
263+
condition = can(timeadd(timestamp(), var.drain_options.deletion_timeout))
264+
error_message = "The deletion_timeout must be a valid duration (e.g., '10m', '30s', '1h30m')."
265+
}
266+
}
267+
251268
variable "system_upgrade_trigger" {
252269
type = string
253270
nullable = false

0 commit comments

Comments
 (0)