merged k3s-install into master

This commit is contained in:
Karim Naufal 2022-02-17 22:11:54 +01:00
commit 75789a11f3
12 changed files with 231 additions and 88 deletions

5
.gitignore vendored
View File

@ -4,4 +4,9 @@ crash.log
kubeconfig.yaml
kubeconfig.yaml-e
terraform.tfvars
plans.yaml
traefik_config.yaml

View File

@ -128,20 +128,26 @@ By default, we have 3 control planes configured and 2 agents, with automatic upg
**But if you want to remain HA, it's important to keep the number of control plane nodes at 3 or more (2 are needed to maintain quorum when 1 goes down, for instance during automated upgrades and reboots), see [Rancher's doc on HA](https://rancher.com/docs/k3s/latest/en/installation/ha-embedded/).**
Otherwise, it's important to turn off automatic upgrades (see below) and reboots for the control-plane nodes (2 or less), and do the maintenance yourself.
Otherwise, it's important to turn off automatic upgrades (see below) for the control-plane nodes (2 or less), and do the maintenance yourself.
## Automatic upgrade
By default, MicroOS and its embedded k3s instance get upgraded automatically on each node, and reboot safely via [Kured](https://github.com/weaveworks/kured) installed in the cluster.
By default, MicroOS gets upgraded automatically on each node, and reboots safely via [Kured](https://github.com/weaveworks/kured) installed in the cluster.
_About [Kured](https://github.com/weaveworks/kured): it does not have a `latest` tag for its image, but it's pretty compatible, so you can just manually update the tag once a year, for instance._
As for k3s, it is also upgraded automatically thanks to Rancher's <https://github.com/rancher/system-upgrade-controller>. By default it follows the k3s `stable` channel, but you can also change to the `latest` one if needed, or specify a target version to upgrade to via the upgrade plan. You can copy and modify the one in the templates for that! More on the subject in [k3s upgrades basic](https://rancher.com/docs/k3s/latest/en/upgrades/basic/).
_Last but not least, if you wish to turn off automatic upgrade on a specific node, you need to ssh into it and issue the following command:_
_If you wish to turn off automatic MicroOS upgrades on a specific node, you need to ssh into it and issue the following command:_
```sh
systemctl --now disable transactional-update.timer
```
_To turn off k3s upgrades on a node, you can either remove its `k3s_upgrade=true` label, or set that label to `false`. To just remove it, apply:_
```sh
kubectl -n system-upgrade label node <node-name> k3s_upgrade-
```
## Takedown
If you want to take down the cluster, you can proceed as follows:

View File

@ -31,17 +31,21 @@ resource "hcloud_server" "agents" {
destination = "/root/config.ign"
}
# Combustion script file to install k3s-selinux
provisioner "file" {
content = local.combustion_script
destination = "/root/script"
}
# Install MicroOS
provisioner "remote-exec" {
inline = local.MicroOS_install_commands
inline = local.microOS_install_commands
}
# Issue a reboot command
# Issue a reboot command and wait for the node to reboot
provisioner "local-exec" {
command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 3"
command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 5"
}
# Wait for MicroOS to reboot and be ready
provisioner "local-exec" {
command = <<-EOT
until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null
@ -52,50 +56,42 @@ resource "hcloud_server" "agents" {
EOT
}
# Generating and uploading the agent.conf file
provisioner "file" {
content = templatefile("${path.module}/templates/agent.conf.tpl", {
server = "https://${local.first_control_plane_network_ip}:6443"
token = random_password.k3s_token.result
})
destination = "/etc/rancher/k3s/agent.conf"
}
# Generating k3s agent config file
provisioner "file" {
content = yamlencode({
node-name = self.name
server = "https://${local.first_control_plane_network_ip}:6443"
token = random_password.k3s_token.result
kubelet-arg = "cloud-provider=external"
flannel-iface = "eth1"
node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 257 + count.index)
node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 513 + count.index)
node-label = var.automatically_upgrade_k3s ? ["k3s_upgrade=true"] : []
})
destination = "/etc/rancher/k3s/config.yaml"
destination = "/tmp/config.yaml"
}
# Run the agent
# Install k3s agent
provisioner "remote-exec" {
inline = local.install_k3s_agent
}
# Upon reboot verify that k3s agent starts correctly
provisioner "remote-exec" {
inline = [
# set the hostname in a persistent fashion
"hostnamectl set-hostname ${self.name}",
# first we disable automatic reboot (after transactional updates), and configure the reboot method as kured
"rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf",
# then we start k3s agent and join the cluster
"systemctl enable k3s-agent",
<<-EOT
until systemctl status k3s-agent > /dev/null
do
systemctl start k3s-agent
echo "Starting k3s-agent and joining the cluster..."
timeout 120 bash <<EOF
until systemctl status k3s-agent > /dev/null; do
echo "Waiting for the k3s agent to start..."
sleep 2
done
EOF
EOT
]
}
network {
network_id = hcloud_network.k3s.id
ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 257 + count.index)
ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 513 + count.index)
}
depends_on = [

View File

@ -1,5 +1,5 @@
locals {
first_control_plane_network_ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 2)
first_control_plane_network_ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 257)
hcloud_image_name = "ubuntu-20.04"
ssh_public_key = trimspace(file(var.public_key))
@ -18,11 +18,12 @@ locals {
csi_version = var.hetzner_csi_version != null ? var.hetzner_csi_version : data.github_release.hetzner_csi.release_tag
kured_version = data.github_release.kured.release_tag
MicroOS_install_commands = [
microOS_install_commands = [
"set -ex",
"apt-get update",
"apt-get install -y aria2",
"aria2c --follow-metalink=mem https://download.opensuse.org/tumbleweed/appliances/openSUSE-MicroOS.x86_64-k3s-kvm-and-xen.qcow2.meta4",
"qemu-img convert -p -f qcow2 -O host_device $(ls -a | grep -ie '^opensuse.*microos.*k3s.*qcow2$') /dev/sda",
"aria2c --follow-metalink=mem https://download.opensuse.org/tumbleweed/appliances/openSUSE-MicroOS.x86_64-kvm-and-xen.qcow2.meta4",
"qemu-img convert -p -f qcow2 -O host_device $(ls -a | grep -ie '^opensuse.*microos.*qcow2$') /dev/sda",
"sgdisk -e /dev/sda",
"parted -s /dev/sda resizepart 4 99%",
"parted -s /dev/sda mkpart primary ext2 99% 100%",
@ -32,6 +33,31 @@ locals {
"mount /dev/sda5 /mnt",
"mkdir /mnt/ignition",
"cp /root/config.ign /mnt/ignition/config.ign",
"mkdir /mnt/combustion",
"cp /root/script /mnt/combustion/script",
"umount /mnt"
]
combustion_script = <<EOF
#!/bin/bash
# combustion: network
rpm --import https://rpm.rancher.io/public.key
zypper refresh
zypper --gpg-auto-import-keys install -y https://rpm.rancher.io/k3s/stable/common/microos/noarch/k3s-selinux-0.4-1.sle.noarch.rpm
udevadm settle
EOF
common_commands_install_k3s = [
"set -ex",
# first we disable automatic reboot (after transactional updates), and configure the reboot method as kured
"rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf",
# prepare the k3s config directory
"mkdir -p /etc/rancher/k3s",
# move the config file into place
"mv /tmp/config.yaml /etc/rancher/k3s/config.yaml"
]
install_k3s_server = concat(local.common_commands_install_k3s, ["curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_SELINUX_RPM=true INSTALL_K3S_SKIP_START=true INSTALL_K3S_EXEC=server sh -"])
install_k3s_agent = concat(local.common_commands_install_k3s, ["curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_SELINUX_RPM=true INSTALL_K3S_EXEC=agent sh -"])
}

View File

@ -29,17 +29,21 @@ resource "hcloud_server" "first_control_plane" {
destination = "/root/config.ign"
}
# Combustion script file to install k3s-selinux
provisioner "file" {
content = local.combustion_script
destination = "/root/script"
}
# Install MicroOS
provisioner "remote-exec" {
inline = local.MicroOS_install_commands
inline = local.microOS_install_commands
}
# Issue a reboot command
# Issue a reboot command and wait for the node to reboot
provisioner "local-exec" {
command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 3"
command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 5"
}
# Wait for MicroOS to reboot and be ready
provisioner "local-exec" {
command = <<-EOT
until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null
@ -63,36 +67,36 @@ resource "hcloud_server" "first_control_plane" {
advertise-address = local.first_control_plane_network_ip
token = random_password.k3s_token.result
node-taint = var.allow_scheduling_on_control_plane ? [] : ["node-role.kubernetes.io/master:NoSchedule"]
node-label = var.automatically_upgrade_k3s ? ["k3s_upgrade=true"] : []
})
destination = "/etc/rancher/k3s/config.yaml"
destination = "/tmp/config.yaml"
}
# Run the first control plane
# Install k3s server
provisioner "remote-exec" {
inline = local.install_k3s_server
}
# Upon reboot verify that the k3s server starts, and wait for k3s to be ready to receive commands
provisioner "remote-exec" {
inline = [
# set the hostname in a persistent fashion
"hostnamectl set-hostname ${self.name}",
# first we disable automatic reboot (after transactional updates), and configure the reboot method as kured
"rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf",
# prepare a directory for our post-installation kustomizations
"systemctl start k3s",
# prepare the post_install directory
"mkdir -p /tmp/post_install",
# then we initiate the cluster
"systemctl enable k3s-server",
# wait for k3s to get ready
# wait for k3s to become ready
<<-EOT
timeout 120 bash <<EOF
until systemctl status k3s-server > /dev/null; do
systemctl start k3s-server
echo "Initiating the cluster..."
sleep 1
until systemctl status k3s > /dev/null; do
echo "Waiting for the k3s server to start..."
sleep 2
done
until [ -e /etc/rancher/k3s/k3s.yaml ]; do
echo "Waiting for kubectl config..."
sleep 1
sleep 2
done
until [[ "\$(kubectl get --raw='/readyz' 2> /dev/null)" == "ok" ]]; do
echo "Waiting for the cluster to become ready..."
sleep 1
sleep 2
done
EOF
EOT
@ -108,11 +112,13 @@ resource "hcloud_server" "first_control_plane" {
"https://github.com/hetznercloud/hcloud-cloud-controller-manager/releases/download/${local.ccm_version}/ccm-networks.yaml",
"https://raw.githubusercontent.com/hetznercloud/csi-driver/${local.csi_version}/deploy/kubernetes/hcloud-csi.yml",
"https://github.com/weaveworks/kured/releases/download/${local.kured_version}/kured-${local.kured_version}-dockerhub.yaml",
"./traefik.yaml"
"https://raw.githubusercontent.com/rancher/system-upgrade-controller/master/manifests/system-upgrade-controller.yaml",
"traefik.yaml",
]
patchesStrategicMerge = [
file("${path.module}/patches/kured.yaml"),
file("${path.module}/patches/ccm.yaml")
file("${path.module}/patches/ccm.yaml"),
file("${path.module}/patches/system-upgrade-controller.yaml")
]
})
destination = "/tmp/post_install/kustomization.yaml"
@ -132,9 +138,20 @@ resource "hcloud_server" "first_control_plane" {
destination = "/tmp/post_install/traefik.yaml"
}
# Upload the system upgrade controller plans config
provisioner "file" {
content = templatefile(
"${path.module}/templates/plans.yaml.tpl",
{
channel = var.k3s_upgrade_channel
})
destination = "/tmp/post_install/plans.yaml"
}
# Deploy secrets, logging is automatically disabled due to sensitive variables
provisioner "remote-exec" {
inline = [
"set -ex",
"kubectl -n kube-system create secret generic hcloud --from-literal=token=${var.hcloud_token} --from-literal=network=${hcloud_network.k3s.name}",
"kubectl -n kube-system create secret generic hcloud-csi --from-literal=token=${var.hcloud_token}",
]
@ -143,6 +160,7 @@ resource "hcloud_server" "first_control_plane" {
# Deploy our post-installation kustomization
provisioner "remote-exec" {
inline = [
"set -ex",
# This ugly hack is here, because terraform serializes the
# embedded yaml files with "- |2", when there is more than
# one yamldocument in the embedded file. Kustomize does not understand
@ -153,6 +171,9 @@ resource "hcloud_server" "first_control_plane" {
# manifests themselves
"sed -i 's/^- |[0-9]\\+$/- |/g' /tmp/post_install/kustomization.yaml",
"kubectl apply -k /tmp/post_install",
"echo 'Waiting for the system-upgrade-controller deployment to become available...'",
"kubectl -n system-upgrade wait --for=condition=available --timeout=120s deployment/system-upgrade-controller",
"kubectl -n system-upgrade apply -f /tmp/post_install/plans.yaml"
]
}

View File

@ -0,0 +1,18 @@
# Partial Deployment manifest for the system-upgrade-controller: it only declares
# the fields needed to mount the host's CA certificate directory into the
# controller container.
# NOTE(review): presumably consumed as a kustomize strategic-merge patch on top of
# the upstream system-upgrade-controller manifest — confirm against kustomization.yaml.
apiVersion: apps/v1
kind: Deployment
metadata:
# name and namespace identify the Deployment this manifest targets
name: system-upgrade-controller
namespace: system-upgrade
spec:
template:
spec:
containers:
- name: system-upgrade-controller
volumeMounts:
# mount the "ca-certificates" volume (declared below) at the same path as on the host
- name: ca-certificates
mountPath: /var/lib/ca-certificates
volumes:
- name: ca-certificates
hostPath:
path: /var/lib/ca-certificates
# "Directory" requires the path to already exist on the node
type: Directory

View File

@ -30,17 +30,21 @@ resource "hcloud_server" "control_planes" {
destination = "/root/config.ign"
}
# Combustion script file to install k3s-selinux
provisioner "file" {
content = local.combustion_script
destination = "/root/script"
}
# Install MicroOS
provisioner "remote-exec" {
inline = local.MicroOS_install_commands
inline = local.microOS_install_commands
}
# Issue a reboot command
# Issue a reboot command and wait for the node to reboot
provisioner "local-exec" {
command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 3"
command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 5"
}
# Wait for MicroOS to reboot and be ready
provisioner "local-exec" {
command = <<-EOT
until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null
@ -56,43 +60,44 @@ resource "hcloud_server" "control_planes" {
content = yamlencode({
node-name = self.name
server = "https://${local.first_control_plane_network_ip}:6443"
token = random_password.k3s_token.result
cluster-init = true
disable-cloud-controller = true
disable = "servicelb, local-storage"
flannel-iface = "eth1"
kubelet-arg = "cloud-provider=external"
node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index)
advertise-address = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index)
tls-san = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index)
token = random_password.k3s_token.result
node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index)
advertise-address = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index)
tls-san = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index)
node-taint = var.allow_scheduling_on_control_plane ? [] : ["node-role.kubernetes.io/master:NoSchedule"]
node-label = var.automatically_upgrade_k3s ? ["k3s_upgrade=true"] : []
})
destination = "/etc/rancher/k3s/config.yaml"
destination = "/tmp/config.yaml"
}
# Run an other control plane server
# Install k3s server
provisioner "remote-exec" {
inline = local.install_k3s_server
}
# Upon reboot verify that the k3s server starts correctly
provisioner "remote-exec" {
inline = [
# set the hostname in a persistent fashion
"hostnamectl set-hostname ${self.name}",
# first we disable automatic reboot (after transactional updates), and configure the reboot method as kured
"rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf",
# then then we start k3s in server mode and join the cluster
"systemctl enable k3s-server",
"systemctl start k3s",
<<-EOT
until systemctl status k3s-server > /dev/null
do
systemctl start k3s-server
echo "Waiting on other 'learning' control planes, patience is the mother of all virtues..."
timeout 120 bash <<EOF
until systemctl status k3s > /dev/null; do
echo "Waiting for the k3s server to start..."
sleep 2
done
EOF
EOT
]
}
network {
network_id = hcloud_network.k3s.id
ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index)
ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index)
}
depends_on = [

View File

@ -1,3 +0,0 @@
SERVER_URL="${server}"
NODE_TOKEN="${token}"
AGENT_OPTS=""

50
templates/plans.yaml.tpl Normal file
View File

@ -0,0 +1,50 @@
# Doc: https://rancher.com/docs/k3s/latest/en/upgrades/automated/
# Template defining two upgrade Plans (upgrade.cattle.io/v1) in the system-upgrade
# namespace: one for agent nodes and one for server (master) nodes.
# The "channel" template variable is appended to the update.k3s.io channel URL in both plans.
# agent plan
apiVersion: upgrade.cattle.io/v1
kind: Plan
metadata:
name: k3s-agent
namespace: system-upgrade
labels:
k3s_upgrade: agent
spec:
# upgrade one node at a time
concurrency: 1
channel: https://update.k3s.io/v1-release/channels/${channel}
# Selects nodes that carry a k3s_upgrade label whose value is neither "disabled"
# nor "false", and that are not labelled as master.
nodeSelector:
matchExpressions:
- {key: k3s_upgrade, operator: Exists}
- {key: k3s_upgrade, operator: NotIn, values: ["disabled", "false"]}
- {key: node-role.kubernetes.io/master, operator: NotIn, values: ["true"]}
serviceAccountName: system-upgrade
# NOTE(review): the second argument matches the name of the server plan below;
# presumably this makes agents wait for the server plan to complete — confirm against the linked doc.
prepare:
image: rancher/k3s-upgrade
args: ["prepare", "k3s-server"]
# agent nodes are drained before the upgrade
drain:
force: true
skipWaitForDeleteTimeout: 60
upgrade:
image: rancher/k3s-upgrade
---
# server plan
apiVersion: upgrade.cattle.io/v1
kind: Plan
metadata:
name: k3s-server
namespace: system-upgrade
labels:
k3s_upgrade: server
spec:
# upgrade one node at a time
concurrency: 1
channel: https://update.k3s.io/v1-release/channels/${channel}
# Same k3s_upgrade label conditions as the agent plan, but restricted to nodes
# labelled as master.
nodeSelector:
matchExpressions:
- {key: k3s_upgrade, operator: Exists}
- {key: k3s_upgrade, operator: NotIn, values: ["disabled", "false"]}
- {key: node-role.kubernetes.io/master, operator: In, values: ["true"]}
# tolerate the master NoSchedule taint and the CriticalAddonsOnly NoExecute taint
tolerations:
- {key: node-role.kubernetes.io/master, effect: NoSchedule, operator: Exists}
- {key: CriticalAddonsOnly, effect: NoExecute, operator: Exists}
serviceAccountName: system-upgrade
# server nodes are cordoned (no drain section is declared for this plan)
cordon: true
upgrade:
image: rancher/k3s-upgrade

View File

@ -10,9 +10,9 @@ spec:
type: LoadBalancer
annotations:
"load-balancer.hetzner.cloud/name": "traefik"
# make hetzners load-balancer connect to our nodes via our private k3s-net.
# make hetzner's load-balancer connect to our nodes via our private k3s network
"load-balancer.hetzner.cloud/use-private-ip": "true"
# keep hetzner-ccm from exposing our private ingress ip, which in general isn't routeable from the public internet.
# keep hetzner-ccm from exposing our private ingress ip, which in general isn't routeable from the public internet
"load-balancer.hetzner.cloud/disable-private-ingress": "true"
# disable ipv6 by default, because external-dns doesn't support AAAA for hcloud yet https://github.com/kubernetes-sigs/external-dns/issues/2044
"load-balancer.hetzner.cloud/ipv6-disabled": "${lb_disable_ipv6}"

View File

@ -29,3 +29,10 @@ agents_num = 2
# If you want to allow non-control-plane workloads to run on the control-plane nodes set "true" below. The default is "false".
# allow_scheduling_on_control_plane = true
# If you want to disable automatic upgrade of k3s (stable channel), you can set this to false, default is "true".
# automatically_upgrade_k3s = false
# If you would like to specify the k3s upgrade channel from the get go, you can do so, the default is "stable".
# For a list of available channels, see https://rancher.com/docs/k3s/latest/en/upgrades/basic/ and https://update.k3s.io/v1-release/channels
# k3s_upgrade_channel = "latest"

View File

@ -84,3 +84,15 @@ variable "allow_scheduling_on_control_plane" {
default = false
description = "Whether to allow non-control-plane workloads to run on the control-plane nodes"
}
# Release channel name passed to the upgrade plans template as `channel`,
# where it is appended to the update.k3s.io channels URL.
variable "k3s_upgrade_channel" {
  description = "Allows you to specify the k3s upgrade channel"
  type        = string
  default     = "stable"
}
# Toggle for automatic k3s upgrades: when true, the node configs add the
# `k3s_upgrade=true` node label; when false, no such label is set.
variable "automatically_upgrade_k3s" {
  description = "Whether to automatically upgrade k3s based on the selected channel"
  type        = bool
  default     = true
}