diff --git a/.gitignore b/.gitignore index 1202e92..228a291 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,9 @@ crash.log kubeconfig.yaml kubeconfig.yaml-e terraform.tfvars -plans.yaml \ No newline at end of file +<<<<<<< HEAD +plans.yaml +======= +plans.yaml +traefik_config.yaml +>>>>>>> k3s-install diff --git a/README.md b/README.md index f9a2a77..d4c0581 100644 --- a/README.md +++ b/README.md @@ -128,20 +128,26 @@ By default, we have 3 control planes configured and 2 agents, with automatic upg **But if you want to remain HA, it's important to keep a number of control planes nodes of at least 3 (2 to maintain quorum when 1 goes down for automated upgrades and reboot for instance), see [Rancher's doc on HA](https://rancher.com/docs/k3s/latest/en/installation/ha-embedded/).** -Otherwise, it's important to turn off automatic upgrades (see below) and reboots for the control-plane nodes (2 or less), and do the maintenance yourself. +Otherwise, it's important to turn off automatic upgrades (see below) for the control-plane nodes (2 or less), and do the maintenance yourself. ## Automatic upgrade -By default, MicroOS and its embedded k3s instance get upgraded automatically on each node, and reboot safely via [Kured](https://github.com/weaveworks/kured) installed in the cluster. +By default, MicroOS gets upgraded automatically on each node, and reboots safely via [Kured](https://github.com/weaveworks/kured) installed in the cluster. -_About [Kured](https://github.com/weaveworks/kured), it does not have a latest tag present for its image, but it's pretty compatible, so you can just manually update the tag from once every year for instance._ +As for k3s, it is also upgraded automatically thanks to Rancher's [system upgrade controller](https://github.com/rancher/system-upgrade-controller). By default it follows the k3s `stable` channel, but you can also change to the `latest` one if needed, or specify a target version to upgrade to via the upgrade plan. You can copy and modify the one in the templates for that! 
More on the subject in [k3s upgrades basic](https://rancher.com/docs/k3s/latest/en/upgrades/basic/). -_Last but not least, if you wish to turn off automatic upgrade on a specific node, you need to ssh into it and issue the following command:_ +_If you wish to turn off automatic MicroOS upgrades on a specific node, you need to ssh into it and issue the following command:_ ```sh systemctl --now disable transactional-update.timer ``` +_To turn off k3s upgrades, you can either remove the `k3s_upgrade=true` label from the node you want, or set it to `false`. To just remove it, apply:_ + +```sh +kubectl -n system-upgrade label node <node-name> k3s_upgrade- +``` + ## Takedown If you want to takedown the cluster, you can proceed as follows: diff --git a/agents.tf b/agents.tf index 9b107d3..9faed3b 100644 --- a/agents.tf +++ b/agents.tf @@ -31,17 +31,21 @@ resource "hcloud_server" "agents" { destination = "/root/config.ign" } + # Combustion script file to install k3s-selinux + provisioner "file" { + content = local.combustion_script + destination = "/root/script" + } + # Install MicroOS provisioner "remote-exec" { - inline = local.MicroOS_install_commands + inline = local.microOS_install_commands } - # Issue a reboot command + # Issue a reboot command and wait for the node to reboot provisioner "local-exec" { - command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 3" + command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 5" } - - # Wait for MicroOS to reboot and be ready provisioner "local-exec" { command = <<-EOT until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null @@ -52,50 +56,42 @@ resource "hcloud_server" "agents" { EOT } - - # Generating and uploading the agent.conf file - provisioner "file" { - content = templatefile("${path.module}/templates/agent.conf.tpl", { - server = "https://${local.first_control_plane_network_ip}:6443" - token = random_password.k3s_token.result - }) - 
destination = "/etc/rancher/k3s/agent.conf" - } - # Generating k3s agent config file provisioner "file" { content = yamlencode({ node-name = self.name + server = "https://${local.first_control_plane_network_ip}:6443" + token = random_password.k3s_token.result kubelet-arg = "cloud-provider=external" flannel-iface = "eth1" - node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 257 + count.index) + node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 513 + count.index) + node-label = var.automatically_upgrade_k3s ? ["k3s_upgrade=true"] : [] }) - destination = "/etc/rancher/k3s/config.yaml" + destination = "/tmp/config.yaml" } - # Run the agent + # Install k3s agent + provisioner "remote-exec" { + inline = local.install_k3s_agent + } + + # Upon reboot verify that k3s agent starts correctly provisioner "remote-exec" { inline = [ - # set the hostname in a persistent fashion - "hostnamectl set-hostname ${self.name}", - # first we disable automatic reboot (after transactional updates), and configure the reboot method as kured - "rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf", - # then we start k3s agent and join the cluster - "systemctl enable k3s-agent", <<-EOT - until systemctl status k3s-agent > /dev/null - do - systemctl start k3s-agent - echo "Starting k3s-agent and joining the cluster..." + timeout 120 bash < /dev/null; do + echo "Waiting for the k3s agent to start..." 
sleep 2 done + EOF EOT ] } network { network_id = hcloud_network.k3s.id - ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 257 + count.index) + ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 513 + count.index) } depends_on = [ diff --git a/locals.tf b/locals.tf index 4cb851a..6c5a399 100644 --- a/locals.tf +++ b/locals.tf @@ -1,5 +1,5 @@ locals { - first_control_plane_network_ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 2) + first_control_plane_network_ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 257) hcloud_image_name = "ubuntu-20.04" ssh_public_key = trimspace(file(var.public_key)) @@ -18,11 +18,12 @@ locals { csi_version = var.hetzner_csi_version != null ? var.hetzner_csi_version : data.github_release.hetzner_csi.release_tag kured_version = data.github_release.kured.release_tag - MicroOS_install_commands = [ + microOS_install_commands = [ "set -ex", + "apt-get update", "apt-get install -y aria2", - "aria2c --follow-metalink=mem https://download.opensuse.org/tumbleweed/appliances/openSUSE-MicroOS.x86_64-k3s-kvm-and-xen.qcow2.meta4", - "qemu-img convert -p -f qcow2 -O host_device $(ls -a | grep -ie '^opensuse.*microos.*k3s.*qcow2$') /dev/sda", + "aria2c --follow-metalink=mem https://download.opensuse.org/tumbleweed/appliances/openSUSE-MicroOS.x86_64-kvm-and-xen.qcow2.meta4", + "qemu-img convert -p -f qcow2 -O host_device $(ls -a | grep -ie '^opensuse.*microos.*qcow2$') /dev/sda", "sgdisk -e /dev/sda", "parted -s /dev/sda resizepart 4 99%", "parted -s /dev/sda mkpart primary ext2 99% 100%", @@ -32,6 +33,31 @@ locals { "mount /dev/sda5 /mnt", "mkdir /mnt/ignition", "cp /root/config.ign /mnt/ignition/config.ign", + "mkdir /mnt/combustion", + "cp /root/script /mnt/combustion/script", "umount /mnt" ] + + combustion_script = < /etc/transactional-update.conf", + # prepare the k3s config directory + "mkdir -p /etc/rancher/k3s", + # move the config file into place + "mv /tmp/config.yaml /etc/rancher/k3s/config.yaml" + ] + + install_k3s_server = 
concat(local.common_commands_install_k3s, ["curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_SELINUX_RPM=true INSTALL_K3S_SKIP_START=true INSTALL_K3S_EXEC=server sh -"]) + + install_k3s_agent = concat(local.common_commands_install_k3s, ["curl -sfL https://get.k3s.io | INSTALL_K3S_SKIP_SELINUX_RPM=true INSTALL_K3S_EXEC=agent sh -"]) } diff --git a/master.tf b/master.tf index 48a1d4c..a0f695b 100644 --- a/master.tf +++ b/master.tf @@ -29,17 +29,21 @@ resource "hcloud_server" "first_control_plane" { destination = "/root/config.ign" } + # Combustion script file to install k3s-selinux + provisioner "file" { + content = local.combustion_script + destination = "/root/script" + } + # Install MicroOS provisioner "remote-exec" { - inline = local.MicroOS_install_commands + inline = local.microOS_install_commands } - # Issue a reboot command + # Issue a reboot command and wait for the node to reboot provisioner "local-exec" { - command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 3" + command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 5" } - - # Wait for MicroOS to reboot and be ready provisioner "local-exec" { command = <<-EOT until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null @@ -63,36 +67,36 @@ resource "hcloud_server" "first_control_plane" { advertise-address = local.first_control_plane_network_ip token = random_password.k3s_token.result node-taint = var.allow_scheduling_on_control_plane ? [] : ["node-role.kubernetes.io/master:NoSchedule"] + node-label = var.automatically_upgrade_k3s ? 
["k3s_upgrade=true"] : [] }) - destination = "/etc/rancher/k3s/config.yaml" + destination = "/tmp/config.yaml" } - # Run the first control plane + # Install k3s server + provisioner "remote-exec" { + inline = local.install_k3s_server + } + + # Upon reboot verify that the k3s server is starts, and wait for k3s to be ready to receive commands provisioner "remote-exec" { inline = [ - # set the hostname in a persistent fashion - "hostnamectl set-hostname ${self.name}", - # first we disable automatic reboot (after transactional updates), and configure the reboot method as kured - "rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf", - # prepare a directory for our post-installation kustomizations + "systemctl start k3s", + # prepare the post_install directory "mkdir -p /tmp/post_install", - # then we initiate the cluster - "systemctl enable k3s-server", - # wait for k3s to get ready + # wait for k3s to become ready <<-EOT timeout 120 bash < /dev/null; do - systemctl start k3s-server - echo "Initiating the cluster..." - sleep 1 + until systemctl status k3s > /dev/null; do + echo "Waiting for the k3s server to start..." + sleep 2 done until [ -e /etc/rancher/k3s/k3s.yaml ]; do echo "Waiting for kubectl config..." - sleep 1 + sleep 2 done until [[ "\$(kubectl get --raw='/readyz' 2> /dev/null)" == "ok" ]]; do echo "Waiting for the cluster to become ready..." 
- sleep 1 + sleep 2 done EOF EOT @@ -108,11 +112,13 @@ resource "hcloud_server" "first_control_plane" { "https://github.com/hetznercloud/hcloud-cloud-controller-manager/releases/download/${local.ccm_version}/ccm-networks.yaml", "https://raw.githubusercontent.com/hetznercloud/csi-driver/${local.csi_version}/deploy/kubernetes/hcloud-csi.yml", "https://github.com/weaveworks/kured/releases/download/${local.kured_version}/kured-${local.kured_version}-dockerhub.yaml", - "./traefik.yaml" + "https://raw.githubusercontent.com/rancher/system-upgrade-controller/master/manifests/system-upgrade-controller.yaml", + "traefik.yaml", ] patchesStrategicMerge = [ file("${path.module}/patches/kured.yaml"), - file("${path.module}/patches/ccm.yaml") + file("${path.module}/patches/ccm.yaml"), + file("${path.module}/patches/system-upgrade-controller.yaml") ] }) destination = "/tmp/post_install/kustomization.yaml" @@ -132,9 +138,20 @@ resource "hcloud_server" "first_control_plane" { destination = "/tmp/post_install/traefik.yaml" } + # Upload the system upgrade controller plans config + provisioner "file" { + content = templatefile( + "${path.module}/templates/plans.yaml.tpl", + { + channel = var.k3s_upgrade_channel + }) + destination = "/tmp/post_install/plans.yaml" + } + # Deploy secrets, logging is automatically disabled due to sensitive variables provisioner "remote-exec" { inline = [ + "set -ex", "kubectl -n kube-system create secret generic hcloud --from-literal=token=${var.hcloud_token} --from-literal=network=${hcloud_network.k3s.name}", "kubectl -n kube-system create secret generic hcloud-csi --from-literal=token=${var.hcloud_token}", ] @@ -143,6 +160,7 @@ resource "hcloud_server" "first_control_plane" { # Deploy our post-installation kustomization provisioner "remote-exec" { inline = [ + "set -ex", # This ugly hack is here, because terraform serializes the # embedded yaml files with "- |2", when there is more than # one yamldocument in the embedded file. 
Kustomize does not understand @@ -153,6 +171,9 @@ resource "hcloud_server" "first_control_plane" { # manifests themselves "sed -i 's/^- |[0-9]\\+$/- |/g' /tmp/post_install/kustomization.yaml", "kubectl apply -k /tmp/post_install", + "echo 'Waiting for the system-upgrade-controller deployment to become available...'", + "kubectl -n system-upgrade wait --for=condition=available --timeout=120s deployment/system-upgrade-controller", + "kubectl -n system-upgrade apply -f /tmp/post_install/plans.yaml" ] } diff --git a/patches/system-upgrade-controller.yaml b/patches/system-upgrade-controller.yaml new file mode 100644 index 0000000..fc904de --- /dev/null +++ b/patches/system-upgrade-controller.yaml @@ -0,0 +1,18 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: system-upgrade-controller + namespace: system-upgrade +spec: + template: + spec: + containers: + - name: system-upgrade-controller + volumeMounts: + - name: ca-certificates + mountPath: /var/lib/ca-certificates + volumes: + - name: ca-certificates + hostPath: + path: /var/lib/ca-certificates + type: Directory diff --git a/servers.tf b/servers.tf index 5f1d82c..99a076d 100644 --- a/servers.tf +++ b/servers.tf @@ -30,17 +30,21 @@ resource "hcloud_server" "control_planes" { destination = "/root/config.ign" } + # Combustion script file to install k3s-selinux + provisioner "file" { + content = local.combustion_script + destination = "/root/script" + } + # Install MicroOS provisioner "remote-exec" { - inline = local.MicroOS_install_commands + inline = local.microOS_install_commands } - # Issue a reboot command + # Issue a reboot command and wait for the node to reboot provisioner "local-exec" { - command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 3" + command = "ssh ${local.ssh_args} root@${self.ipv4_address} '(sleep 2; reboot)&'; sleep 5" } - - # Wait for MicroOS to reboot and be ready provisioner "local-exec" { command = <<-EOT until ssh ${local.ssh_args} -o 
ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null @@ -56,43 +60,44 @@ resource "hcloud_server" "control_planes" { content = yamlencode({ node-name = self.name server = "https://${local.first_control_plane_network_ip}:6443" + token = random_password.k3s_token.result cluster-init = true disable-cloud-controller = true disable = "servicelb, local-storage" flannel-iface = "eth1" kubelet-arg = "cloud-provider=external" - node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index) - advertise-address = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index) - tls-san = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index) - token = random_password.k3s_token.result + node-ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index) + advertise-address = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index) + tls-san = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index) node-taint = var.allow_scheduling_on_control_plane ? [] : ["node-role.kubernetes.io/master:NoSchedule"] + node-label = var.automatically_upgrade_k3s ? 
["k3s_upgrade=true"] : [] }) - destination = "/etc/rancher/k3s/config.yaml" + destination = "/tmp/config.yaml" } - # Run an other control plane server + # Install k3s server + provisioner "remote-exec" { + inline = local.install_k3s_server + } + + # Upon reboot verify that the k3s server starts correctly provisioner "remote-exec" { inline = [ - # set the hostname in a persistent fashion - "hostnamectl set-hostname ${self.name}", - # first we disable automatic reboot (after transactional updates), and configure the reboot method as kured - "rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf", - # then then we start k3s in server mode and join the cluster - "systemctl enable k3s-server", + "systemctl start k3s", <<-EOT - until systemctl status k3s-server > /dev/null - do - systemctl start k3s-server - echo "Waiting on other 'learning' control planes, patience is the mother of all virtues..." + timeout 120 bash < /dev/null; do + echo "Waiting for the k3s server to start..." 
sleep 2 done + EOF EOT ] } network { network_id = hcloud_network.k3s.id - ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 3 + count.index) + ip = cidrhost(hcloud_network_subnet.k3s.ip_range, 258 + count.index) } depends_on = [ diff --git a/templates/agent.conf.tpl b/templates/agent.conf.tpl deleted file mode 100644 index fad0449..0000000 --- a/templates/agent.conf.tpl +++ /dev/null @@ -1,3 +0,0 @@ -SERVER_URL="${server}" -NODE_TOKEN="${token}" -AGENT_OPTS="" diff --git a/templates/plans.yaml.tpl b/templates/plans.yaml.tpl new file mode 100644 index 0000000..337aa59 --- /dev/null +++ b/templates/plans.yaml.tpl @@ -0,0 +1,50 @@ +# Doc: https://rancher.com/docs/k3s/latest/en/upgrades/automated/ +# agent plan +apiVersion: upgrade.cattle.io/v1 +kind: Plan +metadata: + name: k3s-agent + namespace: system-upgrade + labels: + k3s_upgrade: agent +spec: + concurrency: 1 + channel: https://update.k3s.io/v1-release/channels/${channel} + nodeSelector: + matchExpressions: + - {key: k3s_upgrade, operator: Exists} + - {key: k3s_upgrade, operator: NotIn, values: ["disabled", "false"]} + - {key: node-role.kubernetes.io/master, operator: NotIn, values: ["true"]} + serviceAccountName: system-upgrade + prepare: + image: rancher/k3s-upgrade + args: ["prepare", "k3s-server"] + drain: + force: true + skipWaitForDeleteTimeout: 60 + upgrade: + image: rancher/k3s-upgrade +--- +# server plan +apiVersion: upgrade.cattle.io/v1 +kind: Plan +metadata: + name: k3s-server + namespace: system-upgrade + labels: + k3s_upgrade: server +spec: + concurrency: 1 + channel: https://update.k3s.io/v1-release/channels/${channel} + nodeSelector: + matchExpressions: + - {key: k3s_upgrade, operator: Exists} + - {key: k3s_upgrade, operator: NotIn, values: ["disabled", "false"]} + - {key: node-role.kubernetes.io/master, operator: In, values: ["true"]} + tolerations: + - {key: node-role.kubernetes.io/master, effect: NoSchedule, operator: Exists} + - {key: CriticalAddonsOnly, effect: NoExecute, operator: Exists} + 
serviceAccountName: system-upgrade + cordon: true + upgrade: + image: rancher/k3s-upgrade \ No newline at end of file diff --git a/templates/traefik_config.yaml.tpl b/templates/traefik_config.yaml.tpl index 62e9c26..f8156f8 100644 --- a/templates/traefik_config.yaml.tpl +++ b/templates/traefik_config.yaml.tpl @@ -10,9 +10,9 @@ spec: type: LoadBalancer annotations: "load-balancer.hetzner.cloud/name": "traefik" - # make hetzners load-balancer connect to our nodes via our private k3s-net. + # make Hetzner's load-balancer connect to our nodes via our private k3s network "load-balancer.hetzner.cloud/use-private-ip": "true" - # keep hetzner-ccm from exposing our private ingress ip, which in general isn't routeable from the public internet. + # keep hetzner-ccm from exposing our private ingress ip, which in general isn't routable from the public internet "load-balancer.hetzner.cloud/disable-private-ingress": "true" # disable ipv6 by default, because external-dns doesn't support AAAA for hcloud yet https://github.com/kubernetes-sigs/external-dns/issues/2044 "load-balancer.hetzner.cloud/ipv6-disabled": "${lb_disable_ipv6}" diff --git a/terraform.tfvars.example b/terraform.tfvars.example index d0bd3a1..323955c 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -29,3 +29,10 @@ agents_num = 2 # If you want to allow non-control-plane workloads to run on the control-plane nodes set "true" below. The default is "false". # allow_scheduling_on_control_plane = true + +# If you want to disable automatic upgrade of k3s (stable channel), you can set this to false, default is "true". +# automatically_upgrade_k3s = false + +# If you would like to specify the k3s upgrade channel from the get go, you can do so, the default is "stable". 
+# For a list of available channels, see https://rancher.com/docs/k3s/latest/en/upgrades/basic/ and https://update.k3s.io/v1-release/channels +# k3s_upgrade_channel = "latest" diff --git a/variables.tf b/variables.tf index e8734df..e23c2bc 100644 --- a/variables.tf +++ b/variables.tf @@ -84,3 +84,15 @@ variable "allow_scheduling_on_control_plane" { default = false description = "Whether to allow non-control-plane workloads to run on the control-plane nodes" } + +variable "k3s_upgrade_channel" { + type = string + default = "stable" + description = "Allows you to specify the k3s upgrade channel" +} + +variable "automatically_upgrade_k3s" { + type = bool + default = true + description = "Whether to automatically upgrade k3s based on the selected channel" +}