pre master

2022-02-10 03:01:40 +01:00 · 2022-02-10 03:01:40 +01:00 · cd6b5e2768
commit cd6b5e2768
parent 036404c983
8 changed files with 2542 additions and 26 deletions
--- a/.files/openSUSE-MicroOS.x86_64-k3s-kvm-and-xen.qcow2.meta4
+++ b/.files/openSUSE-MicroOS.x86_64-k3s-kvm-and-xen.qcow2.meta4
--- a/README.md
+++ b/README.md
@ -33,7 +33,7 @@ _Please note that we are not affiliated to Hetzner, this is just an open source

 - Maintenance free with auto-upgrade to the latest version of MicroOS and k3s.
 - Proper use of the underlying Hetzner private network to remove the need for encryption and make the cluster both fast and secure.
- Automatic HA with the default setting of two control-plane and agents nodes.
+- Automatic HA with the default setting of three control-plane nodes and two agent nodes.
 - Ability to add or remove as many nodes as you want while the cluster stays running.
 - Automatic Traefik ingress controller attached to a Hetzner load balancer with proxy protocol turned on.
 - (Optional) Out of the box config of Traefik with SSL certificate auto-generation.
@ -93,11 +93,15 @@ When the cluster is up and running, you can do whatever you wish with it! 🎉

 You can scale the number of nodes up and down without any issues. If you are going to scale down, just make sure to properly `kubectl drain` the nodes in question first. Then just edit these variables in `terraform.tfvars` and re-apply terraform with `terraform apply -auto-approve`.

+**If you want to remain HA, it's important to keep at least 3 control-plane nodes, see [Rancher's doc on HA](https://rancher.com/docs/k3s/latest/en/installation/ha-embedded/).**
+
+Otherwise (with 2 or fewer control-plane nodes), you may want to turn off automated updates and reboots of the control-plane nodes, and perform this maintenance manually.
+
 For instance:

 ```tfvars
-servers_num = 2
-agents_num = 3
+servers_num = 3
+agents_num = 2
 ```

 ### Useful commands
@ -143,6 +147,7 @@ If you want to takedown the cluster, you can proceed as follows:
 kubectl delete -k hetzner/csi
 kubectl delete -k hetzner/ccm
 hcloud load-balancer delete traefik
+hcloud network delete k3s
 terraform destroy -auto-approve
 ```

--- a/agents.tf
+++ b/agents.tf
@ -8,7 +8,7 @@ resource "hcloud_server" "agents" {
  location           = var.location
  ssh_keys           = [hcloud_ssh_key.k3s.id]
  firewall_ids       = [hcloud_firewall.k3s.id]
-  placement_group_id = hcloud_placement_group.k3s_placement_group.id
+  placement_group_id = hcloud_placement_group.k3s.id


  labels = {
@ -53,7 +53,7 @@ resource "hcloud_server" "agents" {
    command = <<-EOT
      until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null
      do
-        echo Waiting for ssh to be ready...
+        echo "Waiting for ssh to be ready..."
        sleep 2
      done
    EOT
@ -98,10 +98,20 @@ resource "hcloud_server" "agents" {
  provisioner "remote-exec" {
    inline = [
      "set -ex",
+      # set the hostname in a persistent fashion
+      "hostnamectl set-hostname ${self.name}",
      # first we disable automatic reboot (after transactional updates), and configure the reboot method as kured
      "rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf",
-      # then turn on k3s and join the cluster
-      "systemctl --now enable k3s-agent",
+      # then we start the k3s agent and join the cluster; agent nodes run the
+      # `k3s-agent` unit (not `k3s-server`), and the start is repeated until
+      # `systemctl status` succeeds for that unit
+      "systemctl enable k3s-agent",
+      <<-EOT
+        until systemctl status k3s-agent > /dev/null
+        do
+          systemctl start k3s-agent
+          echo "Starting k3s-agent and joining the cluster..."
+          sleep 2
+        done
+      EOT
    ]

    connection {
--- a/main.tf
+++ b/main.tf
@ -187,8 +187,8 @@ resource "local_file" "traefik_config" {
 }


-resource "hcloud_placement_group" "k3s_placement_group" {
-  name = "k3s-placement-group"
+resource "hcloud_placement_group" "k3s" {
+  name = "k3s"
  type = "spread"
  labels = {
    "provisioner" = "terraform",
--- a/master.tf
+++ b/master.tf
@ -7,7 +7,7 @@ resource "hcloud_server" "first_control_plane" {
  location           = var.location
  ssh_keys           = [hcloud_ssh_key.k3s.id]
  firewall_ids       = [hcloud_firewall.k3s.id]
-  placement_group_id = hcloud_placement_group.k3s_placement_group.id
+  placement_group_id = hcloud_placement_group.k3s.id

  labels = {
    "provisioner" = "terraform",
@ -51,7 +51,7 @@ resource "hcloud_server" "first_control_plane" {
    command = <<-EOT
      until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null
      do
-        echo Waiting for ssh to be ready...
+        echo "Waiting for ssh to be ready..."
        sleep 2
      done
    EOT
@ -84,10 +84,20 @@ resource "hcloud_server" "first_control_plane" {
  provisioner "remote-exec" {
    inline = [
      "set -ex",
+      # set the hostname in a persistent fashion
+      "hostnamectl set-hostname ${self.name}",
      # first we disable automatic reboot (after transactional updates), and configure the reboot method as kured
      "rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf",
      # then we initiate the cluster
-      "systemctl --now enable k3s-server",
+      "systemctl enable k3s-server",
+      <<-EOT
+        until systemctl status k3s-server > /dev/null
+        do
+          systemctl start k3s-server
+          echo "Initiating the cluster..."
+          sleep 2
+        done
+      EOT
    ]

    connection {
@ -102,10 +112,18 @@ resource "hcloud_server" "first_control_plane" {
  provisioner "local-exec" {
    command = <<-EOT
      set -ex
-      sleep 30
+      until ssh -q ${local.ssh_args} root@${self.ipv4_address} [[ -f /etc/rancher/k3s/k3s.yaml ]]
+      do
+        echo "Waiting for the k3s config file to be ready..."
+        sleep 2
+      done
      scp ${local.ssh_args} root@${self.ipv4_address}:/etc/rancher/k3s/k3s.yaml ${path.module}/kubeconfig.yaml
      sed -i -e 's/127.0.0.1/${self.ipv4_address}/g' ${path.module}/kubeconfig.yaml
-      sleep 10 && until kubectl get node ${self.name} --kubeconfig ${path.module}/kubeconfig.yaml; do sleep 5; done
+      until kubectl get node ${self.name} --kubeconfig ${path.module}/kubeconfig.yaml 2> /dev/null || false
+      do 
+        echo "Waiting for the node to become available...";
+        sleep 2
+      done
    EOT
  }

--- a/servers.tf
+++ b/servers.tf
@ -8,7 +8,7 @@ resource "hcloud_server" "control_planes" {
  location           = var.location
  ssh_keys           = [hcloud_ssh_key.k3s.id]
  firewall_ids       = [hcloud_firewall.k3s.id]
-  placement_group_id = hcloud_placement_group.k3s_placement_group.id
+  placement_group_id = hcloud_placement_group.k3s.id

  labels = {
    "provisioner" = "terraform",
@ -52,7 +52,7 @@ resource "hcloud_server" "control_planes" {
    command = <<-EOT
      until ssh ${local.ssh_args} -o ConnectTimeout=2 root@${self.ipv4_address} true 2> /dev/null
      do
-        echo Waiting for ssh to be ready...
+        echo "Waiting for ssh to be ready..."
        sleep 2
      done
    EOT
@ -84,14 +84,24 @@ resource "hcloud_server" "control_planes" {
    }
  }

-  # Run the other control plane
+  # Run another control plane server
  provisioner "remote-exec" {
    inline = [
      "set -ex",
+      # set the hostname in a persistent fashion
+      "hostnamectl set-hostname ${self.name}",
      # first we disable automatic reboot (after transactional updates), and configure the reboot method as kured
      "rebootmgrctl set-strategy off && echo 'REBOOT_METHOD=kured' > /etc/transactional-update.conf",
-      # then we initiate the cluster
-      "systemctl --now enable k3s-server",
+      # then we start k3s in server mode and join the cluster
+      "systemctl enable k3s-server",
+      <<-EOT
+        until systemctl status k3s-server > /dev/null
+        do
+          systemctl start k3s-server
+          echo "Waiting on other 'learning' control planes, patience is the mother of virtues..."
+          sleep 2
+        done
+      EOT
    ]

    connection {
--- a/templates/config.ign.tpl
+++ b/templates/config.ign.tpl
@ -14,17 +14,17 @@
  },
  "storage": {
    "files": [
-      {
-        "path": "/etc/hostname",
-        "mode": 420,
-        "overwrite": true,
-        "contents": { "source": "data:,${name}" }
-      },
      {
        "path": "/etc/sysconfig/network/ifcfg-eth1",
        "mode": 420,
        "overwrite": true,
        "contents": { "source": "data:,BOOTPROTO%3D%27dhcp%27%0ASTARTMODE%3D%27auto%27" }
+      },
+      {
+        "path": "/etc/ssh/sshd_config.d/kube-hetzner.conf",
+        "mode": 420,
+        "overwrite": true,
+        "contents": { "source": "data:,PasswordAuthentication%20no%0AX11Forwarding%20no%0AMaxAuthTries%202%0AAllowTcpForwarding%20no%0AAllowAgentForwarding%20no%0AAuthorizedKeysFile%20.ssh%2Fauthorized_keys" }
      }
    ]
  }
--- a/terraform.tfvars.example
+++ b/terraform.tfvars.example
@ -12,7 +12,11 @@ network_region            = "eu-central" # change to `us-east` if location is as
 agent_server_type         = "cpx21"
 control_plane_server_type = "cpx11"
 lb_server_type            = "lb11"
-servers_num               = 2
+
+# At least 3 server nodes are recommended for HA, otherwise you need to turn off automatic upgrades (see README).
+servers_num               = 3
+
+# For agent nodes, at least 2 are recommended for HA, but you can keep automatic upgrades.
 agents_num                = 2

 # If you want to use a specific Hetzner CCM and CSI version, set them below, otherwise leave as is for the latest versions