From 8de7171cf50b4fa3ab8ee4598dc702f59af45a89 Mon Sep 17 00:00:00 2001 From: Karim Naufal Date: Sun, 5 Dec 2021 10:50:51 +0100 Subject: [PATCH] Switched to k3os and removed cilium --- README.md | 107 ++++++----------------- manifests/helm/cilium/values.yaml | 117 ------------------------- manifests/upgrade/kured.yaml | 139 ------------------------------ master.tf | 5 +- templates/agent.tpl | 2 + templates/master.tpl | 3 + templates/server.tpl | 3 + 7 files changed, 35 insertions(+), 341 deletions(-) delete mode 100644 manifests/helm/cilium/values.yaml delete mode 100644 manifests/upgrade/kured.yaml diff --git a/README.md b/README.md index 1f0c8e6..d0be4be 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@

Kube-Hetzner

- A fully automated, optimized and auto-upgradable, HA-able, k3s cluster on Hetzner Cloud 🤑 + A fully automated, highly optimized and auto-upgradable, HA-able, Kubernetes - k3s on k3os - cluster on Hetzner Cloud 🥳



@@ -25,35 +25,31 @@ ![Product Name Screen Shot][product-screenshot] -[Hetzner Cloud](https://hetzner.com) is a good cloud provider that offers very affordable prices for cloud instances. The goal of this project was to create an optimal Kubernetes installation with it. We wanted functionality that was as close as possible to GKE's auto-pilot. +[Hetzner Cloud](https://hetzner.com) is a good cloud provider that offers very affordable prices for cloud instances. The goal of this project was to create an optimal and highly optimized Kubernetes installation, that is easily maintained, secure, and automatically upgrades itself. We aimed for functionality that was as close as possible to GKE's auto-pilot. Here's what is working at the moment: -- Lightweight and resource-efficient Kubernetes with [k3s](https://github.com/k3s-io/k3s), and Fedora nodes to take advantage of the latest Linux kernels. -- Optimal [Cilium](https://github.com/cilium/cilium) CNI with full BPF support, and Kube-proxy replacement. It uses the Hetzner private subnet underneath to communicate between the nodes, as for the tunneling we use Geneve by default, but native routing also works. -- Automatic OS upgrades, supported by [kured](https://github.com/weaveworks/kured) that initiate a reboot of the node only when necessary and after having drained it properly. +- Lightweight and resource-efficient Kubernetes with [k3s](https://github.com/k3s-io/k3s). +- Powered by k3OS nodes to take advantage of an auto-upgradable and hardened OS, especially designed to run k3s. That means that both the OS and your kube cluster will stay current and up-to-date. - Automatic HA by setting the required number of servers and agents nodes. -- Automatic k3s upgrade by using Rancher's [system-upgrade-controller](https://github.com/rancher/system-upgrade-controller) and tracking the latest 1.x stable branch. 
- Optional [Nginx ingress controller](https://kubernetes.github.io/ingress-nginx/) that will automatically use Hetzner's private network to allocate a Hetzner load balancer. -It uses Terraform to deploy as it's easy to use, and Hetzner provides a great [Hetzner Terraform Provider](https://registry.terraform.io/providers/hetznercloud/hcloud/latest/docs). +_It uses Terraform to deploy as it's easy to use, and Hetzner provides a great [Hetzner Terraform Provider](https://registry.terraform.io/providers/hetznercloud/hcloud/latest/docs)._ ## Getting started -Follow those simple steps and your world cheapest Kube cluster will be up and running in no time. +Follow those simple steps and the world's cheapest and coolest Kube cluster will be up and running in no time. ### Prerequisites First and foremost, you need to have a Hetzner Cloud account. You can sign up for free [here](https://hetzner.com/cloud/). -Then you'll need you have both the [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli) and [helm](https://helm.sh/docs/intro/install/), and [kubectl](https://kubernetes.io/docs/tasks/tools/) cli installed. The easiest way is to use the [gofish](https://gofi.sh/#install) package manager to install them. +Then you'll need to have the [terraform](https://learn.hashicorp.com/tutorials/terraform/install-cli), [helm](https://helm.sh/docs/intro/install/), and [kubectl](https://kubernetes.io/docs/tasks/tools/) cli installed. The easiest way is to use the [gofish](https://gofi.sh/#install) package manager to install them. ```sh -gofish install terraform -gofish install kubectl -gofish install helm +gofish install terraform && gofish install kubectl && gofish install helm ``` ### Creating terraform.tfvars @@ -64,7 +60,7 @@ gofish install helm ### Customize other variables (Optional) -The number of control plane nodes and worker nodes, and the Hetzner datacenter location, can be customized by adding the variables to your newly created terraform.tfvars file. 
+The number of control plane nodes and worker nodes, the [Hetzner datacenter location](https://docs.hetzner.com/general/others/data-centers-and-connection/) (i.e. nbg1, fsn1, hel1 ...etc.), and the [Hetzner server types](https://www.hetzner.com/cloud) (i.e. cpx31, cpx41 ...etc.) can be customized by adding the corresponding variables to your newly created terraform.tfvars file. See the default values in the [variables.tf](variables.tf) file, they correspond to (you can copy-paste and customize): @@ -72,8 +68,8 @@ servers_num = 2 agents_num = 2 location = "fsn1" -agent_server_type = "cx21" -control_plane_server_type = "cx11" +agent_server_type = "cpx21" +control_plane_server_type = "cpx11" ``` ### Installation @@ -100,7 +96,7 @@ To have a complete and useful setup, it is ideal to have an ingress controller r ```sh helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx helm repo update -helm install --values=manifests/helm/nginx/values.yaml ingress-nginx ingress-nginx/ingress-nginx -n kube-system +helm install --values=manifests/helm/nginx/values.yaml ingress-nginx ingress-nginx/ingress-nginx -n kube-system --kubeconfig kubeconfig.yaml ``` _Note that the default geographic location and instance type of the load balancer can be changed by editing the [values.yaml](manifests/helm/nginx/values.yaml) file._ @@ -129,83 +125,33 @@ hcloud network describe k3s-net - Log into one of your nodes (replace the location of your private key if needed): ```sh -ssh root@xxx.xxx.xxx.xxx -i ~/.ssh/id_ed25519 -o StrictHostKeyChecking=no +ssh rancher@xxx.xxx.xxx.xxx -i ~/.ssh/id_ed25519 -o StrictHostKeyChecking=no ``` -### Cilium commands - -- Check the status of cilium with the following commands (get the cilium pod name first and replace it in the command): - -```sh -kubectl -n kube-system exec --stdin --tty cilium-xxxx -- cilium status -kubectl -n kube-system exec --stdin --tty 
cilium-xxxx -- cilium status --verbose ``` -- Monitor cluster traffic with: - -```sh -kubectl -n kube-system exec --stdin --tty cilium-xxxx -- cilium monitor -``` - -- See the list of kube services with: - -```sh -kubectl -n kube-system exec --stdin --tty cilium-xxxx -- cilium service list -``` - -_For more cilium commands, please refer to their corresponding [Documentation](https://docs.cilium.io/en/latest/cheatsheet)._ - ### Automatic upgrade -The nodes and k3s versions are configured to self-upgrade unless you turn that feature off. - -- To turn OS upgrade off, log in to each node and issue: +By default, k3os and its embedded k3s instance get upgraded automatically on each node in an HA and non-disruptive way, thanks to its embedded system upgrade controller. If you wish to turn that feature off, please remove the following label `k3os.io/upgrade=latest` with the following command: ```sh -systemctl disable --now dnf-automatic.timer -``` - -- To turn off k3s upgrade, use kubectl to set the k3s_upgrade label to false for each node (replace the node-name in the command): - -```sh -kubectl label node node-name k3s_upgrade=false +kubectl label node --all 'k3os.io/upgrade'- --kubeconfig kubeconfig.yaml ``` ### Individual components upgrade To upgrade individual components, you can use the following commands: -- Hetzner CCM +- Hetzner CCM and CSI ```sh -kubectl apply -f https://raw.githubusercontent.com/mysticaltech/kube-hetzner/master/manifests/hcloud-ccm-net.yaml +kubectl apply -f https://raw.githubusercontent.com/mysticaltech/kube-hetzner/master/manifests/hcloud-ccm-net.yaml --kubeconfig kubeconfig.yaml +kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/master/deploy/kubernetes/hcloud-csi.yml --kubeconfig kubeconfig.yaml ``` -- Hetzner CSI - -```sh -kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/master/deploy/kubernetes/hcloud-csi.yml -``` - -- Rancher's system upgrade controller - -```sh -kubectl apply -f 
https://raw.githubusercontent.com/rancher/system-upgrade-controller/master/manifests/system-upgrade-controller.yaml -``` - -- Kured (used to reboot the nodes after upgrading and draining them) - -```sh -latest=$(curl -s https://api.github.com/repos/weaveworks/kured/releases | jq -r '.[0].tag_name') -kubectl apply -f https://github.com/weaveworks/kured/releases/download/$latest/kured-$latest-dockerhub.yaml -``` - -- Cilium and the Nginx ingress controller +- (Optional, if installed) Nginx ingress controller ```sh helm repo update -helm upgrade --values=manifests/helm/cilium/values.yaml cilium cilium/cilium -n kube-system -helm upgrade --values=manifests/helm/nginx/values.yaml ingress-nginx ingress-nginx/ingress-nginx -n kube-system +helm upgrade --values=manifests/helm/nginx/values.yaml ingress-nginx ingress-nginx/ingress-nginx -n kube-system --kubeconfig kubeconfig.yaml ``` ## Takedown @@ -213,21 +159,17 @@ helm upgrade --values=manifests/helm/nginx/values.yaml ingress-nginx ingress-ngi If you chose to install the Nginx ingress controller, you need to delete it first to release the load balancer, as follows: ```sh -helm delete ingress-nginx -n kube-system +helm delete ingress-nginx -n kube-system --kubeconfig kubeconfig.yaml ``` Then you can proceed to taking down the rest of the cluster with: ```sh +kubectl delete -f https://raw.githubusercontent.com/mysticaltech/kube-hetzner/master/manifests/hcloud-ccm-net.yaml --kubeconfig kubeconfig.yaml +kubectl delete -f https://raw.githubusercontent.com/hetznercloud/csi-driver/master/deploy/kubernetes/hcloud-csi.yml --kubeconfig kubeconfig.yaml terraform destroy -auto-approve ``` -Sometimes, the Hetzner network is still in use and refused to be deleted via terraform, in that case you can force delete it with: - -```sh -hcloud network delete k3s-net -``` - Also, if you had a full blown cluster in use, it's best do delete the whole project in your Hetzner account directly, as there may be other ressources created via operators 
that are not part of this project. @@ -268,8 +210,7 @@ Project Link: [https://github.com/mysticaltech/kube-hetzner](https://github.com/ - [k-andy](https://github.com/StarpTech/k-andy) was the starting point for this project. It wouldn't have been possible without it. - [Best-README-Template](https://github.com/othneildrew/Best-README-Template) that made writing this readme a lot easier. - - +- [k3os-hetzner](https://github.com/hughobrien/k3os-hetzner) was the inspiration for the k3os installation method. [contributors-shield]: https://img.shields.io/github/contributors/mysticaltech/kube-hetzner.svg?style=for-the-badge [contributors-url]: https://github.com/mysticaltech/kube-hetzner/graphs/contributors diff --git a/manifests/helm/cilium/values.yaml b/manifests/helm/cilium/values.yaml deleted file mode 100644 index 4ca92a7..0000000 --- a/manifests/helm/cilium/values.yaml +++ /dev/null @@ -1,117 +0,0 @@ -upgradeCompatibility: '1.10' - -debug: - # -- Enable debug logging - enabled: false - # verbose: - -# gke: - # enabled: true - -ipam: - # -- Configure IP Address Management mode. - # ref: https://docs.cilium.io/en/stable/concepts/networking/ipam/ - mode: kubernetes - -# -- Configure the encapsulation configuration for communication between nodes. -# Possible values: -# - disabled (native routing works, however I feel that geneve is more stable, but I may be wrong) -# - vxlan -# - geneve -tunnel: geneve - -# -- Specify the IPv4 CIDR for native routing (ie to avoid IP masquerade for). -# This value corresponds to the configured cluster-cidr. -nativeRoutingCIDR: 10.0.0.0/8 - -# When enabled, causes legacy routing -# endpointRoutes: - # -- Enable use of per endpoint routes instead of routing via - # the cilium_host interface. - # enabled: false - -# -- Enable installation of PodCIDR routes between worker -# nodes if worker nodes share a common L2 network segment. -autoDirectNodeRoutes: false - -bpf: - # -- Allow cluster external access to ClusterIP services. 
- lbExternalClusterIP: false - - # -- Enable native IP masquerade support in eBPF - masquerade: true - -endpointHealthChecking: - # -- Enable connectivity health checking between virtual endpoints. - enabled: true - -# -- Configure ClusterIP service handling in the host namespace (the node). -hostServices: - # -- Enable host reachable services. - enabled: true - - # -- Supported list of protocols to apply ClusterIP translation to. - protocols: tcp,udp - -externalIPs: - # -- Enable ExternalIPs service support. - enabled: true - -hostPort: - # -- Enable hostPort service support. - enabled: true - -# -- Configure N-S k8s service loadbalancing -nodePort: - # -- Enable the Cilium NodePort service implementation. - enabled: true - -# -- Enable connectivity health checking. -healthChecking: true - -ipv4: - # -- Enable IPv4 support. - enabled: true - -ipv6: - # -- Enable IPv6 support. - enabled: false - -# -- Configure Kubernetes specific configuration -k8s: - # -- requireIPv4PodCIDR enables waiting for Kubernetes to provide the PodCIDR - # range via the Kubernetes node resource - requireIPv4PodCIDR: true - -# -- Configure the kube-proxy replacement in Cilium BPF datapath -# Valid options are "disabled", "probe", "partial", "strict". -# ref: https://docs.cilium.io/en/stable/gettingstarted/kubeproxy-free/ -kubeProxyReplacement: strict - -# -- Enables masquerading of IPv4 traffic leaving the node from endpoints. -enableIPv4Masquerade: true - -monitor: - # -- Enable the cilium-monitor sidecar. - enabled: false - -# -- Configure service load balancing -loadBalancer: - # -- standalone enables the standalone L4LB which does not connect to - # kube-apiserver. - # standalone: false - - # -- algorithm is the name of the load balancing algorithm for backend - # selection e.g. 
random or maglev - algorithm: maglev - - # dsr mode did probably caused packet drops, so falling back to snat - mode: snat - -# -- The agent can be put into one of the three policy enforcement modes: -# default, always and never. -# ref: https://docs.cilium.io/en/stable/policy/intro/#policy-enforcement-modes -policyEnforcementMode: never - -# -- Enables the enforcement of host policies in the eBPF datapath. -hostFirewall: false \ No newline at end of file diff --git a/manifests/upgrade/kured.yaml b/manifests/upgrade/kured.yaml deleted file mode 100644 index d663835..0000000 --- a/manifests/upgrade/kured.yaml +++ /dev/null @@ -1,139 +0,0 @@ ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: kured -rules: -# Allow kured to read spec.unschedulable -# Allow kubectl to drain/uncordon -# -# NB: These permissions are tightly coupled to the bundled version of kubectl; the ones below -# match https://github.com/kubernetes/kubernetes/blob/v1.19.4/staging/src/k8s.io/kubectl/pkg/cmd/drain/drain.go -# -- apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "patch"] -- apiGroups: [""] - resources: ["pods"] - verbs: ["list","delete","get"] -- apiGroups: ["apps"] - resources: ["daemonsets"] - verbs: ["get"] -- apiGroups: [""] - resources: ["pods/eviction"] - verbs: ["create"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: kured -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kured -subjects: -- kind: ServiceAccount - name: kured - namespace: kube-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: kube-system - name: kured -rules: -# Allow kured to lock/unlock itself -- apiGroups: ["apps"] - resources: ["daemonsets"] - resourceNames: ["kured"] - verbs: ["update"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - namespace: kube-system - name: kured -subjects: -- kind: ServiceAccount - namespace: kube-system 
- name: kured -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: kured ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: kured - namespace: kube-system ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: kured # Must match `--ds-name` - namespace: kube-system # Must match `--ds-namespace` -spec: - selector: - matchLabels: - name: kured - updateStrategy: - type: RollingUpdate - template: - metadata: - labels: - name: kured - spec: - serviceAccountName: kured - tolerations: - - key: node-role.kubernetes.io/master - effect: NoSchedule - hostPID: true # Facilitate entering the host mount namespace via init - restartPolicy: Always - containers: - - name: kured - image: docker.io/weaveworks/kured:1.8.0 - # If you find yourself here wondering why there is no - # :latest tag on Docker Hub,see the FAQ in the README - imagePullPolicy: IfNotPresent - securityContext: - privileged: true # Give permission to nsenter /proc/1/ns/mnt - env: - # Pass in the name of the node on which this pod is scheduled - # for use with drain/uncordon operations and lock acquisition - - name: KURED_NODE_ID - valueFrom: - fieldRef: - fieldPath: spec.nodeName - command: - - /usr/bin/kured - - --reboot-sentinel-command="/usr/bin/needs-restarting -r" -# - --force-reboot=false -# - --drain-grace-period=-1 -# - --skip-wait-for-delete-timeout=0 -# - --drain-timeout=0 -# - --period=1h -# - --ds-namespace=kube-system -# - --ds-name=kured -# - --lock-annotation=weave.works/kured-node-lock -# - --lock-ttl=0 -# - --prometheus-url=http://prometheus.monitoring.svc.cluster.local -# - --alert-filter-regexp=^RebootRequired$ -# - --alert-firing-only=false -# - --reboot-sentinel=/var/run/reboot-required -# - --prefer-no-schedule-taint="" -# - --slack-hook-url=https://hooks.slack.com/... 
-# - --slack-username=prod -# - --slack-channel=alerting -# - --notify-url="" # See also shoutrrr url format -# - --message-template-drain=Draining node %s -# - --message-template-drain=Rebooting node %s -# - --blocking-pod-selector=runtime=long,cost=expensive -# - --blocking-pod-selector=name=temperamental -# - --blocking-pod-selector=... -# - --reboot-days=sun,mon,tue,wed,thu,fri,sat -# - --start-time=0:00 -# - --end-time=23:59:59 -# - --time-zone=UTC -# - --annotate-nodes=false -# - --lock-release-delay=30m diff --git a/master.tf b/master.tf index d6ffade..5d162f5 100644 --- a/master.tf +++ b/master.tf @@ -47,11 +47,12 @@ resource "hcloud_server" "first_control_plane" { EOT } + # Install the Hetzner Cloud cloud controller and cloud storage interface provisioner "local-exec" { command = <<-EOT - kubectl -n kube-system create secret generic hcloud --from-literal=token=${random_password.k3s_token.result} --from-literal=network=${hcloud_network.k3s.name} --kubeconfig ${path.module}/kubeconfig.yaml + kubectl -n kube-system create secret generic hcloud --from-literal=token=${var.hcloud_token} --from-literal=network=${hcloud_network.k3s.name} --kubeconfig ${path.module}/kubeconfig.yaml kubectl apply -f ${path.module}/manifests/hcloud-ccm-net.yaml --kubeconfig ${path.module}/kubeconfig.yaml - kubectl -n kube-system create secret generic hcloud-csi --from-literal=token=${random_password.k3s_token.result} --kubeconfig ${path.module}/kubeconfig.yaml + kubectl -n kube-system create secret generic hcloud-csi --from-literal=token=${var.hcloud_token} --kubeconfig ${path.module}/kubeconfig.yaml kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/master/deploy/kubernetes/hcloud-csi.yml --kubeconfig ${path.module}/kubeconfig.yaml EOT } diff --git a/templates/agent.tpl b/templates/agent.tpl index c9df5d6..5d12029 100644 --- a/templates/agent.tpl +++ b/templates/agent.tpl @@ -22,6 +22,8 @@ k3os: - "--kubelet-arg" - "cloud-provider=external" - 
"--flannel-iface=eth1" + - "--node-label" + - "k3os.io/upgrade=latest" token: ${k3s_token} ntp_servers: - 0.de.pool.ntp.org diff --git a/templates/master.tpl b/templates/master.tpl index 05ac341..5493fa5 100644 --- a/templates/master.tpl +++ b/templates/master.tpl @@ -17,6 +17,7 @@ k3os: - server - "--cluster-init" - "--disable-cloud-controller" + - "--disable-network-policy" - "--disable=traefik" - "--disable=servicelb" - "--disable=local-storage" @@ -29,6 +30,8 @@ k3os: - "${master_ip}" - "--kubelet-arg" - "cloud-provider=external" + - "--node-label" + - "k3os.io/upgrade=latest" token: ${k3s_token} ntp_servers: - 0.de.pool.ntp.org diff --git a/templates/server.tpl b/templates/server.tpl index abd127b..9c11761 100644 --- a/templates/server.tpl +++ b/templates/server.tpl @@ -18,6 +18,7 @@ k3os: - "--server" - "https://${master_ip}:6443" - "--disable-cloud-controller" + - "--disable-network-policy" - "--disable=traefik" - "--disable=servicelb" - "--disable=local-storage" @@ -30,6 +31,8 @@ k3os: - "${node_ip}" - "--kubelet-arg" - "cloud-provider=external" + - "--node-label" + - "k3os.io/upgrade=latest" token: ${k3s_token} ntp_servers: - 0.de.pool.ntp.org