使用 terraform 部署 EKS 节点组时出错

Error deploying EKS node-group with terraform

我在使用 Terraform 部署 EKS 集群中的节点组时遇到问题。错误看起来像是一个插件有问题,但我不知道如何解决它。

如果我在 AWS 控制台(Web)中查看 EC2,可以看到集群的实例已经创建,但集群中仍然出现此错误。

错误显示在我的管道中:

Error: waiting for EKS Node Group (UNIR-API-REST-CLUSTER-DEV:node_sping_boot) creation: NodeCreationFailure: Instances failed to join the kubernetes cluster. Resource IDs: [i-05ed58f8101240dc8]
on EKS.tf line 17, in resource "aws_eks_node_group" "nodes":
17: resource "aws_eks_node_group" "nodes"
2020-06-01T00:03:50.576Z [DEBUG] plugin: plugin process exited: path=/home/ubuntu/.jenkins/workspace/shop_infraestucture_generator_pipline/shop-proyect-dev/.terraform/plugins/linux_amd64/terraform-provider-aws_v2.64.0_x4 pid=13475
2020-06-01T00:03:50.576Z [DEBUG] plugin: plugin exited

AWS 控制台中也显示了该错误:

Link

这是我用来创建项目的 Terraform 中的代码:

EKS.tf 用于创建集群和节点

# EKS control plane. The cluster name carries the environment suffix
# (e.g. UNIR-API-REST-CLUSTER-DEV when SUFFIX is "DEV").
resource "aws_eks_cluster" "CLUSTER" {
  name     = "UNIR-API-REST-CLUSTER-${var.SUFFIX}"
  role_arn = aws_iam_role.eks_cluster_role.arn

  # The control plane is placed in the two cluster subnets.
  vpc_config {
    subnet_ids = [
      aws_subnet.unir_subnet_cluster_1.id,
      aws_subnet.unir_subnet_cluster_2.id,
    ]
  }

  # The cluster assumes eks_cluster_role, so it has to wait for the policies
  # attached to that role. The worker-node policies (AmazonEKSWorkerNodePolicy,
  # AmazonEKS_CNI_Policy, AmazonEC2ContainerRegistryReadOnly) are attached to
  # eks_nodes_role and do not gate cluster creation.
  depends_on = [
    aws_iam_role_policy_attachment.AmazonEKSClusterPolicy,
    aws_iam_role_policy_attachment.AmazonEKSServicePolicy,
  ]
}


# Managed node group for aws_eks_cluster.CLUSTER, spread over the two
# cluster subnets.
resource "aws_eks_node_group" "nodes" {
  node_group_name = "node_sping_boot"
  cluster_name    = aws_eks_cluster.CLUSTER.name
  node_role_arn   = aws_iam_role.eks_nodes_role.arn

  subnet_ids = [
    aws_subnet.unir_subnet_cluster_1.id,
    aws_subnet.unir_subnet_cluster_2.id,
  ]

  # Starts with a single node and may grow up to five.
  scaling_config {
    min_size     = 1
    desired_size = 1
    max_size     = 5
  }

  # instance_types is not set, so the default (t3.medium) is used.

  # The node role's policy attachments must be created before the node group
  # and destroyed after it; otherwise EKS cannot properly delete the EC2
  # instances and elastic network interfaces.
  depends_on = [
    aws_iam_role_policy_attachment.AmazonEKSWorkerNodePolicy,
    aws_iam_role_policy_attachment.AmazonEKS_CNI_Policy,
    aws_iam_role_policy_attachment.AmazonEC2ContainerRegistryReadOnly,
  ]
}

# Exposes the endpoint attribute of the EKS cluster.
output "eks_cluster_endpoint" {
  value = aws_eks_cluster.CLUSTER.endpoint
}

# Exposes the certificate_authority attribute of the EKS cluster as-is.
output "eks_cluster_certificat_authority" {
  value = aws_eks_cluster.CLUSTER.certificate_authority
}

securityAndGroups.tf

# IAM role used as role_arn of the EKS cluster. Its trust policy allows the
# eks.amazonaws.com service principal to assume the role.
resource "aws_iam_role" "eks_cluster_role" {
  name = "eks-cluster-${var.SUFFIX}"

  assume_role_policy = <<POLICY
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "eks.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
POLICY
}


# IAM role used as node_role_arn of the EKS node group. Its trust policy
# allows the ec2.amazonaws.com service principal to assume the role.
resource "aws_iam_role" "eks_nodes_role" {
  name = "eks-node-${var.SUFFIX}"

  assume_role_policy = <<POLICY
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "ec2.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}
POLICY
}


# Attaches the AWS-managed AmazonEKSClusterPolicy to the cluster role.
resource "aws_iam_role_policy_attachment" "AmazonEKSClusterPolicy" {
  role       = aws_iam_role.eks_cluster_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
}

# Attaches the AWS-managed AmazonEKSServicePolicy to the cluster role.
resource "aws_iam_role_policy_attachment" "AmazonEKSServicePolicy" {
  role       = aws_iam_role.eks_cluster_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSServicePolicy"
}

# Attaches the AWS-managed AmazonEKSWorkerNodePolicy to the node role.
resource "aws_iam_role_policy_attachment" "AmazonEKSWorkerNodePolicy" {
  role       = aws_iam_role.eks_nodes_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
}

# Attaches the AWS-managed AmazonEKS_CNI_Policy to the node role.
resource "aws_iam_role_policy_attachment" "AmazonEKS_CNI_Policy" {
  role       = aws_iam_role.eks_nodes_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
}

# Attaches the AWS-managed AmazonEC2ContainerRegistryReadOnly policy to the
# node role.
resource "aws_iam_role_policy_attachment" "AmazonEC2ContainerRegistryReadOnly" {
  role       = aws_iam_role.eks_nodes_role.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
}

VPCAndRouting.tf 创建我的路由、VPC 和子网

# VPC for the shop environment, with both DNS support and DNS hostnames
# switched on. The address range comes from NET_CIDR_BLOCK.
resource "aws_vpc" "unir_shop_vpc_dev" {
  cidr_block           = var.NET_CIDR_BLOCK
  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name        = "UNIR-VPC-SHOP-${var.SUFFIX}"
    Environment = var.SUFFIX
  }
}
# Route table whose default IPv4 route (0.0.0.0/0) points at the internet
# gateway.
# NOTE(review): no aws_route_table_association is visible in this
# configuration. Unless one is declared elsewhere, the subnets stay on the
# VPC's main route table and this default route never applies to them, which
# would leave instances without outbound internet access. Confirm.
resource "aws_route_table" "route" {
  vpc_id = aws_vpc.unir_shop_vpc_dev.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.unir_gat_shop_dev.id
  }

  tags = {
    Name        = "UNIR-RoutePublic-${var.SUFFIX}"
    Environment = var.SUFFIX
  }
}

# Looks up the availability zones whose state is "available".
# NOTE(review): nothing in the visible configuration references this data
# source (the subnets take their zones from variables) — confirm it is used
# elsewhere.
data "aws_availability_zones" "available" {
  state = "available"
}
# Applications subnet in zone ZONE_SUB. Instances launched here are given a
# public IP automatically.
resource "aws_subnet" "unir_subnet_aplications" {
  vpc_id                  = aws_vpc.unir_shop_vpc_dev.id
  availability_zone       = var.ZONE_SUB
  cidr_block              = var.SUBNET_CIDR_APLICATIONS
  map_public_ip_on_launch = true

  tags = {
    Name        = "UNIR-SUBNET-APLICATIONS-${var.SUFFIX}"
    Environment = var.SUFFIX
  }

  # Explicit ordering: the subnet is created only after the internet gateway.
  depends_on = [aws_internet_gateway.unir_gat_shop_dev]
}

# First EKS cluster subnet. Instances launched here are given a public IP
# automatically.
# NOTE(review): subnet "cluster_1" takes its zone from ZONE_SUB_CLUSTER_2
# (and "cluster_2" from ZONE_SUB_CLUSTER_1). The numbering is crossed;
# confirm this is intentional.
resource "aws_subnet" "unir_subnet_cluster_1" {
  vpc_id                  = aws_vpc.unir_shop_vpc_dev.id
  availability_zone       = var.ZONE_SUB_CLUSTER_2
  cidr_block              = var.SUBNET_CIDR_CLUSTER_1
  map_public_ip_on_launch = true

  # The tag key embeds the same name that aws_eks_cluster.CLUSTER uses.
  tags = {
    "kubernetes.io/cluster/UNIR-API-REST-CLUSTER-${var.SUFFIX}" = "shared"
  }
}

# Second EKS cluster subnet. Instances launched here are given a public IP
# automatically.
# NOTE(review): subnet "cluster_2" takes its zone from ZONE_SUB_CLUSTER_1
# (and "cluster_1" from ZONE_SUB_CLUSTER_2). The numbering is crossed;
# confirm this is intentional.
resource "aws_subnet" "unir_subnet_cluster_2" {
  vpc_id                  = aws_vpc.unir_shop_vpc_dev.id
  availability_zone       = var.ZONE_SUB_CLUSTER_1
  cidr_block              = var.SUBNET_CIDR_CLUSTER_2
  map_public_ip_on_launch = true

  # The tag key embeds the same name that aws_eks_cluster.CLUSTER uses.
  tags = {
    "kubernetes.io/cluster/UNIR-API-REST-CLUSTER-${var.SUFFIX}" = "shared"
  }
}

# Internet gateway attached to the shop VPC; it is the target of the default
# route in aws_route_table.route.
resource "aws_internet_gateway" "unir_gat_shop_dev" {
  vpc_id = aws_vpc.unir_shop_vpc_dev.id

  tags = {
    Name        = "UNIR-publicGateway-${var.SUFFIX}"
    Environment = var.SUFFIX
  }
}

我的变量:

# Environment suffix appended to resource names, and the AWS region.
SUFFIX="DEV"
ZONE="eu-west-1"
# Credentials are intentionally left blank here.
TERRAFORM_USER_ID=
TERRAFORM_USER_PASS=
# Availability zones: ZONE_SUB for the applications subnet,
# ZONE_SUB_CLUSTER_* for the EKS cluster subnets.
ZONE_SUB="eu-west-1b"
ZONE_SUB_CLUSTER_1="eu-west-1a"
ZONE_SUB_CLUSTER_2="eu-west-1c"
# VPC range and its /27 subnets.
# NOTE(review): 172.15.0.0/24 lies outside the RFC 1918 private range
# 172.16.0.0/12 — confirm this is intended.
# NOTE(review): SUBNET_CIDR_CLUSTER_3..6 are not referenced by the visible
# configuration — confirm they are used elsewhere.
NET_CIDR_BLOCK="172.15.0.0/24"
SUBNET_CIDR_APLICATIONS="172.15.0.0/27"
SUBNET_CIDR_CLUSTER_1="172.15.0.32/27"
SUBNET_CIDR_CLUSTER_2="172.15.0.64/27"
SUBNET_CIDR_CLUSTER_3="172.15.0.128/27"
SUBNET_CIDR_CLUSTER_4="172.15.0.160/27"
SUBNET_CIDR_CLUSTER_5="172.15.0.192/27"
SUBNET_CIDR_CLUSTER_6="172.15.0.224/27"
# SSH key material is intentionally left blank here.
MONGO_SSH_KEY=
KIBANA_SSH_KEY=
CLUSTER_SSH_KEY=

是否需要更多日志?

根据 AWS 文档:

If you receive the error "Instances failed to join the kubernetes cluster" in the AWS Management Console, ensure that either the cluster's private endpoint access is enabled, or that you have correctly configured CIDR blocks for public endpoint access. For more information, see Amazon EKS cluster endpoint access control.

我注意到您把两个子网的可用区变量对调了:

resource "aws_subnet" "unir_subnet_cluster_1" {
  vpc_id = "${aws_vpc.unir_shop_vpc_dev.id}"
  cidr_block = "${var.SUBNET_CIDR_CLUSTER_1}"
  map_public_ip_on_launch = true
  availability_zone = "${var.ZONE_SUB_CLUSTER_2}"

您把 var.ZONE_SUB_CLUSTER_2 分配给了 unir_subnet_cluster_1,又把 var.ZONE_SUB_CLUSTER_1 分配给了 unir_subnet_cluster_2。这可能就是配置出错的原因。

如 AWS 文档中“NodeCreationFailure”一节所述,此错误有两个常见原因:

NodeCreationFailure: Your launched instances are unable to register with your Amazon EKS cluster. Common causes of this failure are insufficient node IAM role permissions or lack of outbound internet access for the nodes.
Your nodes must be able to access the internet using a public IP address to function properly.

在我的例子中,集群位于私有子网内,在添加到 NAT 网关的路由后,错误消失了。