Terraform AWS LB 运行状况检查失败

Posted

技术标签:

【中文标题】Terraform AWS LB 运行状况检查失败【英文标题】:Terraform AWS LB healthcheck failed 【发布时间】:2021-12-26 06:48:35 【问题描述】:

我有以下 Terraform 代码，它在 AWS ECS Fargate 上为我配置网关服务。不在专用网络中、也不在负载均衡器后面的服务按预期工作，但是加了 LB 的网关未能通过健康检查，并且每 2-3 分钟就会注销旧任务并重新配置新任务。Dockerfile 在端口 3000 上公开服务。

这是一个失败的 terraform 计划

# Shared constants for the gateway service: the image tag to deploy and
# the container port the app listens on (the Dockerfile exposes 3000).
locals {
  gateway_version = "1.0.0"
  gateway_port    = 3000
}

## VPC

# VPC spanning two AZs with public, private and ElastiCache subnet tiers.
# A single shared NAT gateway gives the private subnets outbound access.
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "3.11.0"

  name        = "${var.env}-vpc"
  cidr        = "20.0.0.0/16"
  enable_ipv6 = true

  azs                 = ["eu-central-1a", "eu-central-1b"]
  public_subnets      = ["20.0.1.0/24", "20.0.2.0/24"]
  private_subnets     = ["20.0.86.0/24", "20.0.172.0/24"]
  elasticache_subnets = ["20.0.31.0/24", "20.0.32.0/24"]

  enable_nat_gateway = true
  single_nat_gateway = true

  tags = {
    Terraform = "true"
  }
}


## Security Groups

# Default service security group: opens a fixed set of well-known service
# ports (HTTP/S, MySQL, RabbitMQ, Redis, ICMP) to the world.
# NOTE(review): none of these named rules opens port 3000, where the gateway
# container listens — traffic to 3000 relies on the other attached group.
module "sg" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-default"
  description = "Default service security group"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules = [
    "all-icmp",
    "http-80-tcp",
    "https-443-tcp",
    "mysql-tcp",
    "rabbitmq-4369-tcp",
    "rabbitmq-5671-tcp",
    "rabbitmq-5672-tcp",
    "rabbitmq-15672-tcp",
    "rabbitmq-25672-tcp",
    "redis-tcp"
  ]
  egress_rules = ["all-all"]
}

# Security group attached to the ALB itself: allows inbound HTTP (80)
# and ICMP from anywhere, all egress.
module "security_group" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-lb"
  description = "Security group for ALB"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules       = ["http-80-tcp", "all-icmp"]
  egress_rules        = ["all-all"]
}


# Security group for the ECS tasks: permits all traffic (protocol -1,
# all ports) but only when it originates from the ALB's security group.
resource "aws_security_group" "service_security_group" {
  name = "${var.env}-lb-connection"

  ingress {
    from_port = 0
    to_port   = 0
    protocol  = "-1"
    # Only allowing traffic in from the load balancer security group
    security_groups = [module.security_group.security_group_id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  vpc_id = module.vpc.vpc_id
}

## ECS Cluster

# ECS cluster that hosts the Fargate gateway service.
resource "aws_ecs_cluster" "default" {
  name = "${var.env}-cluster"
}

## ECR

# Looks up the existing per-environment ECR repository holding the
# gateway image; its URL is consumed by the task definition below.
data "aws_ecr_repository" "gateway_ecr" {
  name = "gateway-${var.env}"
}

## ECS Task Definition

# Fargate task definition for the gateway: single container listening on
# local.gateway_port (3000), logging to the CloudWatch group defined below.
# NOTE(review): "networkMode" inside the container definition is a task-level
# setting, not a container-level one — it is already set via network_mode
# below; confirm the ECS API tolerates it here or remove it.
resource "aws_ecs_task_definition" "gateway_task" {
  family                   = "${var.env}-gateway-task"
  container_definitions    = <<DEFINITION
  [
    {
      "name": "${var.env}-gateway-task",
      "image": "${data.aws_ecr_repository.gateway_ecr.repository_url}:${local.gateway_version}",
      "networkMode": "awsvpc",
      "essential": true,
      "logConfiguration": {
        "logDriver": "awslogs",
        "options": {
          "awslogs-group": "${aws_cloudwatch_log_group.gateway_logs.name}",
          "awslogs-stream-prefix": "ecs",
          "awslogs-region": "${var.aws-region}"
        }
      },
      "portMappings": [
        {
          "containerPort": ${local.gateway_port},
          "hostPort": ${local.gateway_port}
        }
      ],
      "environment": [
        {
          "name": "AWS_REGION",
          "value": "${var.aws-region}"
        },
        {
          "name": "PORT",
          "value": "${local.gateway_port}"
        },
        {
          "name": "STAGE",
          "value": "${var.env}"
        },
        {
          "name": "NODE_ENV",
          "value": "development"
        },
        {
          "name": "VERSION",
          "value": "${local.gateway_version}"
        }
      ],
      "memory": 512,
      "cpu": 256
    }
  ]
  DEFINITION
  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  memory                   = 512
  cpu                      = 256
  task_role_arn            = aws_iam_role.gateway_task_definition_role.arn
  execution_role_arn       = aws_iam_role.gateway_task_execution_role.arn
}

## ECS Service

# ECS service running one Fargate task behind the ALB target group.
# Targets register on local.gateway_port (3000); desired_count changes made
# outside Terraform (e.g. autoscaling) are ignored via the lifecycle block.
resource "aws_ecs_service" "gateway_service" {
  name            = "${var.env}-gateway-service"
  cluster         = aws_ecs_cluster.default.id
  task_definition = aws_ecs_task_definition.gateway_task.arn
  launch_type     = "FARGATE"
  desired_count   = 1

  force_new_deployment = true

  network_configuration {
    subnets = concat(
      module.vpc.public_subnets,
      module.vpc.private_subnets,
    )
    security_groups = [
      module.sg.security_group_id,
      aws_security_group.service_security_group.id
    ]
    assign_public_ip = true
  }

  lifecycle {
    ignore_changes = [desired_count]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.target_group.arn
    # Must match the container "name" in the task definition JSON.
    container_name   = aws_ecs_task_definition.gateway_task.family
    container_port   = local.gateway_port
  }
}

## Cloudwatch Log Group

# CloudWatch log group receiving the container's awslogs output.
resource "aws_cloudwatch_log_group" "gateway_logs" {
  name = "${var.env}-gateway-log-group"

  tags = {
    Name = "${var.env}-gateway-log-group"
  }
}

## IAM Roles

# Task role: assumed by the running container (via ecs-tasks.amazonaws.com)
# for any AWS API calls the application itself makes.
resource "aws_iam_role" "gateway_task_definition_role" {
  name               = "${var.env}-gateway-task-definition-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-definition-role"
  }
}

# Execution role: used by the ECS agent to pull the ECR image and write
# logs (permissions attached via aws_iam_role_policy.gateway_exec).
resource "aws_iam_role" "gateway_task_execution_role" {
  name               = "${var.env}-gateway-task-execution-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-execution-role"
  }
}

# Trust policy shared by both roles: lets ECS tasks assume them.
data "aws_iam_policy_document" "gateway_assume_role_policy" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}

# Attaches the ECR-pull / CloudWatch-logs policy to the execution role.
resource "aws_iam_role_policy" "gateway_exec" {
  name   = "${var.env}-gateway-execution-role-policy"
  role   = aws_iam_role.gateway_task_execution_role.id
  policy = data.aws_iam_policy_document.gateway_exec_policy.json
}

# Permissions the ECS agent needs: pull images from ECR and ship logs.
data "aws_iam_policy_document" "gateway_exec_policy" {
  statement {
    effect    = "Allow"
    resources = ["*"]

    actions = [
      "ecr:GetAuthorizationToken",
      "ecr:BatchCheckLayerAvailability",
      "ecr:GetDownloadUrlForLayer",
      "ecr:BatchGetImage",
      "logs:CreateLogStream",
      "logs:PutLogEvents",
    ]
  }
}

## ALB

# Internet-facing application load balancer in the public subnets.
resource "aws_lb" "alb" {
  name               = "${var.env}-lb"
  load_balancer_type = "application"
  subnets            = module.vpc.public_subnets
  security_groups    = [module.security_group.security_group_id]
}

# Target group the ECS service registers its tasks into.
# NOTE(review): port 80 here does not match the container port (3000) from
# the task definition — this mismatch is the subject of the Q&A below;
# the accepted advice is to set port = local.gateway_port.
resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 80
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}

# HTTP listener on port 80 forwarding everything to the target group.
# Fixed: the load balancer is declared as resource "aws_lb" "alb", so it
# must be referenced as aws_lb.alb, not aws_alb.alb (undeclared resource).
resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.alb.arn
  port              = 80
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.target_group.arn
  }
}

这就是错误

Task failed ELB health checks in (target-group arn:aws:elasticloadbalancing:eu-central-1:129228585726:targetgroup/target-group/5853904c0d3ad322)

部署后,我看到 ECS 服务已启动并且它在那里工作,但是我没有看到任何检查其运行状况的请求

【问题讨论】:

【参考方案1】:

您的目标组使用port = 80,但您的ECS 任务定义指定端口3000。因此,这可能是您的 ALB 无法连接到您的容器的原因。

【讨论】:

【参考方案2】:

负载均衡器尝试检查它是否能够访问指定目标端口上的应用程序。在你的情况下是 3000。

替换您的目标组资源以使用应用程序端口让 LB 健康检查通过。

# Corrected target group: the port now matches the application/container
# port (3000) so the ALB health check reaches the gateway service.
resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 3000
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}
【讨论】:

【参考方案3】:

目标组不是问题 —— 真正的问题在于 security_group 配置有误，没有放行对端口 3000 的访问

【讨论】:

以上是关于Terraform AWS LB 运行状况检查失败的主要内容,如果未能解决你的问题,请参考以下文章

AWS 负载均衡器运行状况检查:运行状况检查失败,代码如下:[301]

Terraform 配置 LB 属性失败

AWS Elastic Beanstalk 运行状况检查偶尔失败

ECS 服务的 AWS 网络负载均衡器运行状况检查失败

Terraform如何获取aws_lb的IP地址

AWS 容器运行状况检查