Terraform AWS LB 运行状况检查失败
Posted
技术标签:
【中文标题】Terraform AWS LB 运行状况检查失败【英文标题】:Terraform AWS LB healthcheck failed 【发布时间】:2021-12-26 06:48:35 【问题描述】:我有一个 terraform 以下代码,它在 AWS ECS Fargate 上为我配置网关服务。不在专用网络中的负载均衡器下的服务按预期工作,但是添加了 LB 的网关未能通过健康检查,并且每 2-3 分钟取消配置和配置新任务。 Docker 文件在端口 3000 上公开服务。
这是一个失败的 terraform 计划
# Shared constants: image tag to deploy and the port the gateway
# container listens on (Dockerfile EXPOSEs 3000).
locals {
  gateway_version = "1.0.0"
  gateway_port    = 3000
}
## VPC
# Two-AZ VPC with public, private and ElastiCache subnet tiers.
# A single shared NAT gateway gives the private subnets outbound access.
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "3.11.0"

  name = "${var.env}-vpc"
  cidr = "20.0.0.0/16"

  enable_ipv6 = true

  azs                 = ["eu-central-1a", "eu-central-1b"]
  public_subnets      = ["20.0.1.0/24", "20.0.2.0/24"]
  private_subnets     = ["20.0.86.0/24", "20.0.172.0/24"]
  elasticache_subnets = ["20.0.31.0/24", "20.0.32.0/24"]

  enable_nat_gateway = true
  single_nat_gateway = true

  tags = {
    Terraform = "true"
  }
}
## Security Groups
# Default service security group: opens the named well-known service
# ports to the world. NOTE(review): no rule here covers the gateway's
# port 3000 — relevant to the health-check failure discussed below.
module "sg" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-default"
  description = "Default service security group"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules = [
    "all-icmp",
    "http-80-tcp",
    "https-443-tcp",
    "mysql-tcp",
    "rabbitmq-4369-tcp",
    "rabbitmq-5671-tcp",
    "rabbitmq-5672-tcp",
    "rabbitmq-15672-tcp",
    "rabbitmq-25672-tcp",
    "redis-tcp"
  ]
  egress_rules = ["all-all"]
}
# ALB security group: public HTTP (80) and ICMP in, everything out.
module "security_group" {
  source  = "terraform-aws-modules/security-group/aws"
  version = "~> 4.0"

  name        = "${var.env}-sg-lb"
  description = "Security group for ALB"
  vpc_id      = module.vpc.vpc_id

  ingress_cidr_blocks = ["0.0.0.0/0"]
  ingress_rules       = ["http-80-tcp", "all-icmp"]
  egress_rules        = ["all-all"]
}
# Security group attached to the ECS tasks: allows all traffic in,
# but only when it originates from the ALB's security group.
resource "aws_security_group" "service_security_group" {
  name = "${var.env}-lb-connection"

  ingress {
    from_port = 0
    to_port   = 0
    protocol  = "-1"
    # Only allowing traffic in from the load balancer security group
    security_groups = [module.security_group.security_group_id]
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  vpc_id = module.vpc.vpc_id
}
## ECS Cluster
resource "aws_ecs_cluster" "default" {
  name = "${var.env}-cluster"
}
## ECR
# Pre-existing repository holding the gateway image for this environment.
data "aws_ecr_repository" "gateway_ecr" {
  name = "gateway-${var.env}"
}
## ECS Task Definition
# Fargate task for the gateway container. The container listens on
# local.gateway_port (3000) and ships logs to CloudWatch via awslogs.
resource "aws_ecs_task_definition" "gateway_task" {
  family                = "${var.env}-gateway-task"
  container_definitions = <<DEFINITION
[
  {
    "name": "${var.env}-gateway-task",
    "image": "${data.aws_ecr_repository.gateway_ecr.repository_url}:${local.gateway_version}",
    "networkMode": "awsvpc",
    "essential": true,
    "logConfiguration": {
      "logDriver": "awslogs",
      "options": {
        "awslogs-group": "${aws_cloudwatch_log_group.gateway_logs.name}",
        "awslogs-stream-prefix": "ecs",
        "awslogs-region": "${var.aws-region}"
      }
    },
    "portMappings": [
      {
        "containerPort": ${local.gateway_port},
        "hostPort": ${local.gateway_port}
      }
    ],
    "environment": [
      { "name": "AWS_REGION", "value": "${var.aws-region}" },
      { "name": "PORT", "value": "${local.gateway_port}" },
      { "name": "STAGE", "value": "${var.env}" },
      { "name": "NODE_ENV", "value": "development" },
      { "name": "VERSION", "value": "${local.gateway_version}" }
    ],
    "memory": 512,
    "cpu": 256
  }
]
DEFINITION

  requires_compatibilities = ["FARGATE"]
  network_mode             = "awsvpc"
  memory                   = 512
  cpu                      = 256
  task_role_arn            = aws_iam_role.gateway_task_definition_role.arn
  execution_role_arn       = aws_iam_role.gateway_task_execution_role.arn
}
## ECS Service
# Runs the gateway task on Fargate behind the ALB target group.
# The container name registered with the load balancer must match the
# "name" field inside container_definitions (here: the task family).
resource "aws_ecs_service" "gateway_service" {
  name                 = "${var.env}-gateway-service"
  cluster              = aws_ecs_cluster.default.id
  task_definition      = aws_ecs_task_definition.gateway_task.arn
  launch_type          = "FARGATE"
  desired_count        = 1
  force_new_deployment = true

  network_configuration {
    subnets = concat(
      module.vpc.public_subnets,
      module.vpc.private_subnets,
    )
    security_groups = [
      module.sg.security_group_id,
      aws_security_group.service_security_group.id
    ]
    assign_public_ip = true
  }

  lifecycle {
    # Allow autoscaling (or manual scaling) to manage the task count
    # without Terraform reverting it on the next apply.
    ignore_changes = [desired_count]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.target_group.arn
    container_name   = aws_ecs_task_definition.gateway_task.family
    container_port   = local.gateway_port
  }
}
## Cloudwatch Log Group
resource "aws_cloudwatch_log_group" "gateway_logs" {
  name = "${var.env}-gateway-log-group"

  tags = {
    Name = "${var.env}-gateway-log-group"
  }
}
## IAM Roles
# Task role: assumed by the application containers at runtime.
resource "aws_iam_role" "gateway_task_definition_role" {
  name               = "${var.env}-gateway-task-definition-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-definition-role"
  }
}
# Execution role: used by the ECS agent to pull the image and write logs.
resource "aws_iam_role" "gateway_task_execution_role" {
  name               = "${var.env}-gateway-task-execution-role"
  assume_role_policy = data.aws_iam_policy_document.gateway_assume_role_policy.json

  tags = {
    Name = "${var.env}-gateway-task-execution-role"
  }
}
# Trust policy letting ECS tasks assume either of the roles above.
data "aws_iam_policy_document" "gateway_assume_role_policy" {
  statement {
    effect  = "Allow"
    actions = ["sts:AssumeRole"]

    principals {
      type        = "Service"
      identifiers = ["ecs-tasks.amazonaws.com"]
    }
  }
}
# Inline policy on the execution role granting ECR pull + CloudWatch write.
resource "aws_iam_role_policy" "gateway_exec" {
  name   = "${var.env}-gateway-execution-role-policy"
  role   = aws_iam_role.gateway_task_execution_role.id
  policy = data.aws_iam_policy_document.gateway_exec_policy.json
}
data "aws_iam_policy_document" "gateway_exec_policy" {
  statement {
    effect    = "Allow"
    resources = ["*"]
    actions = [
      "ecr:GetAuthorizationToken",
      "ecr:BatchCheckLayerAvailability",
      "ecr:GetDownloadUrlForLayer",
      "ecr:BatchGetImage",
      "logs:CreateLogStream",
      "logs:PutLogEvents",
    ]
  }
}
## ALB
# Internet-facing application load balancer in the public subnets.
resource "aws_lb" "alb" {
  name               = "${var.env}-lb"
  load_balancer_type = "application"
  subnets            = module.vpc.public_subnets
  security_groups    = [module.security_group.security_group_id]
}
# Target group for the gateway tasks (IP targets, as required by Fargate).
# NOTE(review): port = 80 here does not match the container's port 3000;
# this mismatch is exactly what the answers below identify as a likely
# cause of the failing health checks.
resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 80
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}
# HTTP listener forwarding all traffic to the gateway target group.
# Fixed the reference from the deprecated aws_alb alias to aws_lb,
# matching the resource declared above.
resource "aws_lb_listener" "listener" {
  load_balancer_arn = aws_lb.alb.arn
  port              = 80
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.target_group.arn
  }
}
这就是错误
Task failed ELB health checks in (target-group arn:aws:elasticloadbalancing:eu-central-1:129228585726:targetgroup/target-group/5853904c0d3ad322)
部署后,我看到 ECS 服务已启动并且它在那里工作,但是我没有看到任何检查其运行状况的请求
【问题讨论】:
【参考方案1】:您的目标组使用 `port = 80`，但您的 ECS 任务定义指定端口 3000。因此，这可能是您的 ALB 无法连接到您的容器的原因。
【讨论】:
【参考方案2】:负载均衡器尝试检查它是否能够访问指定目标端口上的应用程序。在你的情况下是 3000。
替换您的目标组资源以使用应用程序端口让 LB 健康检查通过。
# Corrected target group: the port now matches the application's
# listening port (3000) so the ALB health check can reach the container.
resource "aws_lb_target_group" "target_group" {
  name        = "target-group"
  port        = 3000
  protocol    = "HTTP"
  target_type = "ip"
  vpc_id      = module.vpc.vpc_id

  health_check {
    matcher  = "200,301,302"
    path     = "/health"
    interval = 120
    timeout  = 30
  }
}
【讨论】:
【参考方案3】:目标组不是问题 -> 问题是错误的 security_group 不允许访问端口 3000
【讨论】:
以上是关于Terraform AWS LB 运行状况检查失败的主要内容,如果未能解决你的问题,请参考以下文章
AWS 负载均衡器运行状况检查:运行状况检查失败,代码如下:[301]