Our current basic setup for having Glue crawl one S3 bucket and create/update a table in a Glue DB, which can then be queried in Athena, looks like this:
Crawler role and role policy:
The IAM role's assume_role_policy needs only the Glue service (glue.amazonaws.com) as principal.
The IAM role policy allows actions for Glue, S3, and CloudWatch Logs.
The Glue actions and resources (currently "glue:*" on "*") can probably be narrowed down to only the ones actually needed.
The S3 actions are limited to those needed by the crawler and are scoped to the crawled bucket.
# IAM role that the Glue crawler runs as.
# The trust policy below lets only the Glue service (glue.amazonaws.com)
# assume this role via sts:AssumeRole.
resource "aws_iam_role" "glue_crawler_role" {
  name = "analytics_glue_crawler_role"

  # Trust (assume-role) policy document. The heredoc body is kept flush-left
  # because <<EOF preserves leading whitespace in the resulting string.
  assume_role_policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Action": "sts:AssumeRole",
"Principal": {
"Service": "glue.amazonaws.com"
},
"Effect": "Allow",
"Sid": ""
}
]
}
EOF
}
# Inline permissions policy attached to the Glue crawler role.
#
# Grants three groups of permissions:
#   1. All Glue actions on all resources (broad; can likely be narrowed).
#   2. Bucket-level and object-level S3 actions on the analytics-product-data
#      bucket only.
#   3. Creating log groups/streams and writing log events under /aws-glue/.
#
# The document must be strict JSON: trailing commas after the last element of
# an array are not allowed and cause the policy to be rejected as malformed.
resource "aws_iam_role_policy" "glue_crawler_role_policy" {
  name = "analytics_glue_crawler_role_policy"
  role = "${aws_iam_role.glue_crawler_role.id}"

  policy = <<EOF
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Action": [
        "glue:*"
      ],
      "Resource": [
        "*"
      ]
    },
    {
      "Effect": "Allow",
      "Action": [
        "s3:GetBucketLocation",
        "s3:ListBucket",
        "s3:GetBucketAcl",
        "s3:GetObject",
        "s3:PutObject",
        "s3:DeleteObject"
      ],
      "Resource": [
        "arn:aws:s3:::analytics-product-data",
        "arn:aws:s3:::analytics-product-data/*"
      ]
    },
    {
      "Effect": "Allow",
      "Action": [
        "logs:CreateLogGroup",
        "logs:CreateLogStream",
        "logs:PutLogEvents"
      ],
      "Resource": [
        "arn:aws:logs:*:*:/aws-glue/*"
      ]
    }
  ]
}
EOF
}
S3 Bucket, Glue Database and Crawler:
# S3 bucket holding the data that the Glue crawler scans.
# The bucket name is also hard-coded in the S3 resource ARNs of the crawler
# role policy, so the two must be kept in sync.
resource "aws_s3_bucket" "product_bucket" {
  bucket = "analytics-product-data"
  acl    = "private"
}
# Glue Data Catalog database that the crawler creates/updates tables in.
# NOTE(review): the name contains hyphens, which may need quoting when the
# database is referenced in Athena queries - confirm this is acceptable.
resource "aws_glue_catalog_database" "analytics_db" {
  name = "inventory-analytics-db"
}
# Glue crawler that scans the "products" prefix of the product bucket and
# writes the resulting table definitions into the analytics Glue database.
resource "aws_glue_crawler" "product_crawler" {
  name          = "analytics-product-crawler"
  database_name = "${aws_glue_catalog_database.analytics_db.name}"
  role          = "${aws_iam_role.glue_crawler_role.arn}"

  # Run once a day at minute 0 of hour 0.
  # NOTE(review): Glue cron schedules are presumably evaluated in UTC - confirm.
  schedule = "cron(0 0 * * ? *)"

  # Crawler output settings (JSON string):
  #   - Partitions: AddOrUpdateBehavior = InheritFromTable
  #   - Tables:     AddOrUpdateBehavior = MergeNewColumns
  configuration = "{\"Version\": 1.0, \"CrawlerOutput\": { \"Partitions\": { \"AddOrUpdateBehavior\": \"InheritFromTable\" }, \"Tables\": {\"AddOrUpdateBehavior\": \"MergeNewColumns\" } } }"

  # NOTE(review): AWS may restrict which delete/update behaviors are accepted
  # together with MergeNewColumns - confirm this combination is valid.
  schema_change_policy {
    delete_behavior = "DELETE_FROM_DATABASE"
  }

  # Only objects under the "products" prefix of the bucket are crawled.
  s3_target {
    path = "s3://${aws_s3_bucket.product_bucket.bucket}/products"
  }
}