Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 72 additions & 4 deletions terraform/modules/cloud-build-docker/build_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import os
import subprocess
import sys
import time


def run_command(cmd, **kwargs):
Expand All @@ -30,6 +31,8 @@ def run_command(cmd, **kwargs):
def get_image_digest(image_uri, tag, project_id):
"""Query the digest of an existing image."""
try:
# Use tags={tag} for exact match filtering (not tags:{tag} which is substring match)
# See: gcloud topic filters
result = run_command(
[
"gcloud",
Expand All @@ -38,7 +41,7 @@ def get_image_digest(image_uri, tag, project_id):
"list-tags",
image_uri,
"--filter",
f"tags:{tag}",
f"tags={tag}",
"--limit",
"1",
"--format=get(digest)",
Expand All @@ -57,6 +60,8 @@ def get_image_digest(image_uri, tag, project_id):
def check_cache_tag_exists(image_uri, cache_tag, project_id):
"""Check if a cache tag exists for the given image."""
try:
# Use tags={cache_tag} for exact match filtering (not tags:{cache_tag}, which is a substring match)
# See: gcloud topic filters
result = run_command(
[
"gcloud",
Expand All @@ -65,7 +70,7 @@ def check_cache_tag_exists(image_uri, cache_tag, project_id):
"list-tags",
image_uri,
"--filter",
f"tags:{cache_tag}",
f"tags={cache_tag}",
"--limit",
"1",
"--format=get(digest)",
Expand Down Expand Up @@ -103,6 +108,7 @@ def build_image(
project_id,
image_tag_suffix,
base_digest="latest",
region="us-central1",
):
"""Build a Docker image via Cloud Build and return its digest."""

Expand Down Expand Up @@ -172,20 +178,80 @@ def build_image(
cloudbuild_config = os.path.join(script_dir, "cloudbuild.yml")

# TODO(jwbron): Consider adding automatic GCS bucket creation with import support for existing buckets in terraform
run_command(
# Submit build asynchronously to avoid rate limit issues when running many parallel builds
print(f"Submitting build for {image_name} in region {region}...", file=sys.stderr)
result = subprocess.run(
[
"gcloud",
"builds",
"submit",
context_path,
f"--config={cloudbuild_config}",
f"--project={project_id}",
f"--region={region}",
f"--gcs-source-staging-dir=gs://{project_id}-cloudbuild-ci/staging",
f"--gcs-log-dir=gs://{project_id}-cloudbuild-ci/logs",
f"--substitutions={subs_str}",
]
"--async", # Don't wait/poll - avoids rate limit issues
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you see my previous comment about just passing the timeout flag rather than trying to implement the polling yourself? I'm requesting changes for that. The rest is looking good!

"--format=get(id)", # Output just the build ID for easier parsing
],
check=True,
capture_output=True,
text=True,
)

# Extract build ID from async output
build_id = result.stdout.strip()
if not build_id:
raise RuntimeError(f"Failed to extract build ID from output: {result.stdout}")

print(f"Build submitted: {build_id}", file=sys.stderr)
print(f"Waiting for build to complete...", file=sys.stderr)

# Poll build status using 'gcloud builds describe'
# Use exponential backoff to reduce API calls and avoid rate limits
poll_interval = 10 # Start with 10 seconds
max_interval = 60 # Cap at 60 seconds
elapsed = 0

while True:
time.sleep(poll_interval)
elapsed += poll_interval

try:
result = run_command(
[
"gcloud",
"builds",
"describe",
build_id,
f"--project={project_id}",
f"--region={region}",
"--format=get(status)",
]
)
status = result.stdout.strip()

if status == "SUCCESS":
print(f"Build completed successfully after {elapsed}s", file=sys.stderr)
break
elif status in ["FAILURE", "TIMEOUT", "CANCELLED", "INTERNAL_ERROR"]:
print(f"\nBuild {status} after {elapsed}s", file=sys.stderr)
print(f"\nTo view build logs:", file=sys.stderr)
print(f" gcloud builds log {build_id} --project={project_id}", file=sys.stderr)
print(f"Or visit: https://console.cloud.google.com/cloud-build/builds/{build_id}?project={project_id}", file=sys.stderr)
raise RuntimeError(f"Build {build_id} {status}")
elif status in ["QUEUED", "WORKING"]:
print(f"Build status: {status} (elapsed: {elapsed}s)", file=sys.stderr)
# Increase poll interval (exponential backoff, capped)
poll_interval = min(poll_interval * 1.5, max_interval)
else:
print(f"Unknown build status: {status}", file=sys.stderr)

except subprocess.CalledProcessError as e:
print(f"\nFailed to check build status: {e}", file=sys.stderr)
raise RuntimeError(f"Failed to check status for build {build_id}")

# Query the digest of the newly built image
digest = get_image_digest(image_uri, image_tag_suffix, project_id)
if not digest:
Expand Down Expand Up @@ -216,6 +282,7 @@ def main():
project_id = input_data["project_id"]
image_tag_suffix = input_data["image_tag_suffix"]
base_digest = input_data.get("base_digest", "latest")
region = input_data.get("region", "us-central1")

# Validate that image_tag_suffix is not empty
if not image_tag_suffix or image_tag_suffix.strip() == "":
Expand All @@ -229,6 +296,7 @@ def main():
project_id=project_id,
image_tag_suffix=image_tag_suffix,
base_digest=base_digest,
region=region,
)

# Return JSON output for Terraform
Expand Down
46 changes: 29 additions & 17 deletions terraform/modules/cloud-build-docker/cloudbuild.yml
Original file line number Diff line number Diff line change
@@ -1,36 +1,48 @@
# Cloud Build configuration with branch-based caching.
# Enables caching by pulling the previous image with a given "cache tag", if it exists, and reusing its layers.
# Cloud Build configuration with BuildKit inline cache for multi-stage builds.
# BuildKit is required to properly cache intermediate stages in multi-stage builds.
steps:
- name: 'gcr.io/cloud-builders/docker'
id: Pull previous image
id: Pull cache images
entrypoint: bash
args:
- -c
- |
echo "Attempting to pull cache image: $_IMAGE_NAME:$_CACHE_TAG"
docker pull "$_IMAGE_NAME:$_CACHE_TAG" || echo "Cache image not found, will build without cache"
echo "Attempting to pull cache image..."
# With BuildKit inline cache, we only need to pull the final image
# which contains metadata about all intermediate stages
docker pull "$_IMAGE_NAME:$_CACHE_TAG" || echo "Cache image not found, building from scratch"
- name: 'gcr.io/cloud-builders/docker'
id: Build image
id: Build image with BuildKit
entrypoint: bash
env:
- 'DOCKER_BUILDKIT=1'
args:
- -c
- |
if docker image inspect "$_IMAGE_NAME:$_CACHE_TAG" > /dev/null 2>&1; then
echo "Using cache from $_IMAGE_NAME:$_CACHE_TAG"
docker build --tag="$_IMAGE_TAG" --cache-from="$_IMAGE_NAME:$_CACHE_TAG" --build-arg BASE_IMAGE="$_BASE_DIGEST" .
else
echo "No cache found for $_IMAGE_NAME:$_CACHE_TAG, building without --cache-from"
docker build --tag="$_IMAGE_TAG" --build-arg BASE_IMAGE="$_BASE_DIGEST" .
fi
echo "Building with BuildKit and inline cache..."
# Build with BuildKit inline cache
# --cache-from pulls cache metadata from previous build
# --build-arg BUILDKIT_INLINE_CACHE=1 embeds cache metadata in the image
docker build \
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work if the --cache-from argument isn't found? Or do you still need the if like you had in the old code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current code should work fine without the conditional. Current approach (works correctly):

  • Line 13: Try to pull the cache image with || echo - if the pull fails, the step just prints a message and continues (it doesn't exit)
  • Line 30: Always pass --cache-from to the build command

Docker/BuildKit behavior:

  • --cache-from is graceful - if the referenced image doesn't exist locally, Docker just ignores it and builds without cache
  • No error is thrown, build continues normally

--tag="$_IMAGE_TAG" \
--cache-from="$_IMAGE_NAME:$_CACHE_TAG" \
--build-arg BUILDKIT_INLINE_CACHE=1 \
--build-arg BASE_IMAGE="$_BASE_DIGEST" \
.
- name: 'gcr.io/cloud-builders/docker'
id: Tag cache
id: Tag cache image
entrypoint: bash
args: ['-c', 'docker tag "$_IMAGE_TAG" "$_IMAGE_NAME:$_CACHE_TAG"']
waitFor: ['Build image']
waitFor: ['Build image with BuildKit']

- name: 'gcr.io/cloud-builders/docker'
id: Push cache
id: Push cache image
entrypoint: bash
args: ['-c', 'docker push "$_IMAGE_NAME:$_CACHE_TAG"']
waitFor: ['Tag cache']
waitFor: ['Tag cache image']

images:
- '$_IMAGE_TAG'
1 change: 1 addition & 0 deletions terraform/modules/cloud-build-docker/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ data "external" "image_build" {
project_id = var.project_id
image_tag_suffix = var.image_tag_suffix
base_digest = var.base_digest
region = var.region
}

# Trigger rebuild when any of these change
Expand Down
6 changes: 6 additions & 0 deletions terraform/modules/cloud-build-docker/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,9 @@ variable "base_digest" {
type = string
default = "latest"
}

# Region for Cloud Build jobs. The module forwards this value as the `region`
# input of the `image_build` external data source; defaults to us-central1.
variable "region" {
description = "The GCP region where Cloud Build jobs will run"
type = string
default = "us-central1"
}