googleapis · tianlei2 · Jun 17, 2026 · Jun 22, 2026 · vermas2012 · Jun 15, 2026
@@ -0,0 +1,65 @@
+# HBase Snapshot Import Helper Script Usage
+
+This document describes the environment variables used by the `run-snapshot-import.sh` script to automate HBase snapshot imports into Cloud Bigtable using Dataflow.
+
+## Environment Variables
+
+The script relies on the following environment variables. You should set them before executing the script.
+
+| Variable | Description | Example / Suggested Value |
+| :--- | :--- | :--- |
+| `PROJECT_ID` | The Google Cloud Project ID where the Bigtable instance and Dataflow jobs reside. | `your-project-id` |
+| `INSTANCE_ID` | The Bigtable Instance ID to import data into. | `your-instance-id` |
+| `BUCKET` | The GCS bucket name used for Dataflow staging, temp files, and default snapshot source path. | `your-gcs-bucket` |
+| `REGION` | The GCP region to run the Dataflow jobs in. | `us-central1` |
+| `TABLE_NAME` | The target Bigtable table name. | `your-table-name` |
+| `SNAPSHOT_NAME` | The name of the HBase snapshot to import. | `your-snapshot-name` |
+| `SNAPSHOT_SOURCE_DIR` | The GCS path where the HBase snapshot export is located. | `gs://your-gcs-bucket/snapshots` |
+| `SERVICE_ACCOUNT` | The service account email to run the Dataflow jobs. | `your-service-account@developer.gserviceaccount.com` |
+| `NUM_SHARDS` | The number of shards to split the import into for parallel processing. | `20` |
+| `MAX_INFLIGHT_RPCS` | Maximum number of inflight RPCs for Bigtable client. | `100` |
+| `BULK_MUTATION_CLOSE_TIMEOUT_MINUTES` | Timeout in minutes for closing bulk mutations. | `30` |
+| `NETWORK` | VPC Network name for Dataflow workers. | `your-network` |
+| `SUBNETWORK` | VPC Subnetwork name for Dataflow workers. | `regions/us-central1/subnetworks/your-subnetwork` |
+
+## Usage
+
+### Run a specific shard range
+```bash
+./run-snapshot-import.sh <start_shard> <end_shard>
+```
+Example: `./run-snapshot-import.sh 0 5`
+
+### Run all shards (Auto-parallel mode)
+```bash
+./run-snapshot-import.sh --all
+```
+This mode will first run the restore step, and then launch background processes for all shards in parallel groups of 4 by default.
+
+## Advanced Usage
+
+### Manual Parallel Execution
+
+To run shards in parallel groups (e.g., assuming 20 shards total), you can run multiple instances of this script.
+
+> [!IMPORTANT]
+> Because concurrent shards cannot delete or overwrite the restored snapshot directory simultaneously, **no shard** performs the restore step during a sharded run. You MUST run the restore step explicitly first!
+
+Example for manual parallel execution:
+```bash
+# 1. Run the blocking restore step first!
+./run-snapshot-import.sh --restore-only
+
+# 2. Once the restore is complete, launch shards in parallel:
+./run-snapshot-import.sh 0 3 &
+./run-snapshot-import.sh 4 7 &
+./run-snapshot-import.sh 8 11 &
+./run-snapshot-import.sh 12 15 &
+./run-snapshot-import.sh 16 19 &
+```
+
+## Troubleshooting
+
+### JDK Compatibility
+
+If you are running on a newer JDK (like Java 21 or 26) and hit ByteBuddy errors, you can add `-Dnet.bytebuddy.experimental=true` to the `java` command lines in the script.
@@ -39,6 +39,33 @@ limitations under the License.
         <type>pom</type>
         <scope>import</scope>
       </dependency>
+
+      <!-- Version alignment -->
+      <!-- Mark all annotations as provided. They don't affect the runtime of the pipeline so
+      there is no need to try to version align them -->
+      <dependency>
+        <groupId>org.checkerframework</groupId>
+        <artifactId>checker-qual</artifactId>
+        <version>3.31.0</version>
+        <scope>provided</scope>
+      </dependency>
+      <dependency>
+        <groupId>com.google.errorprone</groupId>
+        <artifactId>error_prone_annotations</artifactId>
+        <version>2.18.0</version>
+        <scope>provided</scope>
+      </dependency>
+      <dependency>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>animal-sniffer-annotations</artifactId>
+        <version>1.22</version>
+        <scope>provided</scope>
+      </dependency>
+      <dependency>
+        <groupId>jakarta.annotation</groupId>
+        <artifactId>jakarta.annotation-api</artifactId>
+        <scope>provided</scope>
+      </dependency>
     </dependencies>
   </dependencyManagement>
 
@@ -85,6 +112,7 @@ limitations under the License.
       <artifactId>bigtable-hbase-beam</artifactId>
       <version>2.18.2-SNAPSHOT</version> <!-- {x-version-update:bigtable-client-parent:current} -->
       <exclusions>
+        <!-- Exclude hbase-shaded-client to prevent reintroducing the dnsjava SPI / LiteralByteString conflict (NoClassDefFoundError) -->
         <exclusion>
           <groupId>org.apache.hbase</groupId>
           <artifactId>hbase-shaded-client</artifactId>
@@ -104,6 +132,7 @@ limitations under the License.
     </dependency>
 
 
+    <!-- Use hbase-shaded-mapreduce (instead of hbase-shaded-client) to defeat the dnsjava SPI / LiteralByteString conflict (NoClassDefFoundError on JDK 21+) -->
     <dependency>
       <groupId>org.apache.hbase</groupId>
       <artifactId>hbase-shaded-mapreduce</artifactId>
@@ -118,6 +147,10 @@ limitations under the License.
           <groupId>org.apache.hadoop</groupId>
           <artifactId>hadoop-hdfs-client</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.hbase</groupId>
+          <artifactId>hbase-shaded-client</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
@@ -134,7 +167,11 @@ limitations under the License.
       <artifactId>beam-runners-direct-java</artifactId>
       <scope>test</scope>
     </dependency>
-
+    <dependency>
+      <groupId>com.google.cloud</groupId>
+      <artifactId>google-cloud-bigtable-emulator-core</artifactId>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>org.apache.hbase</groupId>
       <artifactId>hbase-shaded-testing-util</artifactId>
@@ -148,11 +185,6 @@ limitations under the License.
       </exclusions>
     </dependency>
 
-    <dependency>
-      <groupId>com.google.cloud</groupId>
-      <artifactId>google-cloud-bigtable-emulator-core</artifactId>
-      <scope>test</scope>
-    </dependency>
     <dependency>
       <groupId>com.google.cloud.bigtable</groupId>
       <artifactId>bigtable-internal-test-helper</artifactId>
@@ -186,7 +218,7 @@ limitations under the License.
     </dependency>
     <dependency>
       <groupId>org.mockito</groupId>
-      <artifactId>mockito-core</artifactId>
+      <artifactId>mockito-inline</artifactId>
       <version>${mockito.version}</version>
       <scope>test</scope>
     </dependency>
@@ -221,6 +253,14 @@ limitations under the License.
 
 
     <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <fork>true</fork>
+        </configuration>
+      </plugin>
+
       <plugin>
         <artifactId>maven-jar-plugin</artifactId>
         <configuration>
@@ -287,6 +327,8 @@ limitations under the License.
 	      <filter>
 		  <artifact>*:*</artifact>
 		  <excludes>
+		      <!-- Exclude InetAddressResolverProvider to prevent the dnsjava SPI / LiteralByteString conflict (NoClassDefFoundError on JDK 21+) -->
+		      <exclude>META-INF/services/java.net.spi.InetAddressResolverProvider</exclude>
 		      <exclude>META-INF/*.SF</exclude>
 		      <exclude>META-INF/*.DSA</exclude>
 		      <exclude>META-INF/*.RSA</exclude>

@@ -0,0 +1,171 @@
+#!/bin/bash
+
+# ==============================================================================
+# HBase Snapshot Import Helper Script
+# ==============================================================================
+# This script runs a range of Dataflow snapshot import jobs sequentially or in parallel.
+# Must be executed from the 'bigtable-dataflow-parent/bigtable-beam-import' directory.
+#
+# For detailed usage and advanced options, see: SNAPSHOT_IMPORT_USAGE.md
+# ==============================================================================
+
+# ------------------------------------------------------------------------------
+# Environment Variables
+# ------------------------------------------------------------------------------
+# Most users will need to set these variables before running the script.
+# See SNAPSHOT_IMPORT_USAGE.md for details and expected values.
+
+# --- Required / Common Configurations ---
+# export PROJECT_ID="your-project-id"
+# export INSTANCE_ID="your-instance-id"
+# export BUCKET="your-gcs-bucket"
+# export REGION="us-central1"
+# 
+# export TABLE_NAME="your-table-name"
+# export SNAPSHOT_NAME="your-snapshot-name"
+# export SNAPSHOT_SOURCE_DIR="gs://your-gcs-bucket/snapshots"
+# export SERVICE_ACCOUNT="your-service-account"
+
+# --- Sharding & Tuning ---
+# export NUM_SHARDS="20"
+# export MAX_INFLIGHT_RPCS="100"
+# export BULK_MUTATION_CLOSE_TIMEOUT_MINUTES="30"
+
+# --- Network Configurations ---
+# export NETWORK="your-network"
+# export SUBNETWORK="your-subnetwork"
+
+# ------------------------------------------------------------------------------
+# Usage
+# ------------------------------------------------------------------------------
+# Usage: ./run-snapshot-import.sh <start_shard> <end_shard>
+#   Or:  ./run-snapshot-import.sh --all
+#        (Runs all shards in parallel groups of 4 by default)
+#
+# Examples:
+#   ./run-snapshot-import.sh 0 3
+#   ./run-snapshot-import.sh --all
+
+if [ "$#" -ne 2 ] && [ "$1" != "--all" ] && [ "$1" != "--restore-only" ]; then
+    echo "Usage: $0 <start_shard> <end_shard>"
+    echo "   Or: $0 --all"
+    echo "   Or: $0 --restore-only"
+    exit 1
+fi
+
+START_SHARD=$1
+END_SHARD=$2
+
+# Configurations
+JAR_PATH="target/bigtable-beam-import-2.18.2-SNAPSHOT-shaded.jar"
+RESTORE_DIR="${SNAPSHOT_SOURCE_DIR}/restore-${SNAPSHOT_NAME}"
+
+# --- RESTORE ONLY MODE ---
+if [ "$1" == "--restore-only" ]; then
+    echo "🚀 Performing snapshot restore (blocking)..."
+    java -jar ${JAR_PATH} importsnapshot \
+      --runner=DataflowRunner \
+      --project=${PROJECT_ID} \
+      --bigtableInstanceId=${INSTANCE_ID} \
+      --bigtableTableId=${TABLE_NAME} \
+      --hbaseSnapshotSourceDir=${SNAPSHOT_SOURCE_DIR} \
+      --snapshots=${SNAPSHOT_NAME}:${TABLE_NAME} \
+      --stagingLocation=gs://${BUCKET}/dataflow/staging \
+      --tempLocation=gs://${BUCKET}/dataflow/temp \
+      --region=${REGION} \
+      --performOnlyRestoreStep=true \
+      --restorePath=${RESTORE_DIR} \
+      --jobName="restore-job" \
+      --network=${NETWORK} \
+      --subnetwork=${SUBNETWORK}
+    echo "✅ Restore completed."
+    echo "⚠️ IMPORTANT: Please manually cleanup the restore path once validation succeeds:"
+    echo "   gsutil rm -r ${RESTORE_DIR}"
+    exit 0
+fi
+
+# --- AUTO-PARALLEL MODE ---
+if [ "$1" == "--all" ]; then
+    echo "🚀 Starting fully automated snapshot import..."
+
+    # Step 1: Perform ONLY the restore step
+    echo "Step 1/2: Performing snapshot restore (blocking)..."
+    java -jar ${JAR_PATH} importsnapshot \
+      --runner=DataflowRunner \
+      --project=${PROJECT_ID} \
+      --bigtableInstanceId=${INSTANCE_ID} \
+      --bigtableTableId=${TABLE_NAME} \
+      --hbaseSnapshotSourceDir=${SNAPSHOT_SOURCE_DIR} \
+      --snapshots=${SNAPSHOT_NAME}:${TABLE_NAME} \
+      --stagingLocation=gs://${BUCKET}/dataflow/staging \
+      --tempLocation=gs://${BUCKET}/dataflow/temp \
+      --region=${REGION} \
+      --performOnlyRestoreStep=true \
+      --restorePath=${RESTORE_DIR} \
+      --jobName="restore-job" \
+      --network=${NETWORK} \
+      --subnetwork=${SUBNETWORK}
+
+    echo "Restore completed. Proceeding to data import."
+
+    # Step 2: Launch parallel groups of 4
+    echo "Step 2/2: Launching parallel groups of 4 shards..."
+    SHARDS_PER_GROUP=4
+
+    for (( start=0; start<$NUM_SHARDS; start+=$SHARDS_PER_GROUP )); do
+        end=$((start + SHARDS_PER_GROUP - 1))
+        [ $end -ge $NUM_SHARDS ] && end=$((NUM_SHARDS - 1))
+
+        echo "Launching group: shards $start to $end in background"
+        # Call ourselves with the range!
+        $0 $start $end &
+    done
+
+    echo "All groups launched. Waiting for all background jobs to finish..."
+    wait
+    echo "🎉 All import jobs completed!"
+    echo "⚠️ IMPORTANT: Please manually cleanup the restore path once validation succeeds:"
+    echo "   gsutil rm -r ${RESTORE_DIR}"
+    exit 0
+fi
+# ----------------------------------------
+
+# Standard Range Mode
+for i in $(seq $START_SHARD $END_SHARD); do
+  echo "Submitting Dataflow job for shardIndex: $i"
+
+  # As per the sharding contract, ALL parallel sharded jobs MUST skip the restore step
+  # to prevent concurrent shards from deleting the restore path.
+  # The --all mode runs performOnlyRestoreStep=true automatically in Step 1.
+  SKIP_RESTORE="true"
+
+  JOB="job-${i}"
+  java -jar ${JAR_PATH} importsnapshot \
+    --runner=DataflowRunner \
+    --project=${PROJECT_ID} \
+    --bigtableInstanceId=${INSTANCE_ID} \
+    --bigtableTableId=${TABLE_NAME} \
+    --hbaseSnapshotSourceDir=${SNAPSHOT_SOURCE_DIR} \
+    --snapshots=${SNAPSHOT_NAME}:${TABLE_NAME} \
+    --stagingLocation=gs://${BUCKET}/dataflow/staging \
+    --tempLocation=gs://${BUCKET}/dataflow/temp \
+    --workerMachineType=n1-highmem-4 \
+    --diskSizeGb=500 \
+    --maxNumWorkers=10 \
+    --region=${REGION} \
+    --serviceAccount=${SERVICE_ACCOUNT} \
+    --usePublicIps=false \
+    --enableSnappy=true \
+    --skipRestoreStep=${SKIP_RESTORE} \
+    --deleteRestoredSnapshots=false \
+    --restorePath=${RESTORE_DIR} \
+    --numShards=${NUM_SHARDS} \
+    --shardIndex=$i \
+    --jobName="${JOB}" \
+    --network=${NETWORK} \
+    --subnetwork=${SUBNETWORK} \
+    --maxInflightRpcs=${MAX_INFLIGHT_RPCS} \
+    --bulkMutationCloseTimeoutMinutes=${BULK_MUTATION_CLOSE_TIMEOUT_MINUTES}
+
+  # Sequential within this script instance
+done
@@ -17,6 +17,7 @@
 
 import com.google.bigtable.repackaged.com.google.api.core.InternalApi;
 import com.google.bigtable.repackaged.com.google.api.core.InternalExtensionOnly;
+import com.google.cloud.bigtable.beam.hbasesnapshots.HBaseSnapshotRestoreTool;
 import com.google.cloud.bigtable.beam.hbasesnapshots.ImportJobFromHbaseSnapshot;
 import com.google.cloud.bigtable.beam.sequencefiles.CreateTableHelper;
 import com.google.cloud.bigtable.beam.sequencefiles.ExportJob;
@@ -51,6 +52,9 @@ public static void main(String[] args) throws Exception {
       case "importsnapshot":
         ImportJobFromHbaseSnapshot.main(subArgs);
         break;
+      case "restoresnapshot":
+        HBaseSnapshotRestoreTool.main(subArgs);
+        break;
       case "create-table":
         CreateTableHelper.main(subArgs);
         break;