From 657285f84b2437a05b503a3762d4125a0e6dff1a Mon Sep 17 00:00:00 2001
From: Scott Roy <scroy@meta.com>
Date: Fri, 9 Jan 2026 16:40:13 -0800
Subject: [PATCH 1/2] Use content hash instead of UUID for model identifier

Summary:
Previously, the CoreML backend generated model identifiers using `uuid.uuid4()`, which created a random identifier every time. This meant that even if the exact same model was exported twice, it would get different identifiers, causing cache misses on the device.

This change replaces the UUID with a SHA256 hash of the saved mlpackage contents (including weights). Now identical models produce identical identifiers, enabling proper cache hits.

The hash is computed by:
1. Saving the model to a temp directory first
2. Hashing all files in the mlpackage directory (sorted for determinism)
3. Using the first 32 characters of the hex digest, prefixed with `executorch_`, as the identifier

This change also makes it easier to bisect when code changes have altered a generated PTE file - if the identifier changes, you know the model content changed.

Note: The identifier format changes from UUID format (with hyphens, e.g., `executorch_a1b2c3d4-e5f6-7890-abcd-ef1234567890`) to a hash format (32 hex characters, no hyphens, e.g., `executorch_a1b2c3d4e5f67890abcdef1234567890`). This has no BC impact since the runtime treats the identifier as an opaque string.

Differential Revision: D90424166
---
 .../coreml/compiler/coreml_preprocess.py      | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py
index 32cd0df67a2..4f474067b40 100644
--- a/backends/apple/coreml/compiler/coreml_preprocess.py
+++ b/backends/apple/coreml/compiler/coreml_preprocess.py
@@ -2,12 +2,12 @@
 
 # CoreML backend for delegating a EdgeProgram to CoreML.
 
+import hashlib
 import json
 import logging
 
 import shutil
 import tempfile
-import uuid
 from dataclasses import asdict, dataclass
 from enum import Enum
 
@@ -36,6 +36,16 @@
 logger.setLevel(get_coreml_log_level(default_level=logging.WARNING))
 
 
+def _hash_directory(path: Path) -> str:
+    """Hash all files in a directory deterministically."""
+    hasher = hashlib.sha256()
+    for file_path in sorted(path.rglob("*")):
+        if file_path.is_file():
+            hasher.update(str(file_path.relative_to(path)).encode())
+            hasher.update(file_path.read_bytes())
+    return hasher.hexdigest()[:32]
+
+
 class COMPILE_SPEC_KEYS(Enum):
     COMPUTE_UNITS = "compute_units"
     MODEL_TYPE = "model_type"
@@ -448,10 +458,18 @@ def save_model_debug_info(model_debug_info: ModelDebugInfo, model_dir_path: Path
     def preprocess_model(
         mlmodel: ct.models.MLModel, model_type: MODEL_TYPE
     ) -> PreprocessResult:
-        identifier = "executorch_" + str(uuid.uuid4())
-        dir_path: Path = Path(tempfile.gettempdir()) / identifier
+        dir_path: Path = Path(tempfile.mkdtemp())
         model_dir_path: Path = dir_path / "lowered_module"
         model_spec: ct.proto.Model_pb2 = mlmodel.get_spec()
+
+        # Save model first to compute content hash for deterministic identifier.
+        model_path = model_dir_path / MODEL_PATHS.MODEL.value
+        mlmodel.save(str(model_path))
+
+        # Generate deterministic identifier from model contents.
+        content_hash = _hash_directory(model_path)
+        identifier = "executorch_" + content_hash
+
         logger.warning(
             f"The model with identifier {identifier} was exported with CoreML specification version {model_spec.specificationVersion}, and it will not run on all version of iOS/macOS."
             " See https://apple.github.io/coremltools/mlmodel/Format/Model.html#model for information on what OS versions are compatible with this specifcation version."
@@ -462,10 +480,6 @@ def preprocess_model(
             model_spec=model_spec,
             identifier=identifier,
         )
-
-        # Save model.
-        model_path = model_dir_path / MODEL_PATHS.MODEL.value
-        mlmodel.save(str(model_path))
         # Extract delegate mapping file.
         model_debug_info: Optional[ModelDebugInfo] = CoreMLBackend.get_model_debug_info(
             model_path

From 8b06ef21156fd1d3a62de59204d2404a914721d9 Mon Sep 17 00:00:00 2001
From: Scott Roy <161522778+metascroy@users.noreply.github.com>
Date: Wed, 14 Jan 2026 17:57:13 -0800
Subject: [PATCH 2/2] Make hasher deterministic

---
 .../coreml/compiler/coreml_preprocess.py      | 55 ++++++++++++++++---
 1 file changed, 47 insertions(+), 8 deletions(-)

diff --git a/backends/apple/coreml/compiler/coreml_preprocess.py b/backends/apple/coreml/compiler/coreml_preprocess.py
index 4f474067b40..2e942377ecb 100644
--- a/backends/apple/coreml/compiler/coreml_preprocess.py
+++ b/backends/apple/coreml/compiler/coreml_preprocess.py
@@ -36,13 +36,52 @@
 logger.setLevel(get_coreml_log_level(default_level=logging.WARNING))
 
 
-def _hash_directory(path: Path) -> str:
-    """Hash all files in a directory deterministically."""
+from google.protobuf import text_format
+
+
+def _hash_model(model_spec: ct.proto.Model_pb2, model_path: Path) -> str:  # pyre-ignore
+    """Hash model deterministically, including both spec and weights.
+
+    This function addresses three sources of non-determinism in CoreML models:
+
+    1. Timestamps in metadata: CoreML's coremltools embeds a conversion timestamp
+       in the model's userDefined metadata (com.github.apple.coremltools.conversion_date).
+       We clear this metadata before hashing.
+
+    2. Random UUIDs in Manifest.json: The mlpackage's Manifest.json contains randomly
+       generated UUIDs that change on every save, even for identical model content.
+       We exclude this file from hashing.
+
+    3. Non-deterministic protobuf serialization: Protobuf's SerializeToString() does
+       not guarantee consistent field ordering across processes. We use text_format
+       for deterministic serialization instead.
+    """
     hasher = hashlib.sha256()
-    for file_path in sorted(path.rglob("*")):
-        if file_path.is_file():
-            hasher.update(str(file_path.relative_to(path)).encode())
+
+    # Hash model spec with non-deterministic metadata cleared
+    # Use text_format for deterministic serialization (protobuf binary
+    # serialization is not deterministic across processes)
+    spec_copy = ct.proto.Model_pb2.Model()  # pyre-ignore
+    spec_copy.CopyFrom(model_spec)
+    # Only clear the specific non-deterministic key, not all userDefined metadata
+    if (
+        "com.github.apple.coremltools.conversion_date"
+        in spec_copy.description.metadata.userDefined
+    ):
+        del spec_copy.description.metadata.userDefined[
+            "com.github.apple.coremltools.conversion_date"
+        ]
+    hasher.update(text_format.MessageToString(spec_copy).encode())
+
+    # Hash weight files (exclude Manifest.json which contains random UUIDs)
+    for file_path in sorted(model_path.rglob("*")):
+        if file_path.is_file() and file_path.name != "Manifest.json":
+            # Skip the model.mlmodel since we already hashed the spec above
+            if file_path.name == "model.mlmodel":
+                continue
+            hasher.update(str(file_path.relative_to(model_path)).encode())
             hasher.update(file_path.read_bytes())
+
     return hasher.hexdigest()[:32]
 
 
@@ -462,12 +501,12 @@ def preprocess_model(
         model_dir_path: Path = dir_path / "lowered_module"
         model_spec: ct.proto.Model_pb2 = mlmodel.get_spec()
 
-        # Save model first to compute content hash for deterministic identifier.
+        # Save model first so we can hash both spec and weights.
         model_path = model_dir_path / MODEL_PATHS.MODEL.value
         mlmodel.save(str(model_path))
 
-        # Generate deterministic identifier from model contents.
-        content_hash = _hash_directory(model_path)
+        # Generate deterministic identifier from model content (spec + weights).
+        content_hash = _hash_model(model_spec, model_path)
         identifier = "executorch_" + content_hash
 
         logger.warning(