From d439c3909a012a39065054c41bfa85e9dba83926 Mon Sep 17 00:00:00 2001
From: Longfang Zhao <longfangzhao@meta.com>
Date: Tue, 9 Jun 2026 16:42:26 -0700
Subject: [PATCH 1/2] Cross-load packed weight cache reuse for XNNPACK

Summary:
Add cross-load reuse + multi-PTE safety to the file-backed packed weight cache (D106673663). The first PTE in a session calls `save_packed_index()` to append a trailer; subsequent process launches mmap the file and pre-populate `name_to_packed_data_metadata_` so `look_up()` hits for every saved weight and `xnn_create_runtime` skips packing entirely.

## Cache file format

```
[packed data regions]                          (written by reserve_space)
[index entries]                                (written by save_packed_index)
  each: name_len(4B) | name(N) | file_offset(8B) | data_size(8B)
[footer: 20 bytes]
  index_start(8B) | entry_count(4B) | magic "XPWC"(4B) | version(4B)
```

## Lifecycle invariants

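- `cache_loaded_` gate: once `load_packed_cache()` has succeeded, subsequent PTE inits for the same path reopen the write fd without re-reading the trailer. The gate is cleared only by a full unload (when the last cache entry is deleted), after which the next init loads the trailer again.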
- `from_load` flag: persistent entries (loaded from trailer or promoted on save) skip `delete_packed_data` cleanup. This keeps the mmap region and metadata alive across PTE unload/reload, so the next init hits the cache instead of repacking. Without this, every PTE destroy/recreate cycle appended a fresh copy to the file (~450 MB per cycle).
- No-op save short-circuit: `save_packed_index` returns early when no new `reserve_space` happened since the last save, avoiding the mtime churn that previously made the cache file look modified on every model load.

## Multi-PTE behavior

- Multiple PTEs (or methods that don't share weights) in the same model load share one cache file. Each PTE's `reserve_space` extends the file; `finalize_for_runtime` msyncs only newly added regions; `save_packed_index` writes one trailer covering all PTEs at the end of the load.
- Sibling PTEs that opt out of the mmap path (caller passes empty `packed_cache_path`) early-return from `initialize_for_runtime` and fall through to heap allocation, without touching the singleton's PLLM state.
- Cross-model coexistence relies on caller-side discipline: only models that opt in set a non-empty cache path. Setting different non-empty paths concurrently is not supported by this singleton design.

## Caller change

`XNNPACKBackend::init` always calls `set_packed_cache_path` (with empty string for non-opted-in PTEs). This keeps the singleton path in sync with the current PTE instead of inheriting a sibling's path.

## Test Plan

```
buck2 test fbcode//executorch/backends/xnnpack/test:test_xnn_weights_cache  # 5 pass
buck2 build fbsource//xplat/executorch/backends/xnnpack:xnnpack_backendApple
buck2 build fbsource//xplat/executorch/backends/xnnpack:xnnpack_backend
buck2 build fbcode//executorch/backends/xnnpack:xnnpack_backend
```

On device (iOS Stella build, PLLM + Llama3 runner):
- Cold start: load `(1184 entries)` from cache, `reserve_mmap=0` for cached weights
- Cache file size stable at ~593 MB across PLLM unload/reload cycles
- `app_peak ~700 MB` (vs ~2.5 GB pre-fix)
- `compressed ~100 MB` (vs ~1.7 GB pre-fix)

Differential Revision: D106717093
---
 backends/xnnpack/runtime/XNNPACKBackend.cpp   |  12 +-
 backends/xnnpack/runtime/XNNPACKBackend.h     |   3 +
 backends/xnnpack/runtime/XNNWeightsCache.cpp  | 462 ++++++++++++++----
 backends/xnnpack/runtime/XNNWeightsCache.h    |  35 +-
 .../test/runtime/test_xnn_weights_cache.cpp   | 393 +++++++++++++++
 5 files changed, 807 insertions(+), 98 deletions(-)

diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp
index 3a5d6ab7958..6e1d3b042a5 100644
--- a/backends/xnnpack/runtime/XNNPACKBackend.cpp
+++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp
@@ -100,9 +100,7 @@ class XnnpackBackend final
       lock_weights_cache.lock();
 
       const auto& cache_path = options_.get_packed_cache_path();
-      if (!cache_path.empty()) {
-        weights_cache_->set_packed_cache_path(cache_path);
-      }
+      weights_cache_->set_packed_cache_path(cache_path);
 
       weights_cache_->initialize_for_runtime(
           context.get_runtime_allocator(), named_data_map);
@@ -219,6 +217,14 @@ class XnnpackBackend final
       BackendOptionContext& context,
       const Span<BackendOption>& backend_options) override {
     for (const auto& option : backend_options) {
+      if (strcmp(option.key, xnnpack::save_packed_index_option_key) == 0) {
+        auto* val = std::get_if<bool>(&option.value);
+        if (val && *val) {
+          const std::lock_guard<std::mutex> lock(weights_cache_mutex_);
+          return weights_cache_->save_packed_index();
+        }
+        continue;
+      }
       Error err = options_.set_option(option);
       if (err != Error::Ok) {
         return err;
diff --git a/backends/xnnpack/runtime/XNNPACKBackend.h b/backends/xnnpack/runtime/XNNPACKBackend.h
index e3492c3f5f3..ac9d42b9fbf 100644
--- a/backends/xnnpack/runtime/XNNPACKBackend.h
+++ b/backends/xnnpack/runtime/XNNPACKBackend.h
@@ -20,6 +20,9 @@ const char weight_cache_option_key[] = "weight_cache_enabled";
 // @lint-ignore CLANGTIDY facebook-hte-CArray
 const char packed_cache_path_option_key[] = "packed_cache_path";
 
+/// Trigger saving the packed weight index for cross-load cache reuse.
+const char save_packed_index_option_key[] = "save_packed_index";
+
 /// Workspace sharing mode. This is a backend option that can be set via the
 /// set_option API to control memory sharing between CALL_DELEGATE instances.
 /// This is useful for reducing memory consumption.
diff --git a/backends/xnnpack/runtime/XNNWeightsCache.cpp b/backends/xnnpack/runtime/XNNWeightsCache.cpp
index 70c410e5729..9eab694d529 100644
--- a/backends/xnnpack/runtime/XNNWeightsCache.cpp
+++ b/backends/xnnpack/runtime/XNNWeightsCache.cpp
@@ -24,6 +24,7 @@
 #include <string>
 #include <vector>
 
+
 namespace executorch {
 namespace backends {
 namespace xnnpack {
@@ -32,6 +33,8 @@ namespace delegate {
 using executorch::ET_RUNTIME_NAMESPACE::NamedDataMap;
 using executorch::runtime::MemoryAllocator;
 
+
+
 XNNWeightsCache::XNNWeightsCache() {
   weights_cache_.context = this;
   weights_cache_.look_up = (size_t(*)(
@@ -63,6 +66,72 @@ XNNWeightsCache::~XNNWeightsCache() {
 #endif
 }
 
+// Trailer integer helpers: raw host-byte-order copies (LE only on LE hosts).
+template <typename T>
+static void append_le(std::vector<uint8_t>& buf, T value) {
+  const auto* p = reinterpret_cast<const uint8_t*>(&value);
+  buf.insert(buf.end(), p, p + sizeof(T));
+}
+
+template <typename T>
+static T read_le(const uint8_t* src) {
+  T value;
+  memcpy(&value, src, sizeof(T)); // Byte copy: src may be unaligned for T.
+  return value;
+}
+
+#ifndef _WIN32
+// Opens `path` with `flags` (mode 0600 if created), then tries to take a
+// non-blocking exclusive flock on it. Returns the locked fd, or -1 after
+// logging when either step fails; the caller chooses the fallback.
+static int open_locked(const std::string& path, int flags) {
+  const int handle = open(path.c_str(), flags, 0600);
+  if (handle < 0) {
+    ET_LOG(Error, "open(%s) failed (errno=%d)", path.c_str(), errno);
+    return -1;
+  }
+  if (flock(handle, LOCK_EX | LOCK_NB) == 0) {
+    return handle;
+  }
+  ET_LOG(Error, "flock(%s) failed (errno=%d)", path.c_str(), errno);
+  close(handle);
+  return -1;
+}
+
+// Drop in-memory state that referenced a now-truncated cache file.
+// Heap-backed entries (live in packed_pointer_to_container_) stay; their
+// packed_data_ptrs_ slots remain valid so existing offsets don't shift.
+void XNNWeightsCache::reset_for_fresh_write() {
+  for (auto& region : mmap_regions_) {
+    if (region.addr != nullptr && region.addr != MAP_FAILED) {
+      munmap(region.addr, region.size);
+    }
+  }
+  mmap_regions_.clear();
+  mmap_regions_synced_ = 0;
+  // The save watermark counts mmap_regions_ entries. Left stale, it could
+  // equal the new region count and make save_packed_index skip the trailer.
+  mmap_regions_at_last_save_ = 0;
+  packed_file_used_ = 0;
+  ptr_to_file_offset_.clear();
+  file_ptr_to_region_index_.clear();
+  // Erase metadata for every entry that is not backed by a heap container.
+  for (auto it = name_to_packed_data_metadata_.begin();
+       it != name_to_packed_data_metadata_.end();) {
+    const size_t slot = it->second.offset;
+    void* ptr =
+        slot < packed_data_ptrs_.size() ? packed_data_ptrs_[slot] : nullptr;
+    if (ptr != nullptr &&
+        packed_pointer_to_container_.find(ptr) !=
+            packed_pointer_to_container_.end()) {
+      ++it;
+    } else {
+      it = name_to_packed_data_metadata_.erase(it);
+    }
+  }
+}
+#endif
+
 Error XNNWeightsCache::initialize_for_runtime(
     MemoryAllocator* runtime_allocator,
     const NamedDataMap* named_data_map) {
@@ -71,38 +140,52 @@ Error XNNWeightsCache::initialize_for_runtime(
   is_finalized_ = false;
 
 #ifndef _WIN32
-  // Open the file for packed weights. Each reserve_space() call
-  // independently mmaps a region of the file. Once packed_file_disabled_
-  // is set we never re-open — re-opening with O_TRUNC would corrupt any
-  // still-live mappings into the same path and cause SIGBUS on access.
-  if (!packed_cache_path_.empty() && packed_file_fd_ < 0 &&
-      !packed_file_disabled_) {
-    packed_file_fd_ =
-        open(packed_cache_path_.c_str(), O_RDWR | O_CREAT | O_TRUNC, 0600);
-    if (packed_file_fd_ < 0) {
-      ET_LOG(
-          Error,
-          "Failed to open packed weight file: %s (errno=%d)",
-          packed_cache_path_.c_str(),
-          errno);
-    } else if (flock(packed_file_fd_, LOCK_EX | LOCK_NB) != 0) {
-      // Another XNNWeightsCache instance (this process or another) is
-      // already using this path. O_TRUNC above would corrupt its mappings.
-      // Disable mmap for this instance to prevent collision; fall back to
-      // heap allocation for the remainder of this cache's lifetime.
-      ET_LOG(
-          Error,
-          "Another instance is using packed weight cache file %s (errno=%d); "
-          "disabling mmap path",
-          packed_cache_path_.c_str(),
-          errno);
-      close(packed_file_fd_);
-      packed_file_fd_ = -1;
-      packed_file_disabled_ = true;
-    } else {
-      ET_LOG(Info, "Opened packed weight file: %s", packed_cache_path_.c_str());
-    }
+  if (packed_cache_path_.empty() || packed_file_fd_ >= 0) {
+    return Error::Ok;
+  }
+
+  // Already loaded earlier this session; just reopen the write fd that
+  // save_packed_index() closed. Subsequent reserve_space can extend the
+  // file for any entries not in the saved trailer.
+  if (cache_loaded_) {
+    packed_file_fd_ = open_locked(packed_cache_path_, O_RDWR);
+    return Error::Ok;
+  }
+
+  // First init for this path: try to load the saved trailer; on success
+  // open a write fd for any new entries. If load fails, fall through to
+  // fresh-write below.
+  if (load_packed_cache()) {
+    ET_LOG(
+        Info,
+        "Loaded packed weight cache: %s (%zu entries)",
+        packed_cache_path_.c_str(),
+        name_to_packed_data_metadata_.size());
+    packed_file_fd_ = open_locked(packed_cache_path_, O_RDWR);
+    return Error::Ok;
+  }
+
+  // Fresh write. Skip O_TRUNC in open_locked so a concurrent holder's
+  // mmap stays valid; truncate explicitly only after we hold the lock.
+  packed_file_fd_ = open_locked(packed_cache_path_, O_RDWR | O_CREAT);
+  if (packed_file_fd_ < 0) {
+    return Error::Ok;
+  }
+  if (ftruncate(packed_file_fd_, 0) != 0) {
+    ET_LOG(
+        Error,
+        "ftruncate(0) failed for %s (errno=%d); heap fallback this init",
+        packed_cache_path_.c_str(),
+        errno);
+    close(packed_file_fd_);
+    packed_file_fd_ = -1;
+    return Error::Ok;
   }
+  reset_for_fresh_write();
+  ET_LOG(
+      Info,
+      "Opened packed weight file for writing: %s",
+      packed_cache_path_.c_str());
 #endif
 
   return Error::Ok;
@@ -130,6 +213,7 @@ Result<std::vector<std::string>> XNNWeightsCache::finalize_for_runtime() {
     }
   }
 
+
 #ifndef _WIN32
   // Schedule async flush for newly added regions only.
   // MS_ASYNC returns immediately; OS flushes in the background.
@@ -164,59 +248,86 @@ Result<const uint8_t*> XNNWeightsCache::load_unpacked_data(
       static_cast<const uint8_t*>(named_data.get().data());
   unpacked_data_.push_back(std::move(named_data.get()));
   unpacked_data_to_name_[data_pointer] = name;
-
   return data_pointer;
 }
 
+// Frees one entry's storage: its heap container and/or file-backed mapping.
+void XNNWeightsCache::release_entry(void* packed_data_ptr) {
+  packed_pointer_to_container_.erase(packed_data_ptr);
+#ifndef _WIN32
+  // Drop the stale offset so a reused address is never indexed by a later save.
+  ptr_to_file_offset_.erase(packed_data_ptr);
+  // File-backed entry: munmap its region; the caller nulls the ptr slot.
+  auto region_it = file_ptr_to_region_index_.find(packed_data_ptr);
+  if (region_it != file_ptr_to_region_index_.end()) {
+    MmapRegion& region = mmap_regions_[region_it->second];
+    if (region.addr != nullptr && region.addr != MAP_FAILED) {
+      munmap(region.addr, region.size);
+      region = MmapRegion{nullptr, 0};
+    }
+    file_ptr_to_region_index_.erase(region_it);
+  }
+#endif
+}
+
+// Unmaps all regions, clears pointer tables and cache_loaded_, closes the fd.
+void XNNWeightsCache::full_unload() {
+#ifndef _WIN32
+  for (auto& region : mmap_regions_) {
+    if (region.addr != nullptr && region.addr != MAP_FAILED) {
+      munmap(region.addr, region.size); // Vector is cleared just below.
+    }
+  }
+  mmap_regions_.clear();
+  mmap_regions_synced_ = 0;
+  mmap_regions_at_last_save_ = 0; // A stale count could skip a needed save.
+  packed_data_ptrs_.clear();
+  ptr_to_file_offset_.clear();
+  file_ptr_to_region_index_.clear();
+  cache_loaded_ = false;
+  if (packed_file_fd_ >= 0) {
+    close(packed_file_fd_);
+    packed_file_fd_ = -1;
+  }
+#endif
+}
+
 Error XNNWeightsCache::delete_packed_data(
     const std::vector<std::string>& packed_data_names) {
   if (!is_finalized_) {
     ET_LOG(
         Error,
-        "Error, attempted to delete packed data from the cache but the cache is not finalized");
+        "delete_packed_data called before finalize_for_runtime");
     return Error::InvalidArgument;
   }
   for (const std::string& name : packed_data_names) {
     auto entry = name_to_packed_data_metadata_.find(name);
     if (entry == name_to_packed_data_metadata_.end()) {
-      ET_LOG(
-          Error,
-          "Error, attempted to deleted packed data: %s, from the cache but it wasn't found",
-          name.c_str());
+      ET_LOG(Error, "delete_packed_data: '%s' not found", name.c_str());
       return Error::InvalidArgument;
-    } else {
-      entry->second.ref_count--;
-      if (entry->second.ref_count == 0) {
-        void* packed_data_ptr = packed_data_ptrs_[entry->second.offset];
-        // Erase the key/value from the map frees the pointer holding the
-        // packed data. No-op on the file-backed mmap path, where the
-        // container is not populated.
-        packed_pointer_to_container_.erase(packed_data_ptr);
-#ifndef _WIN32
-        // File-backed mmap path: munmap the region so VM and page-cache
-        // usage is released, not just retained until cache destruction.
-        // The vector slot is set to nullptr below so existing offsets remain
-        // valid for any concurrent lookups.
-        auto region_it = file_ptr_to_region_index_.find(packed_data_ptr);
-        if (region_it != file_ptr_to_region_index_.end()) {
-          size_t idx = region_it->second;
-          MmapRegion& region = mmap_regions_[idx];
-          if (region.addr != nullptr && region.addr != MAP_FAILED) {
-            munmap(region.addr, region.size);
-            region.addr = nullptr;
-            region.size = 0;
-          }
-          file_ptr_to_region_index_.erase(region_it);
-        }
-#endif
-        // Remove the pointer from packed_data_ptrs_.
-        packed_data_ptrs_[entry->second.offset] = nullptr;
-        // Erase the name to packed metadata entry.
-        name_to_packed_data_metadata_.erase(entry->first);
-      }
     }
+    if (--entry->second.ref_count > 0) {
+      continue; // Still referenced by another runtime.
+    }
+    // Keep from_load entries: their packed bytes live in the cache file
+    // and stay valid until full unload. Erasing them would force the
+    // next init to re-pack and append ~450 MB to the file per cycle.
+    if (entry->second.from_load) {
+      entry->second.in_current_runtime = false;
+      continue;
+    }
+    release_entry(packed_data_ptrs_[entry->second.offset]);
+    packed_data_ptrs_[entry->second.offset] = nullptr; // Keep the slot itself.
+    name_to_packed_data_metadata_.erase(entry);
   }
 
+  // Last entry gone: drop all in-memory state. File on disk is preserved
+  // so the next process can load_packed_cache and skip re-packing. If
+  // reserve_space after the last save corrupted the trailer, load will
+  // fall through to fresh-write — same outcome as truncating here.
+  if (name_to_packed_data_metadata_.empty()) {
+    full_unload();
+  }
   return Error::Ok;
 }
 
@@ -226,15 +337,11 @@ size_t XNNWeightsCache::look_up(
   const void* unpacked_weights_ptr = cache_key->kernel;
   const void* unpacked_bias_ptr = cache_key->bias;
   auto entry = context->unpacked_data_to_name_.find(unpacked_weights_ptr);
-
-  // Check if weight_pointer has been cached
   if (entry == context->unpacked_data_to_name_.end()) {
     return SIZE_MAX;
   }
-
   std::string weight_bias_name = entry->second;
 
-  // Check if bias_pointer has been cached
   if (unpacked_bias_ptr != nullptr) {
     auto bias_entry = context->unpacked_data_to_name_.find(unpacked_bias_ptr);
     if (bias_entry != context->unpacked_data_to_name_.end()) {
@@ -242,14 +349,12 @@ size_t XNNWeightsCache::look_up(
     }
   }
 
-  // check if weight_bias_name has been packed already
   auto packed_weight_entry =
       context->name_to_packed_data_metadata_.find(weight_bias_name);
   if (packed_weight_entry == context->name_to_packed_data_metadata_.end()) {
     return SIZE_MAX;
   }
   packed_weight_entry->second.in_current_runtime = true;
-
   return packed_weight_entry->second.offset;
 }
 
@@ -264,16 +369,11 @@ void* XNNWeightsCache::reserve_space(XNNWeightsCache* context, size_t n) {
     if (ftruncate(context->packed_file_fd_, file_offset + map_size) != 0) {
       ET_LOG(
           Error,
-          "ftruncate to %zu failed (errno=%d)",
+          "reserve_space ftruncate to %zu failed (errno=%d)",
           file_offset + map_size,
           errno);
       close(context->packed_file_fd_);
       context->packed_file_fd_ = -1;
-      // Existing mmap_regions_ still reference this inode. Disable the
-      // file-backed path permanently so a future initialize_for_runtime
-      // doesn't re-open + O_TRUNC the same path and trigger SIGBUS on the
-      // stale mappings.
-      context->packed_file_disabled_ = true;
       return context->reserve_space_heap(n);
     }
 
@@ -285,15 +385,18 @@ void* XNNWeightsCache::reserve_space(XNNWeightsCache* context, size_t n) {
         context->packed_file_fd_,
         file_offset);
     if (ptr == MAP_FAILED) {
-      ET_LOG(Error, "mmap %zu bytes failed (errno=%d)", map_size, errno);
+      ET_LOG(
+          Error,
+          "reserve_space mmap %zu bytes failed (errno=%d)",
+          map_size,
+          errno);
       close(context->packed_file_fd_);
       context->packed_file_fd_ = -1;
-      context->packed_file_disabled_ = true;
       return context->reserve_space_heap(n);
     }
 
     // mmap returns page-aligned (>= 4 KiB), which trivially satisfies the
-    // 64-byte kPackedAllocationAlignment XNNPACK expects. Assert defensively.
+    // 64-byte kPackedAllocationAlignment XNNPACK expects.
     ET_DCHECK_MSG(
         (reinterpret_cast<uintptr_t>(ptr) % kPackedAllocationAlignment) == 0,
         "mmap returned ptr not aligned to %zu bytes",
@@ -302,10 +405,10 @@ void* XNNWeightsCache::reserve_space(XNNWeightsCache* context, size_t n) {
     context->packed_file_used_ = file_offset + map_size;
     context->file_ptr_to_region_index_[ptr] = context->mmap_regions_.size();
     context->mmap_regions_.push_back({ptr, map_size});
+    context->ptr_to_file_offset_[ptr] = file_offset;
     return ptr;
   }
 #endif
-
   return context->reserve_space_heap(n);
 }
 
@@ -343,11 +446,8 @@ size_t XNNWeightsCache::look_up_or_insert(
     size_t size) {
   size_t offset = context->look_up(context, cache_key);
 
-  // XNNPACK can call this with ptr==nullptr when it previously hit the cache
-  // and skipped packing. We can't validate against the ptr contents in this
-  // case, so just return the offset. This might actually be a bug in XNNPACK
-  // since calling look_up_or_insert with ptr==nullptr doesn't really make
-  // sense...
+  // XNNPACK calls with ptr==nullptr after a cache hit (no packing
+  // happened, nothing to validate against). Return the offset as-is.
   if (ptr == nullptr) {
     return offset;
   }
@@ -357,7 +457,7 @@ size_t XNNWeightsCache::look_up_or_insert(
     if (saved_ptr != nullptr && 0 == memcmp(ptr, saved_ptr, size)) {
       return offset;
     }
-    // Failure, cache is out of date
+    // Cache out of date: name hits but packed bytes differ.
     return SIZE_MAX;
   }
 
@@ -376,6 +476,7 @@ size_t XNNWeightsCache::look_up_or_insert(
     }
     PackedDataMeta packed_data_metadata;
     packed_data_metadata.offset = next_offset;
+    packed_data_metadata.data_size = size;
     packed_data_metadata.ref_count =
         0; // ref_count is only incremented after finalizing for runtime
     packed_data_metadata.in_current_runtime = true;
@@ -408,6 +509,189 @@ void XNNWeightsCache::set_packed_cache_path(const std::string& path) {
   packed_cache_path_ = path;
 }
 
+Error XNNWeightsCache::save_packed_index() {
+#ifndef _WIN32
+  if (packed_file_fd_ < 0) {
+    return Error::Ok; // No open cache file: nothing to persist.
+  }
+  // Skip no-op saves: identical bytes would still bump mtime via
+  // pwrite/fsync, making the cache file appear modified on every load.
+  if (mmap_regions_.size() == mmap_regions_at_last_save_ &&
+      mmap_regions_at_last_save_ > 0) {
+    return Error::Ok; // Note: this early return leaves the fd open.
+  }
+
+  size_t index_start = packed_file_used_; // Trailer follows the packed data.
+  std::vector<uint8_t> buf;
+  uint32_t entry_count = 0;
+
+  // Index entry: [name_len:u32][name][file_offset:u64][data_size:u64]
+  for (const auto& [name, meta] : name_to_packed_data_metadata_) {
+    void* ptr = packed_data_ptrs_[meta.offset];
+    auto it = ptr_to_file_offset_.find(ptr);
+    if (it == ptr_to_file_offset_.end()) {
+      continue; // No file offset recorded for this pointer: not indexed.
+    }
+    entry_count++;
+    append_le(buf, static_cast<uint32_t>(name.size()));
+    buf.insert(buf.end(), name.begin(), name.end());
+    append_le(buf, static_cast<uint64_t>(it->second));
+    append_le(buf, static_cast<uint64_t>(meta.data_size));
+  }
+
+  // Footer: [index_start:u64][entry_count:u32][magic:u32][version:u32]
+  append_le(buf, static_cast<uint64_t>(index_start));
+  append_le(buf, entry_count);
+  append_le(buf, kCacheMagic);
+  append_le(buf, kCacheVersion);
+
+  if (ftruncate(packed_file_fd_, index_start + buf.size()) != 0) {
+    ET_LOG(Error, "Failed to extend file for index (errno=%d)", errno);
+    return Error::Internal; // The fd stays open on this error path.
+  }
+  ssize_t written =
+      pwrite(packed_file_fd_, buf.data(), buf.size(), index_start);
+  if (written != static_cast<ssize_t>(buf.size())) {
+    ET_LOG(Error, "Failed to write index (errno=%d)", errno);
+    return Error::Internal; // A short write is treated as failure; no retry.
+  }
+  // Ensure trailer is on disk before we declare success.
+  if (fsync(packed_file_fd_) != 0) {
+    ET_LOG(Error, "fsync of packed cache failed (errno=%d)", errno);
+    // Continue — data is in page cache; durability is best-effort.
+  }
+  ET_LOG(
+      Info,
+      "Saved packed weight index: %u entries at offset %zu",
+      entry_count,
+      index_start);
+
+  // Promote freshly-packed entries to from_load now that they're durable
+  // on disk, so delete_packed_data preserves them across unload/reload.
+  for (auto& [name, meta] : name_to_packed_data_metadata_) {
+    if (!meta.from_load && ptr_to_file_offset_.find(
+                               packed_data_ptrs_[meta.offset]) !=
+                               ptr_to_file_offset_.end()) {
+      meta.from_load = true;
+    }
+  }
+
+  mmap_regions_at_last_save_ = mmap_regions_.size();
+
+  // Close the fd so the next init re-enters load_packed_cache and reads
+  // the trailer we just wrote.
+  if (close(packed_file_fd_) != 0) {
+    ET_LOG(Error, "close of packed cache fd failed (errno=%d)", errno);
+  }
+  packed_file_fd_ = -1;
+#endif
+  return Error::Ok;
+}
+
+// Loads the saved index via mmap; on failure returns false, state untouched.
+bool XNNWeightsCache::load_packed_cache() {
+#ifndef _WIN32
+  int fd = open(packed_cache_path_.c_str(), O_RDONLY);
+  if (fd < 0) {
+    return false;
+  }
+  // Prevent racing with a concurrent writer
+  if (flock(fd, LOCK_SH | LOCK_NB) != 0) {
+    close(fd);
+    return false;
+  }
+  struct stat st;
+  if (fstat(fd, &st) != 0 || st.st_size < 20) {
+    close(fd);
+    return false;
+  }
+  size_t file_size = static_cast<size_t>(st.st_size);
+
+  uint8_t footer[20];
+  if (pread(fd, footer, 20, file_size - 20) != 20) {
+    close(fd);
+    return false;
+  }
+  uint64_t index_start = read_le<uint64_t>(footer);
+  uint32_t entry_count = read_le<uint32_t>(footer + 8);
+  uint32_t magic = read_le<uint32_t>(footer + 12);
+  uint32_t version = read_le<uint32_t>(footer + 16);
+
+  if (magic != kCacheMagic || version != kCacheVersion ||
+      index_start >= file_size - 20) {
+    close(fd);
+    return false;
+  }
+  const size_t index_region_end = file_size - 20;
+
+  void* map = mmap(nullptr, file_size, PROT_READ, MAP_SHARED, fd, 0);
+  close(fd);
+  if (map == MAP_FAILED) {
+    return false;
+  }
+  // Stage and validate the whole index before touching any member, so a
+  // corrupt trailer cannot wipe or half-update already-registered entries.
+  struct IndexEntry {
+    std::string name;
+    uint64_t file_offset;
+    uint64_t data_size;
+  };
+  std::vector<IndexEntry> entries;
+  const uint8_t* cursor = static_cast<const uint8_t*>(map) + index_start;
+  const uint8_t* end = static_cast<const uint8_t*>(map) + index_region_end;
+  for (uint32_t i = 0; i < entry_count; ++i) {
+    // name_len is untrusted: compare sizes, never form a pointer past `end`.
+    const size_t remaining = static_cast<size_t>(end - cursor);
+    if (remaining < 20 || read_le<uint32_t>(cursor) > remaining - 20) {
+      break; // Truncated index: rejected by the count check below.
+    }
+    const uint32_t name_len = read_le<uint32_t>(cursor);
+    const char* name = reinterpret_cast<const char*>(cursor + 4);
+    const uint64_t file_offset = read_le<uint64_t>(cursor + 4 + name_len);
+    const uint64_t data_size = read_le<uint64_t>(cursor + 12 + name_len);
+    cursor += 20 + static_cast<size_t>(name_len);
+    // Bounds check: the entry must lie entirely inside the packed-data region.
+    if (file_offset >= index_start || data_size > index_start - file_offset) {
+      ET_LOG(
+          Error,
+          "load_packed_cache: entry '%s' has invalid bounds (file_offset=%llu, data_size=%llu, index_start=%llu); aborting load",
+          std::string(name, name_len).c_str(),
+          static_cast<unsigned long long>(file_offset),
+          static_cast<unsigned long long>(data_size),
+          static_cast<unsigned long long>(index_start));
+      break;
+    }
+    entries.push_back({std::string(name, name_len), file_offset, data_size});
+  }
+  if (entries.size() != entry_count) {
+    munmap(map, file_size);
+    return false;
+  }
+  mmap_regions_.push_back({map, file_size});
+  for (const IndexEntry& entry : entries) {
+    void* entry_ptr = static_cast<char*>(map) + entry.file_offset;
+    // Tracked so a subsequent save_packed_index can rewrite the trailer
+    // covering both loaded and newly-packed entries.
+    ptr_to_file_offset_[entry_ptr] = entry.file_offset;
+    PackedDataMeta meta;
+    meta.offset = packed_data_ptrs_.size();
+    meta.data_size = entry.data_size;
+    meta.ref_count = 0;
+    meta.in_current_runtime = false;
+    meta.from_load = true;
+    packed_data_ptrs_.push_back(entry_ptr);
+    name_to_packed_data_metadata_[entry.name] = meta;
+  }
+  cache_loaded_ = true;
+  packed_file_used_ = index_start;
+  // State now matches the on-disk trailer, so the next save can short-circuit.
+  mmap_regions_at_last_save_ = mmap_regions_.size();
+  return true;
+#else
+  return false;
+#endif
+}
+
 } // namespace delegate
 } // namespace xnnpack
 } // namespace backends
diff --git a/backends/xnnpack/runtime/XNNWeightsCache.h b/backends/xnnpack/runtime/XNNWeightsCache.h
index a41fed49fd1..d6910b31ce3 100644
--- a/backends/xnnpack/runtime/XNNWeightsCache.h
+++ b/backends/xnnpack/runtime/XNNWeightsCache.h
@@ -31,11 +31,19 @@ using executorch::runtime::Result;
 
 struct PackedDataMeta {
   size_t offset;
+  size_t data_size{0}; // Packed size in bytes; written to the saved index.
   // Count number of xnn_runtime_t this packed data is used in
   size_t ref_count;
   // true if this packed data was inserted or looked up for the
   // current runtime being created
   bool in_current_runtime;
+  // True if this entry's bytes are persisted in the on-disk cache file
+  // (either originally loaded via load_packed_cache, or freshly packed
+  // and then save_packed_index-ed). delete_packed_data keeps such
+  // entries (and their mapping) when ref_count drops to zero instead of
+  // releasing them, so a later runtime init can hit the cache rather
+  // than re-packing and appending to the file.
+  bool from_load{false};
 };
 
 class XNNWeightsCache {
@@ -138,7 +146,16 @@ class XNNWeightsCache {
    */
   void set_packed_cache_path(const std::string& path);
 
+  /** Save packed weight index so subsequent loads skip packing. */
+  Error save_packed_index();
+
  private:
+  static constexpr uint32_t kCacheMagic = 0x58505743; // "XPWC"
+  static constexpr uint32_t kCacheVersion = 1;
+  bool load_packed_cache();
+  void reset_for_fresh_write();
+  void release_entry(void* packed_data_ptr);
+  void full_unload();
   // Runtime Allocator used to reserve memory for packed weights
   MemoryAllocator* runtime_allocator_;
 
@@ -167,18 +184,24 @@ class XNNWeightsCache {
   std::string packed_cache_path_;
   int packed_file_fd_{-1};
   size_t packed_file_used_{0};
-  // Set after an unrecoverable mmap/ftruncate failure. Prevents re-opening
-  // the cache file on subsequent initialize_for_runtime() calls — re-opening
-  // with O_TRUNC would truncate the inode beneath any still-live mmap pages
-  // and the next access would raise SIGBUS. Once disabled, all reserve_space
-  // calls fall back to heap allocation for the lifetime of this cache.
-  bool packed_file_disabled_{false};
+  // True once load_packed_cache() has populated metadata from a saved
+  // index; reset by full_unload(). While set, initialize_for_runtime()
+  // only reopens the write fd instead of re-reading the trailer.
+  bool cache_loaded_{false};
+  // Tracks file offset of each file-backed allocation. Used by
+  // save_packed_index() to serialize (name → offset, size) index.
+  std::unordered_map<void*, size_t> ptr_to_file_offset_;
   struct MmapRegion {
     void* addr;
     size_t size;
   };
   std::vector<MmapRegion> mmap_regions_;
   size_t mmap_regions_synced_{0};
+  // Number of regions present at the time of the most recent successful
+  // save_packed_index. Used to skip no-op saves (trailer would be byte-
+  // identical, but pwrite/fsync still bump mtime, making it look like the
+  // cache file is constantly "modified" when nothing has actually changed).
+  size_t mmap_regions_at_last_save_{0};
   // For file-backed packed allocations, maps the returned ptr to its index
   // in mmap_regions_, so delete_packed_data() can munmap when ref_count==0.
   std::unordered_map<void*, size_t> file_ptr_to_region_index_;
diff --git a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
index 83937887e25..4639d96152d 100644
--- a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
+++ b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
@@ -17,6 +17,9 @@
 #include <executorch/runtime/platform/runtime.h>
 #include <executorch/schema/program_generated.h>
 #include <gtest/gtest.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fstream>
 #include <xnnpack.h>
 
 using executorch::backends::xnnpack::delegate::XNNWeightsCache;
@@ -352,4 +355,394 @@ TEST_F(XNNWeightsCacheTest, PackedWeightsMmapPathLockCollision) {
 
   ::unlink(cache_path.c_str());
 }
+
+// Runs the same graph three ways: heap-only pack, file-backed pack + save,
+// and a fresh instance that loads the saved file. Outputs are compared with
+// exact equality to catch weight pointers mis-mapped after a cache load.
+TEST_F(XNNWeightsCacheTest, SaveAndLoad_PreservesInferenceOutput) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_output_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input_tensor(
+      num_batches * input_channels + padding, 1.0f);
+
+  // Run 1: no cache path is set on this instance; its output is the baseline.
+  std::vector<float> output_baseline(num_batches * output_channels, 0.0f);
+  {
+    XNNWeightsCache cache;
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input_tensor.data(),
+        output_baseline.data());
+  }
+
+  // Run 2: cache path set; pack, run, then persist the index trailer.
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    std::vector<float> output_write(num_batches * output_channels, 0.0f);
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input_tensor.data(),
+        output_write.data());
+    ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+    EXPECT_EQ(output_write, output_baseline);
+  }
+
+  // Run 3: fresh instance; names exist before any build, output must match.
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    ASSERT_GT(cache.get_packed_data_names().size(), 0u);
+    std::vector<float> output_load(num_batches * output_channels, 0.0f);
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input_tensor.data(),
+        output_load.data());
+    EXPECT_EQ(output_load, output_baseline);
+  }
+
+  ::unlink(cache_path.c_str());
+}
+
+// A file with no valid footer must not crash init; the cache is expected to
+// ignore it and pack from scratch on the same path.
+TEST_F(XNNWeightsCacheTest, LoadPackedCache_RejectsCorruptTrailer) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_corrupt_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  // 1024 bytes of 0xCC: room for a 20-byte footer, but no "XPWC" magic.
+  {
+    std::ofstream f(cache_path, std::ios::binary);
+    std::vector<char> garbage(1024, '\xCC');
+    f.write(garbage.data(), garbage.size());
+  }
+
+  XNNWeightsCache cache;
+  cache.set_packed_cache_path(cache_path);
+  // Must not crash and must report success despite the unusable file.
+  Error err =
+      cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+  ASSERT_EQ(err, Error::Ok);
+
+  // Fresh build must still run (smoke check only; output is not compared).
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input(num_batches * input_channels + padding, 1.0f);
+  std::vector<float> output(num_batches * output_channels, 0.0f);
+  BuildAndRunGraphWithWeightsCache(
+      cache,
+      batches,
+      input_channels,
+      output_channels,
+      input.data(),
+      output.data());
+
+  ::unlink(cache_path.c_str());
+}
+
+// Repeated init+run+save cycles on the same file must not grow the cache
+// file. Guards against the regression where each PTE init re-packed weights
+// and appended a fresh copy (hundreds of MB per cycle seen in production).
+TEST_F(XNNWeightsCacheTest, MultiSessionLoad_DoesNotGrowCacheFile) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_nogrow_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input(num_batches * input_channels + padding, 1.0f);
+  std::vector<float> output(num_batches * output_channels, 0.0f);
+
+  // Cycle 1: fresh write; record the file size right after the first save.
+  off_t size_after_first_save = 0;
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        output.data());
+    ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+    struct stat st {};
+    ASSERT_EQ(::stat(cache_path.c_str(), &st), 0);
+    size_after_first_save = st.st_size;
+    ASSERT_GT(size_after_first_save, 0);
+  }
+
+  // Cycle 2: fresh instance loads from disk, runs, saves. No new weights
+  // were packed → st_size must equal the size recorded in cycle 1.
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    ASSERT_GT(cache.get_packed_data_names().size(), 0u);
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        output.data());
+    ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+  }
+  {
+    struct stat st {};
+    ASSERT_EQ(::stat(cache_path.c_str(), &st), 0);
+    EXPECT_EQ(st.st_size, size_after_first_save);
+  }
+
+  // Cycle 3: simulate PTE destroy + recreate inside the same instance.
+  // delete_packed_data on from_load entries must not erase metadata, so
+  // the second init's look_up still hits → no new file append.
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        output.data());
+    cache.delete_packed_data(cache.get_packed_data_names());
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        output.data());
+    ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+  }
+  {
+    struct stat st {};
+    ASSERT_EQ(::stat(cache_path.c_str(), &st), 0);
+    EXPECT_EQ(st.st_size, size_after_first_save);
+  }
+
+  ::unlink(cache_path.c_str());
+}
+
+// After loading from disk, delete_packed_data must skip from_load entries
+// so the next init still hits the cache. Otherwise weights would be re-packed
+// from scratch each time the backend destroys + recreates a delegate.
+TEST_F(XNNWeightsCacheTest, DeletePackedData_OnFromLoadEntries_PreservesMetadata) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_fromload_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input(num_batches * input_channels + padding, 1.0f);
+  std::vector<float> output(num_batches * output_channels, 0.0f);
+
+  // Seed the cache file with a scoped instance that packs, runs and saves.
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        output.data());
+    ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+  }
+
+  // Fresh instance: names present right after init came from the saved file.
+  XNNWeightsCache cache;
+  cache.set_packed_cache_path(cache_path);
+  cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+  size_t loaded_count = cache.get_packed_data_names().size();
+  ASSERT_GT(loaded_count, 0u);
+
+  BuildAndRunGraphWithWeightsCache(
+      cache,
+      batches,
+      input_channels,
+      output_channels,
+      input.data(),
+      output.data());
+
+  // Repeated delete must never erase from_load entries — contrast with
+  // ReusePackedWeights where two delete calls drop the count to 0.
+  for (int i = 0; i < 5; ++i) {
+    cache.delete_packed_data(cache.get_packed_data_names());
+    EXPECT_EQ(cache.get_packed_data_names().size(), loaded_count)
+        << "from_load entries should survive delete; iteration " << i;
+  }
+
+  ::unlink(cache_path.c_str());
+}
+
+// A model with multiple PTE/method delegates initializes the cache
+// sequentially before any one is destroyed. Later inits must reuse the
+// entries packed by the first one; this is checked indirectly: the file
+// size and the packed-name count must stay constant across all three inits.
+TEST_F(XNNWeightsCacheTest, MultiplePTEsInSameInstance_NoFileGrowth) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_multipte_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input(num_batches * input_channels + padding, 1.0f);
+  std::vector<float> out_pte1(num_batches * output_channels, 0.0f);
+  std::vector<float> out_pte2(num_batches * output_channels, 0.0f);
+
+  XNNWeightsCache cache;
+  cache.set_packed_cache_path(cache_path);
+
+  // PTE 1: fresh pack + save.
+  cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+  BuildAndRunGraphWithWeightsCache(
+      cache,
+      batches,
+      input_channels,
+      output_channels,
+      input.data(),
+      out_pte1.data());
+  ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+
+  off_t size_after_pte1 = 0;
+  {
+    struct stat st {};
+    ASSERT_EQ(::stat(cache_path.c_str(), &st), 0);
+    size_after_pte1 = st.st_size;
+    ASSERT_GT(size_after_pte1, 0);
+  }
+  size_t names_after_pte1 = cache.get_packed_data_names().size();
+  ASSERT_GT(names_after_pte1, 0u);
+
+  // PTE 2: sibling delegate, NO destroy between. A look_up hit on PTE 1's
+  // entry means nothing new is reserved; observed here as an unchanged
+  // file size and name count after the second save.
+  cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+  BuildAndRunGraphWithWeightsCache(
+      cache,
+      batches,
+      input_channels,
+      output_channels,
+      input.data(),
+      out_pte2.data());
+  ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+
+  {
+    struct stat st {};
+    ASSERT_EQ(::stat(cache_path.c_str(), &st), 0);
+    EXPECT_EQ(st.st_size, size_after_pte1)
+        << "PTE 2 with same weights must not append to the cache file";
+  }
+  EXPECT_EQ(cache.get_packed_data_names().size(), names_after_pte1);
+
+  // Same input through the shared entries must give the same output.
+  EXPECT_EQ(out_pte1, out_pte2);
+
+  // PTE 3: third sibling. Still no growth.
+  std::vector<float> out_pte3(num_batches * output_channels, 0.0f);
+  cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+  BuildAndRunGraphWithWeightsCache(
+      cache,
+      batches,
+      input_channels,
+      output_channels,
+      input.data(),
+      out_pte3.data());
+  ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+  {
+    struct stat st {};
+    ASSERT_EQ(::stat(cache_path.c_str(), &st), 0);
+    EXPECT_EQ(st.st_size, size_after_pte1);
+  }
+  EXPECT_EQ(out_pte3, out_pte1);
+
+  ::unlink(cache_path.c_str());
+}
+
+// save_packed_index must be a true no-op when no new reserve_space happened
+// since the last save — same content but writing would still bump mtime,
+// making the cache file look modified on every model load.
+TEST_F(XNNWeightsCacheTest, SavePackedIndex_NoNewReserves_IsNoOp) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_noop_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input(num_batches * input_channels + padding, 1.0f);
+  std::vector<float> output(num_batches * output_channels, 0.0f);
+
+  // Seed the cache and perform the first (real) save.
+  XNNWeightsCache cache;
+  cache.set_packed_cache_path(cache_path);
+  cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+  BuildAndRunGraphWithWeightsCache(
+      cache,
+      batches,
+      input_channels,
+      output_channels,
+      input.data(),
+      output.data());
+  ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+
+  struct stat st_before {};
+  ASSERT_EQ(::stat(cache_path.c_str(), &st_before), 0);
+
+  // st_mtime has 1 s resolution; sleep so a real write would change it.
+  ::sleep(1);
+
+  // Second save with nothing packed in between: size and mtime must not move.
+  ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+
+  struct stat st_after {};
+  ASSERT_EQ(::stat(cache_path.c_str(), &st_after), 0);
+  EXPECT_EQ(st_before.st_size, st_after.st_size);
+  EXPECT_EQ(st_before.st_mtime, st_after.st_mtime);
+
+  ::unlink(cache_path.c_str());
+}
+
 #endif

From 7739ca2bdec506610d1027650b16466fbb4cb970 Mon Sep 17 00:00:00 2001
From: Longfang Zhao <longfangzhao@meta.com>
Date: Tue, 9 Jun 2026 16:42:26 -0700
Subject: [PATCH 2/2] Per-entry seed in XNNPACK weights cache for
 XNNPACK-upgrade invalidation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
XNNPACK exposes `xnn_weights_cache_look_up_key.seed` — a per-ukernel value that XNNPACK guarantees is consistent across runs of the same ukernel and changes whenever a ukernel implementation changes. Store this seed per cache entry so a stale cached packing produced by an old XNNPACK ukernel is rejected after upgrade, instead of being handed back to a newer ukernel that expects a different layout.

Changes:
- `PackedDataMeta` gains `uint32_t seed{0}`.
- `look_up` rejects (returns `SIZE_MAX`) when a name hit has a stored seed that doesn't match `cache_key->seed`. This forces `look_up_or_insert` to re-pack with the current ukernel and avoids the slow `memcmp` path catching it later.
- `look_up_or_insert` records `cache_key->seed` on insert.
- On-disk index entry layout extended to `[name_len:u32][name][file_offset:u64][data_size:u64][seed:u32]` (was 16 bytes after the name, now 20).
- `load_packed_cache` reads the per-entry seed and bumps the trailing bytes bound check accordingly.
- `kCacheVersion` bumped 1 → 2 so existing v1 files (which carry no seed) are rejected at load instead of being loaded with `seed=0` and mismatching every fresh `look_up`.

Cleanup of orphaned in-memory and on-disk entries left by an invalidated look-up is a follow-up — this diff only adds the detection.

Differential Revision: D108082431
---
 backends/xnnpack/runtime/XNNWeightsCache.cpp  |  23 +-
 backends/xnnpack/runtime/XNNWeightsCache.h    |  13 +-
 .../test/runtime/test_xnn_weights_cache.cpp   | 227 ++++++++++++++++++
 3 files changed, 260 insertions(+), 3 deletions(-)

diff --git a/backends/xnnpack/runtime/XNNWeightsCache.cpp b/backends/xnnpack/runtime/XNNWeightsCache.cpp
index 9eab694d529..f75ff8adb94 100644
--- a/backends/xnnpack/runtime/XNNWeightsCache.cpp
+++ b/backends/xnnpack/runtime/XNNWeightsCache.cpp
@@ -354,6 +354,19 @@ size_t XNNWeightsCache::look_up(
   if (packed_weight_entry == context->name_to_packed_data_metadata_.end()) {
     return SIZE_MAX;
   }
+  // XNNPACK upgrade detection: a ukernel whose implementation changed
+  // produces a different seed. Reject the cached entry so look_up_or_insert
+  // falls through to re-pack with the current ukernel.
+  if (packed_weight_entry->second.seed != cache_key->seed) {
+    ET_LOG(
+        Info,
+        "look_up: seed mismatch for '%s' (cached=0x%08x, current=0x%08x); "
+        "treating as miss for re-pack",
+        weight_bias_name.c_str(),
+        packed_weight_entry->second.seed,
+        cache_key->seed);
+    return SIZE_MAX;
+  }
   packed_weight_entry->second.in_current_runtime = true;
   return packed_weight_entry->second.offset;
 }
@@ -480,6 +493,7 @@ size_t XNNWeightsCache::look_up_or_insert(
     packed_data_metadata.ref_count =
         0; // ref_count is only incremented after finalizing for runtime
     packed_data_metadata.in_current_runtime = true;
+    packed_data_metadata.seed = cache_key->seed;
     context->name_to_packed_data_metadata_[weight_bias_name] =
         packed_data_metadata;
   } else {
@@ -525,7 +539,7 @@ Error XNNWeightsCache::save_packed_index() {
   std::vector<uint8_t> buf;
   uint32_t entry_count = 0;
 
-  // Index entry: [name_len:u32][name][file_offset:u64][data_size:u64]
+  // Index entry: [name_len:u32][name][file_offset:u64][data_size:u64][seed:u32]
   for (const auto& [name, meta] : name_to_packed_data_metadata_) {
     void* ptr = packed_data_ptrs_[meta.offset];
     auto it = ptr_to_file_offset_.find(ptr);
@@ -537,6 +551,7 @@ Error XNNWeightsCache::save_packed_index() {
     buf.insert(buf.end(), name.begin(), name.end());
     append_le(buf, static_cast<uint64_t>(it->second));
     append_le(buf, static_cast<uint64_t>(meta.data_size));
+    append_le(buf, meta.seed);
   }
 
   // Footer: [index_start:u64][entry_count:u32][magic:u32][version:u32]
@@ -636,7 +651,8 @@ bool XNNWeightsCache::load_packed_cache() {
   for (uint32_t i = 0; i < entry_count && cursor + 4 <= end; ++i) {
     uint32_t name_len = read_le<uint32_t>(cursor);
     cursor += 4;
-    if (cursor + name_len + 16 > end) {
+    // [file_offset:u64][data_size:u64][seed:u32] = 20 bytes; no ptr wrap
+    if (static_cast<uint64_t>(end - cursor) < uint64_t{name_len} + 20) {
       break;
     }
     std::string name(reinterpret_cast<const char*>(cursor), name_len);
@@ -645,6 +661,8 @@ bool XNNWeightsCache::load_packed_cache() {
     cursor += 8;
     uint64_t data_size = read_le<uint64_t>(cursor);
     cursor += 8;
+    uint32_t seed = read_le<uint32_t>(cursor);
+    cursor += 4;
 
     // Bounds check: the entry's bytes must lie entirely inside the
     // packed-data region
@@ -678,6 +696,7 @@ bool XNNWeightsCache::load_packed_cache() {
     meta.ref_count = 0;
     meta.in_current_runtime = false;
     meta.from_load = true;
+    meta.seed = seed;
     name_to_packed_data_metadata_[name] = meta;
   }
 
diff --git a/backends/xnnpack/runtime/XNNWeightsCache.h b/backends/xnnpack/runtime/XNNWeightsCache.h
index d6910b31ce3..f00713e5a1c 100644
--- a/backends/xnnpack/runtime/XNNWeightsCache.h
+++ b/backends/xnnpack/runtime/XNNWeightsCache.h
@@ -44,6 +44,13 @@ struct PackedDataMeta {
   // cache_loaded_ is auto-invalidated so the next init re-enters
   // load_packed_cache and reuses the saved file instead of re-packing.
   bool from_load{false};
+  // Per-ukernel seed from xnn_weights_cache_look_up_key.seed. XNNPACK
+  // guarantees this is consistent across runs of the same ukernel; when
+  // XNNPACK upgrades and a ukernel implementation changes, the seed
+  // changes. look_up rejects entries whose stored seed doesn't match
+  // the caller's seed so that stale cache entries don't deliver wrongly
+  // packed weights to a newer ukernel.
+  uint32_t seed{0};
 };
 
 class XNNWeightsCache {
@@ -151,7 +158,11 @@ class XNNWeightsCache {
 
  private:
   static constexpr uint32_t kCacheMagic = 0x58505743; // "XPWC"
-  static constexpr uint32_t kCacheVersion = 1;
+  // Bump when the on-disk layout (footer or per-entry record) changes.
+  // v2: per-entry seed added — old v1 files don't carry seeds and would
+  // load with seed=0, mismatching every fresh look_up with a non-zero
+  // seed, causing a stampede of re-packs. Reject v1 outright.
+  static constexpr uint32_t kCacheVersion = 2;
   bool load_packed_cache();
   void reset_for_fresh_write();
   void release_entry(void* packed_data_ptr);
diff --git a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
index 4639d96152d..80b19865024 100644
--- a/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
+++ b/backends/xnnpack/test/runtime/test_xnn_weights_cache.cpp
@@ -699,6 +699,233 @@ TEST_F(XNNWeightsCacheTest, MultiplePTEsInSameInstance_NoFileGrowth) {
   ::unlink(cache_path.c_str());
 }
 
+namespace {
+
+// Little-endian encode/decode helpers for XNNWeightsCache's on-disk format.
+uint32_t read_le_u32(const uint8_t* p) {
+  uint32_t v = 0;
+  for (int i = 0; i < 4; ++i) {
+    v |= static_cast<uint32_t>(p[i]) << (8 * i);
+  }
+  return v;
+}
+uint64_t read_le_u64(const uint8_t* p) {
+  uint64_t v = 0;
+  for (int i = 0; i < 8; ++i) {
+    v |= static_cast<uint64_t>(p[i]) << (8 * i);
+  }
+  return v;
+}
+void write_le_u32(std::ostream& f, uint32_t v) {
+  for (int i = 0; i < 4; ++i) {
+    char b = static_cast<char>((v >> (8 * i)) & 0xff);
+    f.write(&b, 1);
+  }
+}
+void write_le_u64(std::ostream& f, uint64_t v) {
+  for (int i = 0; i < 8; ++i) {
+    char b = static_cast<char>((v >> (8 * i)) & 0xff);
+    f.write(&b, 1);
+  }
+}
+
+} // namespace
+
+// A cache file stamped kCacheVersion=1 must be rejected at load: v1 records
+// carry no seed. The fixture holds one v2-shaped record, so (as far as the
+// visible parser checks go) only the version stamp explains an empty list.
+TEST_F(XNNWeightsCacheTest, LoadPackedCache_RejectsV1Format) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_v1_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+  {
+    std::ofstream f(cache_path, std::ios::binary);
+    f << std::string(64, '\0'); // 64-byte packed-data region
+    write_le_u32(f, 1); // name_len
+    f.put('w'); // name
+    write_le_u64(f, 0); // file_offset
+    write_le_u64(f, 64); // data_size
+    write_le_u32(f, 0); // seed
+    write_le_u64(f, 64); // index_start
+    write_le_u32(f, 1); // entry_count
+    write_le_u32(f, 0x58505743); // kCacheMagic "XPWC"
+    write_le_u32(f, 1); // OLD kCacheVersion = 1
+  }
+
+  XNNWeightsCache cache;
+  cache.set_packed_cache_path(cache_path);
+  Error err =
+      cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+  ASSERT_EQ(err, Error::Ok);
+  EXPECT_EQ(cache.get_packed_data_names().size(), 0u);
+
+  ::unlink(cache_path.c_str());
+}
+
+// Verify save_packed_index writes the schema version 2 footer and embeds a
+// 4-byte seed field in each entry record. Guards against future refactors
+// silently dropping the seed write.
+TEST_F(XNNWeightsCacheTest, SavePackedIndex_EntryFormatIncludesSeed) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_format_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input(num_batches * input_channels + padding, 1.0f);
+  std::vector<float> output(num_batches * output_channels, 0.0f);
+
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        output.data());
+    ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+  }
+
+  // Parse footer at file_size - 20.
+  std::ifstream f(cache_path, std::ios::binary);
+  ASSERT_TRUE(f.is_open());
+  f.seekg(0, std::ios::end);
+  size_t file_size = f.tellg();
+  ASSERT_GE(file_size, 24u);
+
+  uint8_t footer[20];
+  f.seekg(file_size - 20);
+  f.read(reinterpret_cast<char*>(footer), 20);
+  uint32_t magic = read_le_u32(footer + 12);
+  uint32_t version = read_le_u32(footer + 16);
+  EXPECT_EQ(magic, 0x58505743u);
+  EXPECT_EQ(version, 2u);
+
+  // Record: [name_len:u32][name][file_offset:u64][data_size:u64][seed:u32]
+  uint64_t index_start = read_le_u64(footer);
+  uint32_t entry_count = read_le_u32(footer + 8);
+  ASSERT_GT(entry_count, 0u);
+
+  // Walk every record assuming the v2 tail (8 + 8 + 4 bytes after the name).
+  // With the legacy 16-byte tail the cursor would drift, so after entry_count
+  // records it would no longer land exactly on the footer.
+  uint64_t cursor = index_start;
+  for (uint32_t i = 0; i < entry_count; ++i) {
+    uint8_t name_len_buf[4];
+    f.seekg(cursor);
+    f.read(reinterpret_cast<char*>(name_len_buf), 4);
+    ASSERT_TRUE(f.good()) << "truncated index at entry " << i;
+    cursor += uint64_t{read_le_u32(name_len_buf)} + 4 + 8 + 8 + 4;
+    ASSERT_LE(cursor, file_size - 20) << "entry " << i << " overruns footer";
+  }
+  EXPECT_EQ(cursor, file_size - 20)
+      << "records do not end at the footer; per-entry seed field missing?";
+
+  ::unlink(cache_path.c_str());
+}
+
+// After loading a cache file whose entry seed has been tampered with
+// (simulating an XNNPACK upgrade where the same ukernel now emits a
+// different seed), the next inference must produce correct output. Either
+// look_up's seed check or look_up_or_insert's memcmp fallback drives the
+// re-pack; this test exercises the end-to-end safety net.
+TEST_F(XNNWeightsCacheTest, LoadPackedCache_CorruptedSeed_ProducesCorrectOutput) {
+  std::string cache_path = std::string("/tmp/xnn_weights_cache_badseed_") +
+      std::to_string(::getpid()) + ".packed_cache";
+  ::unlink(cache_path.c_str());
+
+  std::vector<size_t> batches{1, 2, 3};
+  size_t input_channels = 3;
+  size_t output_channels = 4;
+  size_t num_batches = 1 * 2 * 3;
+  size_t padding = 32;
+  std::vector<float> input(num_batches * input_channels + padding, 1.0f);
+
+  // Baseline: fresh pack, heap-only, no cache file.
+  std::vector<float> baseline(num_batches * output_channels, 0.0f);
+  {
+    XNNWeightsCache cache;
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        baseline.data());
+  }
+
+  // Write a valid cache file.
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    std::vector<float> out(num_batches * output_channels, 0.0f);
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        out.data());
+    ASSERT_EQ(cache.save_packed_index(), Error::Ok);
+  }
+
+  // Overwrite the first entry's seed with an arbitrary value (0xDEADBEEF),
+  // stored little-endian like every other field of the index.
+  {
+    std::fstream f(cache_path, std::ios::binary | std::ios::in | std::ios::out);
+    ASSERT_TRUE(f.is_open());
+    f.seekg(0, std::ios::end);
+    size_t file_size = f.tellg();
+    ASSERT_GE(file_size, 24u);
+
+    uint8_t footer_buf[20];
+    f.seekg(file_size - 20);
+    f.read(reinterpret_cast<char*>(footer_buf), 20);
+    uint64_t index_start = read_le_u64(footer_buf);
+    uint32_t entry_count = read_le_u32(footer_buf + 8);
+    ASSERT_GT(entry_count, 0u);
+
+    f.seekg(index_start);
+    uint8_t name_len_buf[4];
+    f.read(reinterpret_cast<char*>(name_len_buf), 4);
+    uint32_t name_len = read_le_u32(name_len_buf);
+
+    size_t seed_offset = index_start + 4 + name_len + 8 + 8;
+    f.seekp(seed_offset);
+    write_le_u32(f, 0xDEADBEEFu);
+    f.flush();
+    ASSERT_TRUE(f.good()) << "failed to overwrite seed in " << cache_path;
+  }
+
+  // Reload and run. Output must still match baseline.
+  std::vector<float> after_corruption(num_batches * output_channels, 0.0f);
+  {
+    XNNWeightsCache cache;
+    cache.set_packed_cache_path(cache_path);
+    cache.initialize_for_runtime(memory_allocator_.get(), data_map_.get());
+    ASSERT_GT(cache.get_packed_data_names().size(), 0u);
+    BuildAndRunGraphWithWeightsCache(
+        cache,
+        batches,
+        input_channels,
+        output_channels,
+        input.data(),
+        after_corruption.data());
+  }
+
+  EXPECT_EQ(after_corruption, baseline);
+
+  ::unlink(cache_path.c_str());
+}
+
 // save_packed_index must be a true no-op when no new reserve_space happened
 // since the last save — same content but writing would still bump mtime,
 // making the cache file look modified on every model load.