diff --git a/ios/Sources/GutenbergKit/Sources/Stores/SQLiteKVCache.swift b/ios/Sources/GutenbergKit/Sources/Stores/SQLiteKVCache.swift new file mode 100644 index 000000000..305306b4b --- /dev/null +++ b/ios/Sources/GutenbergKit/Sources/Stores/SQLiteKVCache.swift @@ -0,0 +1,674 @@ +import CryptoKit +import Foundation +import OSLog +import SQLite3 + +/// A SQLite-backed persistent key-value cache for opaque blobs. +/// +/// **Cache only — not a primary store.** Although entries persist across +/// process restarts, the on-disk contract is intentionally lossy: a schema +/// version mismatch drops the table on open; eviction discards oldest-first +/// when total content exceeds `diskCapacity`; and `synchronous = NORMAL` +/// trades the last few committed writes on power loss for substantially +/// faster small writes. Use this for data that's cheap to refetch from a +/// canonical source — never as the only copy. +/// +/// Each entry is `(key, storageDate, metadata, value)`. Values and metadata are +/// stored as raw bytes; the caller owns any serialization. After every `put`, +/// oldest entries (by storage date) are evicted until total stored size is at +/// or below `diskCapacity`. +/// +/// **Keys.** The cache accepts any `String` as a key and internally hashes it +/// (SHA-256, raw 32-byte digest) before binding into SQLite. This means callers +/// don't have to think about key length, SQL escaping, embedded null bytes, or +/// character encoding — anything the caller can hand us round-trips. The trade +/// is debuggability: a `sqlite3 ... SELECT hex(key) FROM entries` shows hash +/// digests as hex, not the original input strings. +/// +/// **Concurrency.** Thread-safe. Every operation is a single autocommit SQL +/// statement: `put` is `INSERT … ON CONFLICT DO UPDATE` with the eviction +/// sweep happening inside `AFTER INSERT` / `AFTER UPDATE` triggers; `get`, +/// `delete`, and `clear` are also single statements. 
`SQLITE_OPEN_FULLMUTEX` +/// serializes the C-level API calls on the connection, and SQLite's +/// per-statement atomicity (including trigger bodies) handles the rest — +/// there's no Swift-level lock because there's no multi-statement transaction +/// to fence. Single-thread `put` then `get` always observes the put. +/// +/// **One instance per backing file.** Opening two `SQLiteKVCache` instances +/// against the same `<directory>/<handle>.sqlite` (whether in the same process +/// or across processes — main app vs. share extension) is **undefined +/// behavior**. The eviction triggers are recreated unconditionally on every +/// open with the current instance's `diskCapacity` baked into them, so two +/// instances with different caps would clobber each other's triggers; in the +/// best case you get the wrong cap, in the worst case `SQLITE_BUSY` while the +/// recreations race. Each backing file must have exactly one owning +/// `SQLiteKVCache` for the lifetime of the process. Not currently enforced at +/// runtime — this is a usage contract. +/// +/// **Schema migrations.** A `schemaVersion` constant baked into the build is +/// compared against `PRAGMA user_version` on open; mismatches drop and recreate +/// the table. The check trusts the version: if a buggy past build wrote a +/// matching `user_version` but the wrong column shape, this won't detect it. +/// +/// **Disk reclamation.** `diskCapacity` caps the *content* size — the sum of +/// `length(value) + length(metadata)` across rows. The on-disk file can drift +/// past that cap because evicted/deleted rows leave free pages on the SQLite +/// freelist, which `auto_vacuum = NONE` (the default) doesn't reclaim. 
Rather +/// than enable `auto_vacuum = FULL` (which would rewrite freelist pages on +/// every transaction), the open path runs a single `VACUUM` if the freelist +/// has grown past `vacuumFreelistThreshold` of total pages — bounding the +/// SSD-churn cost to once per process and skipping launches where there's +/// nothing meaningful to reclaim. +final class SQLiteKVCache: @unchecked Sendable { + + enum Error: Swift.Error { + /// The cache couldn't be opened or set up. Typically the caches + /// directory is sandboxed-out, the disk is full, an existing file + /// at the path is corrupt, or a setup pragma / DDL statement + /// failed during open. The first operation on the cache throws + /// this and so does every subsequent operation (the failure is + /// cached) — callers using `try?` get a uniform "cache + /// unavailable" fall-through. Silently degrading to a + /// process-lifetime in-memory store would violate the persistence + /// contract `put` implies. The originating SQLite error code is + /// not preserved on this case (setup-time failures collapse to + /// the same surface) but is logged at the failure site. + case databaseUnavailable + /// A read (`get`) failed inside SQLite. The associated value is the + /// SQLite error code. Cache callers that want the prior "treat read + /// failures as misses" behavior can wrap the call in `try?`. + case readFailed(sqliteCode: Int32) + /// A write (`put`, `delete`, or `clear`) failed inside SQLite. The + /// associated value is the SQLite error code — common ones include + /// `SQLITE_FULL` (disk full, code 13), `SQLITE_TOOBIG` (blob over + /// `SQLITE_MAX_LENGTH`, code 18), and `SQLITE_CORRUPT` (code 11). + case writeFailed(sqliteCode: Int32) + } + + // MARK: - Debugging + // + // Backed by a single SQLite database at `<directory>/<handle>.sqlite`. 
+ // Useful queries from a shell: + // + // # List entries by recency, with size and date + // sqlite3 Store.sqlite \ + // "SELECT hex(key), length(value), datetime(storage_date + 978307200, 'unixepoch') \ + // FROM entries ORDER BY storage_date DESC" + // + // # Export a specific value to a file (key is the SHA-256 digest as hex) + // sqlite3 Store.sqlite \ + // "SELECT writefile('/tmp/value.bin', value) FROM entries \ + // WHERE key = x'<digest hex>'" + // + // `storage_date` is `Date.timeIntervalSinceReferenceDate` (seconds since + // 2001-01-01); +978307200 shifts to unix epoch for `datetime(..., 'unixepoch')`. + + struct Entry { + let storageDate: Date + let metadata: Data + let value: Data + } + + private let diskCapacity: Int + private let directory: URL + private let filename: String + + /// Cached open result. `nil` means "open hasn't been attempted yet"; once + /// set, either holds the live connection or the error we'll keep handing + /// back for the lifetime of the cache. Reading and writing both happen + /// under `openLock` — the cached value is read/written via the same code + /// path, and the open work runs at most once per cache. + private var dbResult: Result<OpaquePointer, Swift.Error>? + private let openLock = NSLock() + + /// Schema version baked into this build. The on-disk `PRAGMA user_version` + /// is checked on open; anything other than this exact value triggers a + /// drop-and-recreate. That includes the fresh-database case (`user_version` + /// defaults to 0 on a brand-new SQLite file), so the first open of a new + /// database also runs the drop path — harmless because there's nothing to + /// drop. Bump only on changes that hit user devices — pre-ship iteration + /// shouldn't burn version numbers. Never decrement (a downgrade would + /// silently wipe users' caches a second time on the next upgrade). 
private static let schemaVersion: Int32 = 1 + + private static let logger = Logger(subsystem: "GutenbergKit", category: "sqlite-kv-cache") + + /// SQLite C API: signals that bound data should be copied. Reinvented here + /// because the `SQLITE_TRANSIENT` C macro doesn't import to Swift. + private static let SQLITE_TRANSIENT = unsafeBitCast( + OpaquePointer(bitPattern: -1), to: sqlite3_destructor_type.self + ) + + /// Creates a new cache. **Non-throwing**: the SQLite connection is opened + /// lazily on the first `get`/`put`/`delete`/`clear`. If the open or schema + /// setup fails, the failure is cached and re-thrown from every subsequent + /// operation — callers using `try?` get a uniform "cache unavailable" + /// fall-through without a separate error path at the construction site. + /// + /// - Parameters: + /// - handle: Identifies this cache within `directory`. Becomes the database + /// filename (`<handle>.sqlite`). Required to be a string + /// literal so it's known at compile time, and required to match + /// `[a-zA-Z0-9._-]` after lowercasing. Lowercasing is unconditional; + /// the `[a-zA-Z0-9._-]` shape check is enforced by a debug-build + /// assertion only — release builds skip the check and use the + /// lowercased handle as-is, which can produce an unexpected filename + /// if the input contained disallowed characters. + /// - directory: The directory where the database file lives. Defaults to + /// the system caches directory. + /// - diskCapacity: Soft cap (in bytes) on the combined size of stored values + /// and metadata. After every `put`, oldest entries (by storage date) are + /// evicted until total size is at or below this cap. A `put` for an entry + /// whose own size exceeds the cap is silently dropped — choose a cap + /// comfortably above the largest expected entry. Pass `0` to disable + /// eviction entirely. 
init( + handle: StaticString, + directory: URL = URL.cachesDirectory, + diskCapacity: Int + ) { + let filename = "\(handle)".lowercased() + assert( + Self.isValidHandle(filename), + "handle must be a non-empty filename component matching [a-zA-Z0-9._-]; got '\(handle)'" + ) + self.diskCapacity = diskCapacity + self.directory = directory + self.filename = filename + } + + /// Whether `name` is a safe filename component: non-empty, only + /// `[a-z0-9._-]`, and not a directory reference (`.` or `..`). + /// Used by the debug-build assertion in `init`. + static func isValidHandle(_ name: String) -> Bool { + let allowed: Set<Character> = Set("abcdefghijklmnopqrstuvwxyz0123456789._-") + return !name.isEmpty + && name.allSatisfy { allowed.contains($0) } + && name != "." + && name != ".." + } + + /// Opens the database on first call and caches the result; subsequent + /// calls return the cached connection (or re-throw the cached error). + /// Every public operation routes through this helper, so a connection + /// is opened at most once per cache regardless of which method is + /// called first. + private func connection() throws -> OpaquePointer { + openLock.lock() + defer { openLock.unlock() } + if let cached = dbResult { + return try cached.get() + } + let result = Result { + try Self.openAndConfigure(directory: self.directory, filename: self.filename, diskCapacity: self.diskCapacity) + } + dbResult = result + return try result.get() + } + + /// Opens the SQLite file, applies performance pragmas, and either creates + /// or migrates the schema. Called from `connection()` once per cache. On + /// any failure after the connection handle is alive, the handle is closed + /// before the error propagates so we don't leak it into the cached + /// failure. 
+ private static func openAndConfigure(directory: URL, filename: String, diskCapacity: Int) throws -> OpaquePointer { + try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) + // File protection: files created under the default `URL.cachesDirectory` + // inherit `NSFileProtectionCompleteUntilFirstUserAuthentication` — + // unreadable until the user has unlocked the device once since boot. + // That's the right tier for cached HTTP bodies (which may include + // cookies / auth headers) but not for highly sensitive secrets. + // Callers needing a stricter class can pass a `directory` configured + // with `.completeUnlessOpen` or `.complete`. + // + // `appending(component:)` treats the input as a single path component (so a + // stray `/` would be percent-encoded, not interpreted as a separator), and + // `appendingPathExtension` keeps us from string-interpolating the suffix. + let dbPath = directory + .appending(component: filename) + .appendingPathExtension("sqlite") + .path(percentEncoded: false) + + var connection: OpaquePointer? + let openResult = sqlite3_open_v2( + dbPath, + &connection, + SQLITE_OPEN_CREATE | SQLITE_OPEN_READWRITE | SQLITE_OPEN_FULLMUTEX, + nil + ) + if openResult != SQLITE_OK { + Self.logger.error("Failed to open '\(dbPath)': \(Self.describe(openResult))") + // Per the SQLite contract, a failed `sqlite3_open_v2` may still + // hand back a connection handle that needs closing. `_close_v2` + // tolerates a nil pointer, so this is safe either way. + sqlite3_close_v2(connection) + throw Error.databaseUnavailable + } + guard let connection else { + throw Error.databaseUnavailable + } + + // Pragmas + schema setup. Pragmas first because `journal_mode` + // changes must run with no active transaction. 
`journal_mode = WAL` + // switches from the default rollback-journal to a write-ahead log: + // smaller, sequential commits, and readers no longer block at the + // file level the way they do under rollback-journal mode. Combined + // with `synchronous = NORMAL` this trades a small slice of + // durability — last few committed transactions could be lost on + // power loss, but not on a clean process crash — for substantially + // faster small writes. Acceptable here because this is a cache: + // losing the last few writes after a power cut just means a few + // extra refetches against the network, not data loss against a + // primary store. (WAL alone doesn't change in-process isolation: a + // same-connection reader during an in-flight write still sees the + // uncommitted state — that would only change with a separate reader + // connection.) WAL is a correctness dependency for `synchronous = + // NORMAL`, so `enableWAL` verifies the resulting journal mode rather + // than relying on the pragma's return code; see its doc-comment. + // + // If anything from this point fails, the connection is closed before + // the error propagates so the cached failure result in `connection()` + // doesn't leak the handle. + do { + try Self.enableWAL(on: connection) + try Self.exec("PRAGMA synchronous = NORMAL;", on: connection) + + // If the on-disk schema version doesn't match what we expect, + // drop the entries table — losing cached data is acceptable for a + // cache, and keeps schema migrations to a single bump-and-recreate. 
+ if try Self.readSchemaVersion(db: connection) != Self.schemaVersion { + try Self.exec("DROP TABLE IF EXISTS entries;", on: connection) + try Self.exec("PRAGMA user_version = \(Self.schemaVersion);", on: connection) + } + try Self.exec(""" + CREATE TABLE IF NOT EXISTS entries ( + key BLOB PRIMARY KEY NOT NULL, + storage_date REAL NOT NULL, + metadata BLOB NOT NULL, + value BLOB NOT NULL + ) WITHOUT ROWID; + CREATE INDEX IF NOT EXISTS entries_storage_date_idx ON entries(storage_date); + """, on: connection) + try Self.installEvictionTriggers(on: connection, diskCapacity: diskCapacity) + try Self.vacuumIfWorthwhile(on: connection) + } catch { + sqlite3_close_v2(connection) + // Setup-time failures (a setup pragma, DDL, trigger install, or + // VACUUM rejected by SQLite) all collapse to `databaseUnavailable` + // here — the originating exec helper threw `writeFailed` for its + // own reasons, but the user-facing surface is "the cache failed + // to come up", not "a write failed". The originating SQLite code + // was already logged at the exec site. + throw Error.databaseUnavailable + } + + return connection + } + + /// Runs `VACUUM` if the freelist is at least `vacuumFreelistThreshold` of + /// the database's pages. Called once during open after the schema is + /// settled, so churn from past sessions (eviction sweeps, schema-mismatch + /// table drops, explicit deletes) is reclaimed at most once per process — + /// versus `auto_vacuum = FULL` which would touch the freelist on every + /// transaction. The threshold is tuned so VACUUM's I/O cost (a full DB + /// rewrite, copying every non-free page) is in the same order of magnitude + /// as the bytes reclaimed: at 25% freelist the ratio is ~3:1, at 5% it's + /// ~20:1 — at the lower end the rewrite costs more I/O than it saves on + /// disk, so we'd rather leave the freelist in place. 
+ /// + /// Skipping when `total == 0` covers the brand-new-DB case where SQLite + /// hasn't allocated any pages yet (a freshly opened, never-written file). + private static func vacuumIfWorthwhile(on db: OpaquePointer) throws { + let freelist = try Self.readIntPragma("freelist_count", on: db) + let total = try Self.readIntPragma("page_count", on: db) + guard total > 0 else { return } + let fraction = Double(freelist) / Double(total) + guard fraction > Self.vacuumFreelistThreshold else { return } + Self.logger.info("Running VACUUM (freelist=\(freelist)/\(total) pages, fraction=\(fraction))") + try Self.exec("VACUUM;", on: db) + } + + /// Freelist-to-total-pages ratio above which `vacuumIfWorthwhile` rewrites + /// the database on open. See `vacuumIfWorthwhile` for the cost-benefit + /// reasoning. Exposed (non-private) so tests can pin the threshold without + /// duplicating the constant. + static let vacuumFreelistThreshold: Double = 0.25 + + /// Drops and recreates the eviction triggers with `diskCapacity` baked + /// into the trigger SQL. Called on every successful open: `diskCapacity` + /// is a constructor parameter that can change between opens of the same + /// file, and the triggers don't auto-update — drop-and-recreate is the + /// simplest way to ensure the running database always reflects the + /// current instance's cap. (This same drop-and-recreate is what makes the + /// "one instance per backing file" contract a real failure mode: a + /// second instance with a different cap would clobber the first's + /// triggers.) With `diskCapacity == 0` the triggers are dropped and not + /// recreated, disabling eviction entirely. + /// + /// Two triggers are installed because SQLite fires `AFTER UPDATE` (not + /// `AFTER INSERT`) on the upsert's `ON CONFLICT DO UPDATE` branch, but + /// fresh-key inserts fire `AFTER INSERT`. 
Bodies are identical: a + /// window-function DELETE that removes oldest-by-`storage_date` rows + /// until total stored size is at or below `diskCapacity`. The `WHEN` + /// guard short-circuits the body when the table is already under cap so + /// the common path is just a cheap aggregate scan, not a window-function + /// query. + /// + /// `diskCapacity` is interpolated into the trigger SQL — it's an `Int` + /// constant, not user input, so there's no injection vector. Triggers + /// run inside the firing statement's implicit transaction; if the + /// eviction DELETE fails (disk full, etc.), SQLite rolls back the entire + /// statement — the original INSERT/UPDATE is undone with it, with no + /// Swift-side orchestration needed. + private static func installEvictionTriggers(on db: OpaquePointer, diskCapacity: Int) throws { + try Self.exec("DROP TRIGGER IF EXISTS entries_evict_after_insert;", on: db) + try Self.exec("DROP TRIGGER IF EXISTS entries_evict_after_update;", on: db) + guard diskCapacity > 0 else { return } + let evictionBody = """ + DELETE FROM entries WHERE key IN ( + SELECT key FROM ( + SELECT key, + SUM(length(value) + length(metadata)) + OVER (ORDER BY storage_date DESC, key DESC + ROWS UNBOUNDED PRECEDING) AS running + FROM entries + ) WHERE running > \(diskCapacity) + ); + """ + let whenGuard = "(SELECT COALESCE(SUM(length(value) + length(metadata)), 0) FROM entries) > \(diskCapacity)" + try Self.exec(""" + CREATE TRIGGER entries_evict_after_insert AFTER INSERT ON entries + WHEN \(whenGuard) + BEGIN + \(evictionBody) + END; + """, on: db) + try Self.exec(""" + CREATE TRIGGER entries_evict_after_update AFTER UPDATE ON entries + WHEN \(whenGuard) + BEGIN + \(evictionBody) + END; + """, on: db) + } + + /// Sets `PRAGMA journal_mode = WAL;` and verifies that the database + /// actually switched to WAL. 
SQLite returns the resulting journal mode + /// in a row regardless of whether the requested change took effect — if + /// WAL can't be applied (an exclusive lock is held, the platform doesn't + /// support shared memory, etc.) the pragma quietly keeps the previous + /// mode and reports it. We can't tell "succeeded" from "kept the old + /// mode" via `sqlite3_exec`, so this helper prepares the pragma, steps + /// it, and reads the row back. WAL is a correctness dependency for + /// `synchronous = NORMAL` (NORMAL is documented safe under WAL but only + /// "probably safe" under rollback-journal), so a non-WAL result here + /// fails the open rather than silently degrading to the unsafe pairing. + private static func enableWAL(on db: OpaquePointer) throws { + var stmt: OpaquePointer? + let prepareResult = sqlite3_prepare_v2(db, "PRAGMA journal_mode = WAL;", -1, &stmt, nil) + defer { sqlite3_finalize(stmt) } + guard prepareResult == SQLITE_OK else { + Self.logger.error("enableWAL prepare failed: \(Self.describe(prepareResult))") + throw Error.databaseUnavailable + } + guard sqlite3_step(stmt) == SQLITE_ROW else { + Self.logger.error("enableWAL step did not return a row") + throw Error.databaseUnavailable + } + guard let cString = sqlite3_column_text(stmt, 0) else { + Self.logger.error("enableWAL returned NULL mode") + throw Error.databaseUnavailable + } + let mode = String(cString: cString) + guard mode == "wal" else { + Self.logger.error("WAL not applied; resulting journal mode is '\(mode)'") + throw Error.databaseUnavailable + } + } + + private static func readSchemaVersion(db: OpaquePointer) throws -> Int32 { + try Self.readIntPragma("user_version", on: db) + } + + /// Reads a SQLite integer-valued PRAGMA from `db`. The pragma name is + /// interpolated into the SQL — only ever called with hardcoded constants + /// from inside this file, so there's no injection vector. 
Throws + /// `databaseUnavailable` on prepare/step failure (matching the open path's + /// other pragma helpers). + private static func readIntPragma(_ pragma: String, on db: OpaquePointer) throws -> Int32 { + var stmt: OpaquePointer? + let prepareResult = sqlite3_prepare_v2(db, "PRAGMA \(pragma);", -1, &stmt, nil) + defer { sqlite3_finalize(stmt) } + guard prepareResult == SQLITE_OK else { + Self.logger.error("readIntPragma(\(pragma)) prepare failed: \(Self.describe(prepareResult))") + throw Error.databaseUnavailable + } + guard sqlite3_step(stmt) == SQLITE_ROW else { + Self.logger.error("readIntPragma(\(pragma)) step did not return a row") + throw Error.databaseUnavailable + } + return sqlite3_column_int(stmt, 0) + } + + deinit { + // Close only if the open succeeded. A cached failure or never-attempted + // open both leave nothing to close. `sqlite3_close_v2` tolerates + // outstanding statements (the prepared-statement path uses + // `defer { finalize }` in the nominal case, but a bug-induced leak + // would otherwise cause the connection to leak too). No lock here: + // deinit only fires when refcount hits zero, so no other thread holds + // a reference to read or mutate `dbResult`. + if case .success(let db) = dbResult { + sqlite3_close_v2(db) + } + } + + /// Looks up the entry at `key`. Returns `nil` for a genuine miss + /// (`SQLITE_DONE` from the step). Throws `Error.readFailed` for any other + /// non-row step result or a prepare failure — cache callers that want the + /// prior "treat read failures as misses" behavior can wrap the call in + /// `try?`. Failures are also logged so a wedged or corrupt cache is + /// visible in the OS log instead of looking like a steady stream of misses. + func get(key: String) throws -> Entry? { + let db = try connection() + var stmt: OpaquePointer? 
+ let prepareResult = sqlite3_prepare_v2( + db, + "SELECT storage_date, metadata, value FROM entries WHERE key = ?1;", + -1, &stmt, nil + ) + defer { sqlite3_finalize(stmt) } + if prepareResult != SQLITE_OK { + Self.logger.error("get failed at sqlite3_prepare_v2: \(Self.describe(prepareResult))") + throw Error.readFailed(sqliteCode: prepareResult) + } + try Self.bindKey(key, stmt: stmt, column: 1, onError: Error.readFailed) + + let stepResult = sqlite3_step(stmt) + switch stepResult { + case SQLITE_ROW: + return Entry( + storageDate: Date(timeIntervalSinceReferenceDate: sqlite3_column_double(stmt, 0)), + metadata: Self.readBlobAsData(stmt: stmt, column: 1), + value: Self.readBlobAsData(stmt: stmt, column: 2) + ) + case SQLITE_DONE: + return nil + default: + Self.logger.error("get failed at sqlite3_step: \(Self.describe(stepResult))") + throw Error.readFailed(sqliteCode: stepResult) + } + } + + /// Inserts or overwrites the entry at `key`. The eviction sweep runs + /// inside an `AFTER INSERT`/`AFTER UPDATE` trigger, so this method is a + /// single autocommit SQL statement; SQLite handles atomicity (a failure + /// in the trigger body rolls back the entire statement, including the + /// inserted row). The trigger body is gated by a `WHEN (SELECT SUM…) > + /// diskCapacity` aggregate, so puts that don't push the table past cap + /// pay only that scan, not the full window-function eviction query. + /// + /// A `put` for an entry whose own size exceeds `diskCapacity` is silently + /// dropped — the trigger would insert it then immediately evict it, so we + /// short-circuit and skip the round trip. Throws `Error.writeFailed` if + /// SQLite rejects the write (disk full, blob over `SQLITE_MAX_LENGTH`, + /// database corruption, etc.). + func put(key: String, storageDate: Date, metadata: Data, value: Data) throws { + if self.diskCapacity > 0 && metadata.count + value.count > self.diskCapacity { + return + } + + let db = try connection() + + var stmt: OpaquePointer? 
+ let prepareResult = sqlite3_prepare_v2( + db, + """ + INSERT INTO entries (key, storage_date, metadata, value) + VALUES (?1, ?2, ?3, ?4) + ON CONFLICT(key) DO UPDATE SET + storage_date = excluded.storage_date, + metadata = excluded.metadata, + value = excluded.value; + """, + -1, &stmt, nil + ) + defer { sqlite3_finalize(stmt) } + if prepareResult != SQLITE_OK { + Self.logger.error("put failed at sqlite3_prepare_v2: \(Self.describe(prepareResult))") + throw Error.writeFailed(sqliteCode: prepareResult) + } + try Self.bindKey(key, stmt: stmt, column: 1, onError: Error.writeFailed) + try Self.bindDouble(storageDate.timeIntervalSinceReferenceDate, stmt: stmt, column: 2, onError: Error.writeFailed) + try Self.bindBlob(metadata, stmt: stmt, column: 3, onError: Error.writeFailed) + try Self.bindBlob(value, stmt: stmt, column: 4, onError: Error.writeFailed) + let stepResult = sqlite3_step(stmt) + if stepResult != SQLITE_DONE { + Self.logger.error("put failed at sqlite3_step: \(Self.describe(stepResult))") + throw Error.writeFailed(sqliteCode: stepResult) + } + } + + /// Removes the entry at `key`, if any. Throws `Error.writeFailed` if SQLite + /// rejects the delete. + func delete(key: String) throws { + let db = try connection() + + var stmt: OpaquePointer? + let prepareResult = sqlite3_prepare_v2(db, "DELETE FROM entries WHERE key = ?1;", -1, &stmt, nil) + defer { sqlite3_finalize(stmt) } + if prepareResult != SQLITE_OK { + Self.logger.error("delete failed at sqlite3_prepare_v2: \(Self.describe(prepareResult))") + throw Error.writeFailed(sqliteCode: prepareResult) + } + try Self.bindKey(key, stmt: stmt, column: 1, onError: Error.writeFailed) + let stepResult = sqlite3_step(stmt) + if stepResult != SQLITE_DONE { + Self.logger.error("delete failed at sqlite3_step: \(Self.describe(stepResult))") + throw Error.writeFailed(sqliteCode: stepResult) + } + } + + /// Removes all entries. Throws `Error.writeFailed` if SQLite rejects the + /// delete. 
+ func clear() throws { + let db = try connection() + try Self.exec("DELETE FROM entries;", on: db) + } + + /// Hashes the caller's key with SHA-256 and binds the raw 32-byte digest as + /// the SQLite `BLOB` parameter at `column`. Hashing produces a fixed-length + /// 32-byte digest regardless of the input, which side-steps key length + /// limits, embedded null bytes, encoding issues, and SQL-special-character + /// concerns. Throws via `onError` if SQLite rejects the bind. + private static func bindKey( + _ key: String, + stmt: OpaquePointer?, + column: Int32, + onError: (Int32) -> Error + ) throws { + try Self.bindBlob(Self.hashKey(key), stmt: stmt, column: column, onError: onError) + } + + /// Binds `data` as the SQLite `BLOB` parameter at `column`. Uses + /// `sqlite3_bind_blob64` (UInt64 length) instead of `sqlite3_bind_blob` + /// (Int32) so the byte-count cast can't trap on hypothetical >2 GiB inputs. + /// SQLite still rejects anything over `SQLITE_MAX_LENGTH` (~1 GiB by + /// default) with `SQLITE_TOOBIG`, surfaced via `onError`. Read callers + /// pass `Error.readFailed`, write callers pass `Error.writeFailed`. + private static func bindBlob( + _ data: Data, + stmt: OpaquePointer?, + column: Int32, + onError: (Int32) -> Error + ) throws { + let result = data.withUnsafeBytes { + sqlite3_bind_blob64(stmt, column, $0.baseAddress, sqlite3_uint64(data.count), Self.SQLITE_TRANSIENT) + } + if result != SQLITE_OK { + Self.logger.error("bind blob failed at column \(column): \(Self.describe(result))") + throw onError(result) + } + } + + /// Binds `value` as the SQLite `REAL` parameter at `column`. Throws via + /// `onError` if SQLite rejects the bind. 
+ private static func bindDouble( + _ value: Double, + stmt: OpaquePointer?, + column: Int32, + onError: (Int32) -> Error + ) throws { + let result = sqlite3_bind_double(stmt, column, value) + if result != SQLITE_OK { + Self.logger.error("bind double failed at column \(column): \(Self.describe(result))") + throw onError(result) + } + } + + private static func hashKey(_ key: String) -> Data { + Data(SHA256.hash(data: Data(key.utf8))) + } + + /// Wraps `sqlite3_exec`, logs non-OK return codes, and throws on failure. + /// Used for DDL (table/index/trigger setup) and one-shot writes (`clear`'s + /// table-wide DELETE) where the caller doesn't need a prepared statement. + private static func exec(_ sql: String, on db: OpaquePointer) throws { + let result = sqlite3_exec(db, sql, nil, nil, nil) + if result != SQLITE_OK { + Self.logger.error("exec failed: \(Self.describe(result)) — \(sql)") + throw Error.writeFailed(sqliteCode: result) + } + } + + private static func readBlobAsData(stmt: OpaquePointer?, column: Int32) -> Data { + guard let bytes = sqlite3_column_blob(stmt, column) else { return Data() } + let count = Int(sqlite3_column_bytes(stmt, column)) + return Data(bytes: bytes, count: count) + } + + /// Maps a SQLite result code to its human-readable English description via + /// `sqlite3_errstr`. Used in log messages and `Error` descriptions so + /// callers see "disk I/O error (code 10)" instead of "code 10". 
+ fileprivate static func describe(_ code: Int32) -> String { + if let cString = sqlite3_errstr(code) { + return "\(String(cString: cString)) (code \(code))" + } + return "code \(code)" + } +} + +extension SQLiteKVCache.Error: CustomStringConvertible, LocalizedError { + var description: String { + switch self { + case .databaseUnavailable: + return "SQLite database is unavailable (open or setup failed)" + case .readFailed(let sqliteCode): + return "SQLite read failed: \(SQLiteKVCache.describe(sqliteCode))" + case .writeFailed(let sqliteCode): + return "SQLite write failed: \(SQLiteKVCache.describe(sqliteCode))" + } + } + + var errorDescription: String? { description } +} diff --git a/ios/Tests/GutenbergKitTests/Stores/SQLiteKVCacheTests.swift b/ios/Tests/GutenbergKitTests/Stores/SQLiteKVCacheTests.swift new file mode 100644 index 000000000..6d260cf98 --- /dev/null +++ b/ios/Tests/GutenbergKitTests/Stores/SQLiteKVCacheTests.swift @@ -0,0 +1,555 @@ +import Foundation +import SQLite3 +import Testing + +@testable import GutenbergKit + +@Suite +struct SQLiteKVCacheTests { + + private func makeStore(diskCapacity: Int = 0) -> SQLiteKVCache { + SQLiteKVCache(handle: "test", directory: .randomTemporaryDirectory, diskCapacity: diskCapacity) + } + + private let referenceDate = Date(timeIntervalSinceReferenceDate: 0) + + // MARK: - Round-trip + + @Test("put and get round-trips a value with metadata") + func putGetRoundTrip() throws { + let store = makeStore() + try store.put(key: "k", storageDate: referenceDate, metadata: Data("meta"), value: Data("hello")) + + let entry = try #require(try store.get(key: "k")) + #expect(entry.value == Data("hello")) + #expect(entry.metadata == Data("meta")) + #expect(entry.storageDate == referenceDate) + } + + @Test("get returns nil for a missing key") + func getMissingKey() throws { + let store = makeStore() + #expect(try store.get(key: "missing") == nil) + } + + @Test("put overwrites the existing entry for the same key") + func 
putOverwrites() throws { + let store = makeStore() + try store.put(key: "k", storageDate: referenceDate, metadata: Data(), value: Data("first")) + try store.put(key: "k", storageDate: referenceDate.addingTimeInterval(60), metadata: Data("new"), value: Data("second")) + + let entry = try #require(try store.get(key: "k")) + #expect(entry.value == Data("second")) + #expect(entry.metadata == Data("new")) + #expect(entry.storageDate == referenceDate.addingTimeInterval(60)) + } + + @Test("clear removes all entries") + func clearRemovesAll() throws { + let store = makeStore() + try store.put(key: "a", storageDate: referenceDate, metadata: Data(), value: Data("A")) + try store.put(key: "b", storageDate: referenceDate, metadata: Data(), value: Data("B")) + try store.clear() + #expect(try store.get(key: "a") == nil) + #expect(try store.get(key: "b") == nil) + } + + @Test("delete removes the entry at key and leaves others alone") + func deleteRemovesOne() throws { + let store = makeStore() + try store.put(key: "a", storageDate: referenceDate, metadata: Data(), value: Data("A")) + try store.put(key: "b", storageDate: referenceDate, metadata: Data(), value: Data("B")) + try store.delete(key: "a") + #expect(try store.get(key: "a") == nil) + #expect(try store.get(key: "b")?.value == Data("B")) + } + + @Test("delete on a missing key is a no-op") + func deleteMissingKey() throws { + let store = makeStore() + try store.put(key: "kept", storageDate: referenceDate, metadata: Data(), value: Data("ok")) + try store.delete(key: "never-existed") + // Existing entries unaffected. 
+ #expect(try store.get(key: "kept")?.value == Data("ok")) + } + + @Test("distinct keys are independent") + func distinctKeysIndependent() throws { + let store = makeStore() + try store.put(key: "a", storageDate: referenceDate, metadata: Data(), value: Data("A")) + try store.put(key: "b", storageDate: referenceDate, metadata: Data(), value: Data("B")) + #expect(try #require(try store.get(key: "a")).value == Data("A")) + #expect(try #require(try store.get(key: "b")).value == Data("B")) + } + + @Test("empty value and empty metadata round-trip") + func emptyBlobsRoundTrip() throws { + let store = makeStore() + try store.put(key: "empty", storageDate: referenceDate, metadata: Data(), value: Data()) + let entry = try #require(try store.get(key: "empty")) + #expect(entry.value == Data()) + #expect(entry.metadata == Data()) + } + + // MARK: - Persistence + + @Test("entries persist across instances against the same directory and handle") + func persistsAcrossInstances() throws { + let directory = URL.randomTemporaryDirectory + let first = SQLiteKVCache(handle: "test", directory: directory, diskCapacity: 0) + try first.put(key: "durable", storageDate: referenceDate, metadata: Data("m"), value: Data("v")) + + let reopened = SQLiteKVCache(handle: "test", directory: directory, diskCapacity: 0) + let entry = try #require(try reopened.get(key: "durable")) + #expect(entry.value == Data("v")) + #expect(entry.metadata == Data("m")) + } + + @Test("different handles in the same directory are independent stores") + func handlesIndependentInSameDirectory() throws { + let directory = URL.randomTemporaryDirectory + let storeA = SQLiteKVCache(handle: "A", directory: directory, diskCapacity: 0) + let storeB = SQLiteKVCache(handle: "B", directory: directory, diskCapacity: 0) + + try storeA.put(key: "shared-key", storageDate: referenceDate, metadata: Data(), value: Data("from A")) + try storeB.put(key: "shared-key", storageDate: referenceDate, metadata: Data(), value: Data("from B")) + + 
#expect(try storeA.get(key: "shared-key")?.value == Data("from A")) + #expect(try storeB.get(key: "shared-key")?.value == Data("from B")) + } + + // MARK: - Eviction (oldest-first by storage_date) + + @Test("entries within capacity are not evicted") + func underCapacityKeepsAll() throws { + let store = makeStore(diskCapacity: 10 * 1024) + try store.put(key: "a", storageDate: referenceDate, metadata: Data(), value: Data(repeating: 0xAA, count: 100)) + try store.put(key: "b", storageDate: referenceDate.addingTimeInterval(1), metadata: Data(), value: Data(repeating: 0xBB, count: 100)) + #expect(try store.get(key: "a") != nil) + #expect(try store.get(key: "b") != nil) + } + + @Test("oldest entries are evicted when capacity is exceeded") + func evictsOldestOverCapacity() throws { + // 600-byte cap. Each entry is 300 bytes (300-byte value + empty metadata). + // Two fit; the third forces eviction of the oldest by storage_date. + let store = makeStore(diskCapacity: 600) + try store.put(key: "oldest", storageDate: referenceDate, metadata: Data(), value: Data(repeating: 0x11, count: 300)) + try store.put(key: "middle", storageDate: referenceDate.addingTimeInterval(100), metadata: Data(), value: Data(repeating: 0x22, count: 300)) + try store.put(key: "newest", storageDate: referenceDate.addingTimeInterval(200), metadata: Data(), value: Data(repeating: 0x33, count: 300)) + + #expect(try store.get(key: "oldest") == nil) + #expect(try store.get(key: "middle") != nil) + #expect(try store.get(key: "newest") != nil) + } + + @Test("diskCapacity of 0 disables eviction") + func zeroCapacityDisablesEviction() throws { + let store = makeStore(diskCapacity: 0) + try store.put(key: "a", storageDate: referenceDate, metadata: Data(), value: Data(repeating: 0x11, count: 4096)) + try store.put(key: "b", storageDate: referenceDate.addingTimeInterval(1), metadata: Data(), value: Data(repeating: 0x22, count: 4096)) + #expect(try store.get(key: "a") != nil) + #expect(try store.get(key: "b") != 
nil) + } + + @Test("eviction breaks ties deterministically when storage dates are equal") + func evictionTiebreaker() throws { + // 700-byte cap; each entry is 300 bytes. Two same-dated entries plus a + // newer one forces eviction. The newer entry survives; exactly one of + // the tied entries is evicted by the hash-DESC tiebreaker (we don't + // pin which one — the underlying ordering is on SHA-256 digests). + let store = makeStore(diskCapacity: 700) + try store.put(key: "a", storageDate: referenceDate, metadata: Data(), value: Data(repeating: 0x01, count: 300)) + try store.put(key: "z", storageDate: referenceDate, metadata: Data(), value: Data(repeating: 0x02, count: 300)) + try store.put(key: "newer", storageDate: referenceDate.addingTimeInterval(1), metadata: Data(), value: Data(repeating: 0x03, count: 300)) + + #expect(try store.get(key: "newer") != nil) + let aSurvives = try store.get(key: "a") != nil + let zSurvives = try store.get(key: "z") != nil + #expect(aSurvives != zSurvives, "exactly one of the tied entries should survive") + } + + @Test("an entry larger than diskCapacity is silently dropped on store") + func oversizedEntryNotStored() throws { + // Such an entry couldn't survive the eviction sweep, so put short-circuits + // and skips the write entirely. The observable contract is the same: the + // entry is not in the store afterwards. Other entries are unaffected. 
+ let store = makeStore(diskCapacity: 300) + try store.put(key: "fits", storageDate: referenceDate, metadata: Data(), value: Data("ok")) + try store.put(key: "big", storageDate: referenceDate.addingTimeInterval(1), metadata: Data(), value: Data(repeating: 0x42, count: 400)) + + #expect(try store.get(key: "big") == nil) + #expect(try store.get(key: "fits")?.value == Data("ok")) + } + + // MARK: - Key edge cases + // + // Keys are SHA-256 hashed before being bound to SQLite, so most "weird key" + // concerns (SQL escaping, encoding, embedded nulls, length limits) collapse + // to "does the hash function distinguish these inputs?" These tests pin the + // round-trip property — that distinct inputs map to distinct entries — + // across input shapes that would otherwise have been hazardous if keys were + // bound verbatim. + + @Test("SQL-shaped keys round-trip without affecting other entries") + func keysWithSQLSpecialCharacters() throws { + let store = makeStore() + let evilKey = "'; DROP TABLE entries; --" + try store.put(key: evilKey, storageDate: referenceDate, metadata: Data(), value: Data("ok")) + + let entry = try #require(try store.get(key: evilKey)) + #expect(entry.value == Data("ok")) + + try store.put(key: "normal", storageDate: referenceDate, metadata: Data(), value: Data("normal value")) + #expect(try store.get(key: "normal")?.value == Data("normal value")) + } + + @Test("unicode and emoji keys round-trip correctly") + func unicodeKeys() throws { + let store = makeStore() + let unicodeKey = "café-日本語-🎉" + try store.put(key: unicodeKey, storageDate: referenceDate, metadata: Data(), value: Data("unicode value")) + + let entry = try #require(try store.get(key: unicodeKey)) + #expect(entry.value == Data("unicode value")) + } + + @Test("keys differing only by an embedded null byte don't collide") + func keysWithEmbeddedNullBytes() throws { + let store = makeStore() + // The hash of "abc\0def" differs from the hash of "abc" — the byte + // sequence is what's 
hashed, not a C-string view of it. + let nullKey = "abc\0def" + let collidingKey = "abc" + + try store.put(key: nullKey, storageDate: referenceDate, metadata: Data(), value: Data("with null")) + try store.put(key: collidingKey, storageDate: referenceDate, metadata: Data(), value: Data("without null")) + + #expect(try #require(try store.get(key: nullKey)).value == Data("with null")) + #expect(try #require(try store.get(key: collidingKey)).value == Data("without null")) + } + + // MARK: - Error messages + + @Test("Error.writeFailed renders sqlite3_errstr description") + func writeFailedErrorDescription() { + let err = SQLiteKVCache.Error.writeFailed(sqliteCode: 13) // SQLITE_FULL + // Description and localizedDescription both come through the same hook. + #expect(err.description.contains("disk")) + #expect(err.description.contains("(code 13)")) + #expect(err.localizedDescription == err.description) + } + + @Test("Error.databaseUnavailable renders a description") + func databaseUnavailableErrorDescription() { + let err = SQLiteKVCache.Error.databaseUnavailable + #expect(!err.description.isEmpty) + #expect(err.localizedDescription == err.description) + } + + // MARK: - Handle validation + + @Test("isValidHandle accepts ordinary filename components") + func isValidHandleAccepts() { + #expect(SQLiteKVCache.isValidHandle("editorurlcache")) + #expect(SQLiteKVCache.isValidHandle("foo-bar_baz")) + #expect(SQLiteKVCache.isValidHandle("v1.0")) + #expect(SQLiteKVCache.isValidHandle("a")) + } + + @Test("isValidHandle rejects empty, directory references, and unsafe characters") + func isValidHandleRejects() { + #expect(!SQLiteKVCache.isValidHandle("")) + #expect(!SQLiteKVCache.isValidHandle(".")) + #expect(!SQLiteKVCache.isValidHandle("..")) + #expect(!SQLiteKVCache.isValidHandle("foo/bar")) + #expect(!SQLiteKVCache.isValidHandle("foo bar")) + #expect(!SQLiteKVCache.isValidHandle("foo!bar")) + // Uppercase fails — the assert in init runs after lowercasing, + // but the validator 
itself is strict. + #expect(!SQLiteKVCache.isValidHandle("Foo")) + // Non-ASCII fails. + #expect(!SQLiteKVCache.isValidHandle("café")) + #expect(!SQLiteKVCache.isValidHandle("東京")) + } + + @Test("init lowercases the handle before using it as a filename") + func initLowercasesHandle() throws { + let directory = URL.randomTemporaryDirectory + // Two stores with handles that differ only by case end up at the same + // file — the second open finds the first's data. + let mixedCase = SQLiteKVCache(handle: "EditorURLCache", directory: directory, diskCapacity: 0) + try mixedCase.put(key: "k", storageDate: referenceDate, metadata: Data(), value: Data("ok")) + + let lowerCase = SQLiteKVCache(handle: "editorurlcache", directory: directory, diskCapacity: 0) + #expect(try lowerCase.get(key: "k")?.value == Data("ok")) + } + + // MARK: - Concurrent access + + @Test("survives concurrent put/get from many tasks without crashing or losing data") + func concurrentAccess() async throws { + let store = makeStore() + let iterations = 200 + let distinctKeys = 10 + + // Many tasks put and get against overlapping keys. The contract is: + // 1. SQLite's FULLMUTEX serialization keeps the store from crashing + // under concurrent access. + // 2. Every read returns either nothing or a well-formed value written + // by some writer — never a torn / malformed blob. + // Last-writer-wins is *not* asserted here: under FULLMUTEX, "last writer" + // depends on FIFO mutex acquisition, which Swift Concurrency doesn't + // guarantee, and across `Task`s on a cooperative pool the scheduling is + // intentionally unpredictable. + try await withThrowingTaskGroup(of: Void.self) { group in + for i in 0.. SQLiteKVCache.vacuumFreelistThreshold, + "test setup did not build a freelist over threshold; got fraction=\(beforeFraction)" + ) + + // Phase 2: reopen — VACUUM should fire during openAndConfigure. 
+ do { + let store = SQLiteKVCache(handle: "test", directory: directory, diskCapacity: 0) + // Force the lazy open + vacuum. + _ = try store.get(key: "anything") + } + + // After VACUUM, the freelist is reclaimed. + let afterFreelist = try probeIntPragma("freelist_count", at: dbPath) + #expect(afterFreelist == 0, "expected VACUUM to clear freelist; got \(afterFreelist)") + } + + @Test("VACUUM on open is skipped when freelist is below threshold") + func vacuumOnOpenSkipsBelowThreshold() throws { + let directory = URL.randomTemporaryDirectory + let dbPath = directory.appending(path: "test.sqlite").path(percentEncoded: false) + + // A normal write-only workload — no deletes — leaves the freelist near + // zero, well below the threshold. + do { + let store = SQLiteKVCache(handle: "test", directory: directory, diskCapacity: 0) + for i in 0..<10 { + try store.put(key: "k-\(i)", storageDate: referenceDate, metadata: Data(), value: Data("value-\(i)")) + } + } + + let pagesBeforeReopen = try probeIntPragma("page_count", at: dbPath) + + // Reopen — VACUUM should be skipped because the freelist is below threshold. + // We can't directly observe "VACUUM did not run", but we can check that + // the page count is unchanged (a successful VACUUM almost always shrinks + // the file by at least a few pages on a non-trivial DB) and that the + // data round-trips intact. + do { + let store = SQLiteKVCache(handle: "test", directory: directory, diskCapacity: 0) + #expect(try store.get(key: "k-5")?.value == Data("value-5")) + } + + let pagesAfterReopen = try probeIntPragma("page_count", at: dbPath) + #expect( + pagesAfterReopen == pagesBeforeReopen, + "page_count changed across reopen; VACUUM may have run unexpectedly (\(pagesBeforeReopen) -> \(pagesAfterReopen))" + ) + } + + /// Reads an integer-valued PRAGMA from `dbPath` via a separate read-only + /// connection. WAL allows concurrent reads, so this works whether or not + /// a writer connection is currently live against the file. 
+ private func probeIntPragma(_ pragma: String, at dbPath: String) throws -> Int32 { + var probe: OpaquePointer? + #expect(sqlite3_open_v2(dbPath, &probe, SQLITE_OPEN_READONLY, nil) == SQLITE_OK) + defer { sqlite3_close_v2(probe) } + var stmt: OpaquePointer? + #expect(sqlite3_prepare_v2(probe, "PRAGMA \(pragma);", -1, &stmt, nil) == SQLITE_OK) + defer { sqlite3_finalize(stmt) } + #expect(sqlite3_step(stmt) == SQLITE_ROW) + return sqlite3_column_int(stmt, 0) + } + + private func freelistFraction(at dbPath: String) throws -> Double { + let freelist = try probeIntPragma("freelist_count", at: dbPath) + let total = try probeIntPragma("page_count", at: dbPath) + guard total > 0 else { return 0 } + return Double(freelist) / Double(total) + } +}