From 03e5f2b1a345ea39e1b2d0b6ab1841576dce73ba Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:16:11 +0200 Subject: [PATCH 1/5] test: add no-network and install profile gates --- .github/workflows/ci.yml | 41 ++++++ datafog/engine.py | 26 ++-- datafog/models/spacy_nlp.py | 3 +- docs/v5-model-requirements.md | 159 ++++++++++++++++++++++++ setup.py | 2 + tests/test_install_profiles.py | 67 ++++++++++ tests/test_no_network_core.py | 96 ++++++++++++++ tests/test_runtime_dependency_safety.py | 38 ++++++ 8 files changed, 412 insertions(+), 20 deletions(-) create mode 100644 docs/v5-model-requirements.md create mode 100644 tests/test_install_profiles.py create mode 100644 tests/test_no_network_core.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2daa81ba..a3020390 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,6 +154,47 @@ jobs: flags: ${{ matrix.install-profile }}-py${{ matrix.python-version }} token: ${{ secrets.CODECOV_TOKEN }} + profile-smoke: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + install-profile: + - core + - cli + - nlp + - nlp-advanced + - ocr + - distributed + - web + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + + - name: Install dependencies (core) + if: matrix.install-profile == 'core' + run: | + pip install -e ".[test]" + + - name: Install dependencies (profile) + if: matrix.install-profile != 'core' + run: | + pip install -e ".[test,${{ matrix.install-profile }}]" + + - name: Run install profile smoke test + env: + DATAFOG_INSTALL_PROFILE: ${{ matrix.install-profile }} + run: | + pytest tests/test_install_profiles.py -q + wheel-size: runs-on: ubuntu-latest steps: diff --git a/datafog/engine.py b/datafog/engine.py index 6687c24e..1a94e634 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -171,17 +171,13 @@ def _gliner_entities(text: str) -> list[Entity]: def _get_spacy_annotator(): try: from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator - except ImportError: - return _UnavailableAnnotator( - "SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) try: return SpacyPIIAnnotator.create() - except ImportError: - return _UnavailableAnnotator( - "SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) except Exception as exc: return _UnavailableAnnotator( f"SpaCy engine initialization failed: {type(exc).__name__}: {exc}" @@ -192,19 +188,13 @@ def _get_spacy_annotator(): def _get_gliner_annotator(): try: from .processing.text_processing.gliner_annotator import GLiNERAnnotator - except ImportError: - return _UnavailableAnnotator( - "GLiNER engine requires the nlp-advanced extra. " - "Install with: pip install datafog[nlp-advanced]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) try: annotator = GLiNERAnnotator.create() - except ImportError: - return _UnavailableAnnotator( - "GLiNER engine requires the nlp-advanced extra. 
" - "Install with: pip install datafog[nlp-advanced]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) except Exception as exc: return _UnavailableAnnotator( f"GLiNER engine initialization failed: {type(exc).__name__}: {exc}" diff --git a/datafog/models/spacy_nlp.py b/datafog/models/spacy_nlp.py index 5257ba3d..15ca8868 100644 --- a/datafog/models/spacy_nlp.py +++ b/datafog/models/spacy_nlp.py @@ -9,7 +9,6 @@ from uuid import uuid4 import spacy -from rich.progress import track from .annotator import AnnotationResult, AnnotatorRequest @@ -53,7 +52,7 @@ def annotate_text(self, text: str, language: str = "en") -> List[AnnotationResul ) doc = self.nlp(annotator_request.text) results = [] - for ent in track(doc.ents, description="Processing entities"): + for ent in doc.ents: result = AnnotationResult( start=ent.start_char, end=ent.end_char, diff --git a/docs/v5-model-requirements.md b/docs/v5-model-requirements.md new file mode 100644 index 00000000..ab8d496f --- /dev/null +++ b/docs/v5-model-requirements.md @@ -0,0 +1,159 @@ +# v5 Model Selection Requirements + +This sheet defines requirements for revisiting DataFog's optional model stack before +locking the v5 core API around specific NLP/OCR backends. It is intentionally a +requirements document, not a model recommendation list. + +## Decision Goals + +- Pick models that improve adoption by making the first successful result easy, + trustworthy, and local by default. +- Keep the core SDK fast and lightweight; model-backed engines remain optional. +- Make model behavior explicit enough that users can defend it in privacy, + security, and compliance reviews. +- Preserve a clean path for future backend swaps without breaking the top-level + v5 API. + +## Must-Haves + +### Runtime And Packaging + +- No model downloads during import, install, or ordinary SDK calls. +- All model downloads must be explicit CLI/API actions or user-provided local + paths. +- The core install must not require ML, OCR, Torch, TensorFlow, Java, Spark, or + system OCR binaries. +- Optional extras must map cleanly to real imports: + - `nlp` for lightweight NLP engines. + - `nlp-advanced` for heavier ML NER engines. + - `ocr` for local image/OCR processing. + - `distributed` for Spark-style processing. +- Missing dependency and missing model errors must explain the exact install or + download command. +- Python 3.10, 3.11, and 3.12 must be supported for advertised optional model + profiles. Python 3.13 support should be advertised only after explicit profile + validation. +- Models must work in offline mode after explicit download/cache preparation. + +### Privacy And Trust + +- No network access during inference. +- No telemetry, remote callbacks, model hub lookups, or license checks during + inference. +- No raw PII should be written to logs, cache names, telemetry, exceptions, or + debug traces by default. +- Model metadata exposed by DataFog should identify model name/version/source + without storing detected raw PII. +- Reversible workflows must be opt-in and clearly separated from ordinary + redaction. + +### Detection Contract + +- Model outputs must include enough structure for the public result contract: + entity type, text/span, start/end offsets, confidence when available, and + engine/source. +- Spans must be deterministic for the same model, text, and settings. +- Entity labels must be mappable into DataFog's canonical entity taxonomy without + surprising users. 
+- Model-backed engines must compose with regex detection without duplicating or + overwriting high-confidence structured entities. +- Failure modes must be predictable: unsupported language, missing model, missing + optional dependency, and low-confidence results should all be distinguishable. + +### Quality Gates + +- Candidate models must be benchmarked on DataFog's target corpora before + adoption. +- Benchmarks must include precision/recall by entity type, not only aggregate F1. +- Structured PII such as email, phone, IP address, SSN, credit cards, dates, and + ZIP/postal codes should remain regex/validator-first unless a model clearly + improves quality. +- NER-style entities such as person, organization, location, address, and + domain-specific identifiers need regression tests with realistic app/log data. +- OCR models must be evaluated separately for text extraction quality and PII + extraction quality after OCR. + +### Operational Fit + +- CPU inference must be acceptable for the default advertised workflow. +- GPU-only models are not acceptable as default engines. +- Model size, cold-start time, memory use, and cache footprint must be measured. +- The model must have a usable open license for commercial SDK users. +- The model or provider must have credible maintenance signals and versioned + artifacts. + +## Nice-To-Haves + +- Strong multilingual support with per-language quality reporting. +- Quantized or small variants that keep local inference practical. +- ONNX or other portable runtime support for future non-Torch deployments. +- Streaming/chunked inference support or predictable behavior across chunk + boundaries. +- Custom entity hints or user-provided label sets. +- Confidence calibration good enough to expose threshold controls. +- Batch inference APIs for logs, CSV, and JSONL workflows. +- Clear model cards with training data notes, limitations, and intended use. +- Support for local cache directories that can be controlled by environment + variable or explicit config. +- Graceful operation on Apple Silicon and common Linux CI runners. + +## Disqualifiers + +- Requires network access for inference. +- Downloads weights implicitly from ordinary SDK calls. +- License is unclear, non-commercial, or incompatible with SDK distribution. +- Requires a hosted API for core value. +- Requires GPU for reasonable first-use behavior. +- Cannot return stable spans or forces only label-level output. +- Emits raw text or entities through logging, telemetry, or callbacks. +- Adds heavyweight dependencies to the core install. +- Breaks Python version support we already advertise. 
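
As a concrete reference point for the detection contract and span-stability requirements above, the sketch below shows the per-entity structure a candidate backend would need to be able to populate. The class and field names are illustrative only and do not describe the v5 public API.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class DetectedEntity:
    """Hypothetical per-entity shape for the detection contract; not the v5 API."""

    entity_type: str            # canonical DataFog label, e.g. "EMAIL" or "PERSON"
    text: str                   # matched span text; must never leak into logs by default
    start: int                  # character offset into the original input
    end: int                    # exclusive end offset
    engine: str                 # backend that produced the span, e.g. "regex" or "spacy"
    confidence: float | None = None  # populated only when the backend reports one


def spans_are_stable(source: str, entities: list[DetectedEntity]) -> bool:
    """Deterministic-span check: every reported offset range must reproduce its text."""
    return all(source[e.start:e.end] == e.text for e in entities)
```

A backend that cannot fill `start`/`end` offsets satisfying a check like `spans_are_stable` fails the stable-span requirement and falls under the disqualifiers above.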
+ +## Evaluation Matrix + +Each candidate backend should be scored before adoption: + +| Area | Required Evidence | +| --- | --- | +| Install footprint | Extra name, package deps, wheel size impact, system deps | +| Runtime footprint | Cold start, warm latency, memory, CPU/GPU requirements | +| Offline behavior | Explicit download path, local cache path, no-network test | +| Quality | Precision/recall by entity type on DataFog corpora | +| Span quality | Offset correctness and deduplication behavior | +| Privacy | No raw PII logs/cache/telemetry, safe error messages | +| Licensing | Model license, dependency licenses, commercial use notes | +| Maintenance | Release cadence, Python compatibility, issue activity | +| API fit | Entity taxonomy mapping, confidence support, batch/chunk support | +| Docs fit | Model card, limitations, user-facing setup instructions | + +## Candidate Backend Categories To Evaluate + +- Regex plus validators for structured PII and secrets. +- Lightweight NLP NER for person, organization, location, and address entities. +- Advanced local NER models for broader entity coverage and multilingual support. +- OCR text extraction engines for local images/PDF-derived images. +- Document understanding models only if they beat OCR plus text PII extraction + enough to justify their footprint. +- User-provided backend hooks for teams that already have a preferred model. + +## Recommended Selection Policy + +- Default v5 behavior should remain regex/validator-first. +- Model-backed engines should be opt-in by engine, policy, or extra. +- DataFog should prefer smaller, reliable local models over maximum leaderboard + scores if they improve install success and first-use latency. +- Model choices should be version-pinned in docs and CI once advertised. +- A model can be experimental in docs/examples before it becomes part of the + supported contract. + +## Open Questions + +- Do we want one recommended advanced NER model, or a pluggable registry with a + default? +- Should OCR stay Tesseract-first, or should v5 introduce a newer local OCR + default after benchmarking? +- How much multilingual quality is required for v5.0.0 versus a later release? +- Should Python 3.13 optional-profile support be a v4.5 compatibility release, + a v5 launch requirement, or both? +- What maximum model download size is acceptable for the default recommended + advanced profile? 
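
The "no-network test" evidence in the offline-behavior row above can be gathered by running each candidate under a guard that fails loudly on outbound connections, mirroring the approach tests/test_no_network_core.py takes later in this series. A minimal sketch, where the helper name and the harness calls in the usage comment are illustrative and not part of the SDK:

```python
import socket
import urllib.request
from contextlib import contextmanager


@contextmanager
def forbid_network():
    """Coarse guard: fail if evaluated code reaches the common network entry points."""

    def blocked(*_args, **_kwargs):
        raise AssertionError("network access is blocked during offline evaluation")

    saved_create_connection = socket.create_connection
    saved_urlopen = urllib.request.urlopen
    socket.create_connection = blocked
    urllib.request.urlopen = blocked
    try:
        yield
    finally:
        socket.create_connection = saved_create_connection
        urllib.request.urlopen = saved_urlopen


# Usage against a locally cached candidate (placeholder names, not real DataFog calls):
# with forbid_network():
#     model = load_candidate_model("/path/to/local/cache")
#     results = [model.predict(text) for text in sample_texts]
```

This only intercepts the most common entry points; a fully hermetic offline check still belongs in CI with real network isolation.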
diff --git a/setup.py b/setup.py index 7c3b7992..f84c241a 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ ] ocr_deps = [ + "numpy>=1.24.0", "pytesseract>=0.3.0", "Pillow>=12.2.0", "sentencepiece>=0.2.0", @@ -51,6 +52,7 @@ web_deps = [ "fastapi>=0.100.0", "aiohttp>=3.13.4", + "certifi>=2025.4.26", "requests>=2.33.0", ] diff --git a/tests/test_install_profiles.py b/tests/test_install_profiles.py new file mode 100644 index 00000000..e17261be --- /dev/null +++ b/tests/test_install_profiles.py @@ -0,0 +1,67 @@ +import os + +import pytest + +pytestmark = pytest.mark.skipif( + not os.environ.get("DATAFOG_INSTALL_PROFILE"), + reason="install profile smoke tests run only in profile-specific CI jobs", +) + + +def test_install_profile_import_surface() -> None: + profile = os.environ["DATAFOG_INSTALL_PROFILE"] + + if profile == "core": + import datafog + + assert datafog.scan("Email jane@example.com").entities + assert datafog.redact("Email jane@example.com").redacted_text + elif profile == "cli": + from datafog.client import app + + assert app is not None + elif profile == "nlp": + import spacy # noqa: F401 + + from datafog.models.spacy_nlp import SpacyAnnotator + from datafog.processing.text_processing.spacy_pii_annotator import ( + SpacyPIIAnnotator, + ) + + assert SpacyAnnotator is not None + assert SpacyPIIAnnotator is not None + elif profile == "nlp-advanced": + import gliner # noqa: F401 + import torch # noqa: F401 + import transformers # noqa: F401 + + from datafog.processing.text_processing.gliner_annotator import GLiNERAnnotator + + assert GLiNERAnnotator is not None + elif profile == "ocr": + import numpy # noqa: F401 + import pytesseract # noqa: F401 + from PIL import Image # noqa: F401 + + from datafog.processing.image_processing.donut_processor import DonutProcessor + from datafog.processing.image_processing.pytesseract_processor import ( + PytesseractProcessor, + ) + from datafog.services.image_service import ImageService + + assert DonutProcessor is not None + assert ImageService is not None + assert PytesseractProcessor is not None + elif profile == "distributed": + from datafog.processing.spark_processing import pyspark_udfs + from datafog.services.spark_service import SparkService + + pyspark_udfs.ensure_installed("pyspark") + assert SparkService is not None + elif profile == "web": + import aiohttp # noqa: F401 + import certifi # noqa: F401 + import fastapi # noqa: F401 + import requests # noqa: F401 + else: + raise AssertionError(f"unknown DATAFOG_INSTALL_PROFILE: {profile}") diff --git a/tests/test_no_network_core.py b/tests/test_no_network_core.py new file mode 100644 index 00000000..905984f4 --- /dev/null +++ b/tests/test_no_network_core.py @@ -0,0 +1,96 @@ +import os +import subprocess +import sys +from pathlib import Path + + +def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env["PYTHONPATH"] = str(Path.cwd()) + env.pop("DATAFOG_TELEMETRY", None) + env["DATAFOG_NO_TELEMETRY"] = "1" + env["DO_NOT_TRACK"] = "1" + return subprocess.run( + [sys.executable, "-c", script], + check=True, + env=env, + text=True, + capture_output=True, + ) + + +def test_import_scan_and_redact_do_not_open_network() -> None: + _run_isolated_python( + """ +import socket +import urllib.request + +def blocked(*_args, **_kwargs): + raise AssertionError("network access is blocked in this test") + +socket.create_connection = blocked +urllib.request.urlopen = blocked + +import datafog + +scan_result = datafog.scan("Email jane@example.com or call 
415-555-1212") +assert {entity.type for entity in scan_result.entities} >= {"EMAIL", "PHONE"} + +redact_result = datafog.redact("Email jane@example.com or call 415-555-1212") +assert "jane@example.com" not in redact_result.redacted_text +assert "415-555-1212" not in redact_result.redacted_text +""" + ) + + +def test_core_defaults_do_not_initialize_optional_engines(monkeypatch) -> None: + import datafog + import datafog.engine as engine + + def fail_optional_engine_probe(): + raise AssertionError("core defaults should not initialize optional engines") + + monkeypatch.setattr(engine, "_get_spacy_annotator", fail_optional_engine_probe) + monkeypatch.setattr(engine, "_get_gliner_annotator", fail_optional_engine_probe) + + scan_result = datafog.scan("Email jane@example.com") + assert [entity.type for entity in scan_result.entities] == ["EMAIL"] + + redact_result = datafog.redact("Email jane@example.com") + assert redact_result.redacted_text == "Email [EMAIL_1]" + + guardrail = datafog.protect() + guarded = guardrail.filter("Email jane@example.com") + assert guarded.redacted_text == "Email [EMAIL_1]" + + +def test_import_probes_do_not_load_optional_models() -> None: + _run_isolated_python( + """ +import sys +import types + +def blocked_model_load(*_args, **_kwargs): + raise AssertionError("import should not load optional models") + +spacy = types.ModuleType("spacy") +spacy.load = blocked_model_load +spacy.cli = types.SimpleNamespace(download=blocked_model_load) +spacy.util = types.SimpleNamespace(get_installed_models=lambda: []) +sys.modules["spacy"] = spacy + +gliner = types.ModuleType("gliner") + +class GLiNER: + @staticmethod + def from_pretrained(*_args, **_kwargs): + blocked_model_load() + +gliner.GLiNER = GLiNER +sys.modules["gliner"] = gliner + +import datafog + +assert datafog.scan("Email jane@example.com").entities +""" + ) diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py index adc787ff..9e03697a 100644 --- a/tests/test_runtime_dependency_safety.py +++ b/tests/test_runtime_dependency_safety.py @@ -1,4 +1,6 @@ +import importlib import sys +import types from pathlib import Path import pytest @@ -39,6 +41,42 @@ def load(_model_name): SpacyPIIAnnotator.create() +def test_spacy_engine_missing_model_surfaces_download_guidance( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeSpacy: + @staticmethod + def load(_model_name): + raise OSError("model not installed") + + monkeypatch.setitem(sys.modules, "spacy", FakeSpacy()) + + from datafog import engine + from datafog.exceptions import EngineNotAvailable + + engine._get_spacy_annotator.cache_clear() + try: + with pytest.raises(EngineNotAvailable, match="Download it explicitly"): + engine.scan("Jane Doe", engine="spacy") + finally: + engine._get_spacy_annotator.cache_clear() + + +def test_spacy_helper_does_not_require_rich(monkeypatch: pytest.MonkeyPatch) -> None: + module_name = "datafog.models.spacy_nlp" + monkeypatch.delitem(sys.modules, module_name, raising=False) + + fake_spacy = types.ModuleType("spacy") + fake_spacy.load = lambda _model_name: None + fake_spacy.cli = types.SimpleNamespace(download=lambda _model_name: None) + fake_spacy.util = types.SimpleNamespace(get_installed_models=lambda: []) + monkeypatch.setitem(sys.modules, "spacy", fake_spacy) + + module = importlib.import_module(module_name) + + assert module.SpacyAnnotator is not None + + def test_spark_missing_dependency_requires_explicit_install( monkeypatch: pytest.MonkeyPatch, ) -> None: From 
3fa6e2001a83f489d3c9f0f5a662d85d70e5c881 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:23:53 +0200 Subject: [PATCH 2/5] fix: avoid importing optional modules in telemetry probes --- datafog/telemetry.py | 60 ++++++++++++++++------------------------- tests/test_telemetry.py | 19 +++++++++++++ 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/datafog/telemetry.py b/datafog/telemetry.py index 6b3885a3..7cb7a703 100644 --- a/datafog/telemetry.py +++ b/datafog/telemetry.py @@ -13,9 +13,11 @@ """ import hashlib +import importlib.util import json import os import platform +import sys import threading import time import urllib.request @@ -114,44 +116,28 @@ def _get_duration_bucket(duration_ms: float) -> str: def _detect_installed_extras() -> list: """Probe which optional extras are installed.""" - extras = [] - try: - import spacy # noqa: F401 - - extras.append("nlp") - except ImportError: - pass - - try: - import gliner # noqa: F401 - - extras.append("nlp-advanced") - except ImportError: - pass - - try: - import pytesseract # noqa: F401 - - extras.append("ocr") - except ImportError: - pass - - try: - import typer # noqa: F401 - - extras.append("cli") - except ImportError: - pass - - try: - import pyspark # noqa: F401 - - extras.append("distributed") - except ImportError: - pass - - return extras + def _module_available(module_name: str) -> bool: + module = sys.modules.get(module_name) + if module is not None and getattr(module, "__spec__", None) is None: + return True + try: + return importlib.util.find_spec(module_name) is not None + except (ImportError, ValueError): + return False + + module_to_extra = { + "spacy": "nlp", + "gliner": "nlp-advanced", + "pytesseract": "ocr", + "typer": "cli", + "pyspark": "distributed", + } + return [ + extra + for module_name, extra in module_to_extra.items() + if _module_available(module_name) + ] def _detect_ci() -> bool: diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 3886a1dc..41dcf688 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -1,5 +1,6 @@ """Tests for datafog.telemetry module.""" +import builtins import json import threading import time @@ -569,6 +570,24 @@ def test_detect_installed_extras_returns_list(self): result = _detect_installed_extras() assert isinstance(result, list) + def test_detect_installed_extras_does_not_import_optional_modules( + self, monkeypatch + ): + from datafog.telemetry import _detect_installed_extras + + real_import = builtins.__import__ + optional_modules = {"spacy", "gliner", "pytesseract", "typer", "pyspark"} + + def guarded_import(name, *args, **kwargs): + if name.split(".", 1)[0] in optional_modules: + raise AssertionError(f"imported optional module {name}") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", guarded_import) + + result = _detect_installed_extras() + assert isinstance(result, list) + def test_services_init_does_not_require_aiohttp(self): """TextService should be importable without aiohttp/PIL (services/__init__.py fix).""" from datafog.services.text_service import TextService From 7514af4c47dcbde81169fe016fd01ac3c0d30e16 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:31:06 +0200 Subject: [PATCH 3/5] fix: make telemetry init fully non-blocking --- datafog/telemetry.py | 101 +++++++++++++++++++++++----------------- tests/test_telemetry.py | 27 +++++++++++ 2 files changed, 84 
insertions(+), 44 deletions(-) diff --git a/datafog/telemetry.py b/datafog/telemetry.py index 7cb7a703..5b7201df 100644 --- a/datafog/telemetry.py +++ b/datafog/telemetry.py @@ -156,39 +156,69 @@ def _detect_ci() -> bool: return any(os.environ.get(v) for v in ci_vars) -def _send_event(event_name: str, properties: dict) -> None: - """POST event to PostHog /capture/ endpoint in a daemon thread. +def _post_event(event_name: str, properties: dict) -> None: + """POST event to PostHog /capture/ endpoint. - Fire-and-forget: failures are silently ignored. + Fire-and-forget callers run this in daemon threads. Failures are silently + ignored so telemetry can never affect SDK behavior. """ + try: + payload = json.dumps( + { + "api_key": _POSTHOG_API_KEY, + "event": event_name, + "properties": { + "distinct_id": _get_anonymous_id(), + **properties, + }, + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()), + } + ).encode("utf-8") + + req = urllib.request.Request( + f"{_POSTHOG_HOST}/capture/", + data=payload, + headers={"Content-Type": "application/json"}, + method="POST", + ) + urllib.request.urlopen(req, timeout=5) + except Exception: + pass + + +def _send_event(event_name: str, properties: dict) -> None: + """POST event to PostHog /capture/ endpoint in a daemon thread.""" if not _is_telemetry_enabled(): return - def _post(): + t = threading.Thread(target=_post_event, args=(event_name, properties), daemon=True) + t.start() + + +def _send_init_event() -> None: + """Build and send the process init event without blocking API calls.""" + + def _post_init(): try: - payload = json.dumps( - { - "api_key": _POSTHOG_API_KEY, - "event": event_name, - "properties": { - "distinct_id": _get_anonymous_id(), - **properties, - }, - "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()), - } - ).encode("utf-8") - - req = urllib.request.Request( - f"{_POSTHOG_HOST}/capture/", - data=payload, - headers={"Content-Type": "application/json"}, - method="POST", - ) - urllib.request.urlopen(req, timeout=5) + from .__about__ import __version__ except Exception: - pass - - t = threading.Thread(target=_post, daemon=True) + __version__ = "unknown" + + uname = platform.uname() + _post_event( + "datafog_init", + { + "package_version": __version__, + "python_version": platform.python_version(), + "os": uname.system, + "os_version": uname.release, + "arch": uname.machine, + "installed_extras": _detect_installed_extras(), + "is_ci": _detect_ci(), + }, + ) + + t = threading.Thread(target=_post_init, daemon=True) t.start() @@ -206,24 +236,7 @@ def _ensure_initialized() -> None: if not _is_telemetry_enabled(): return - try: - from .__about__ import __version__ - except Exception: - __version__ = "unknown" - - uname = platform.uname() - _send_event( - "datafog_init", - { - "package_version": __version__, - "python_version": platform.python_version(), - "os": uname.system, - "os_version": uname.release, - "arch": uname.machine, - "installed_extras": _detect_installed_extras(), - "is_ci": _detect_ci(), - }, - ) + _send_init_event() def track_function_call(function_name: str, module: str, **kwargs) -> None: diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 41dcf688..d4033ad8 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -267,6 +267,33 @@ def test_track_function_call_returns_immediately( assert elapsed < 0.1 + def test_track_function_call_does_not_wait_for_init_metadata( + self, monkeypatch, enable_telemetry + ): + import datafog.telemetry as tel + + init_posted = 
threading.Event() + + def slow_detect_installed_extras(): + time.sleep(0.3) + return [] + + def fake_post_event(event_name, properties): + if event_name == "datafog_init": + init_posted.set() + + monkeypatch.setattr( + tel, "_detect_installed_extras", slow_detect_installed_extras + ) + monkeypatch.setattr(tel, "_post_event", fake_post_event) + + start = time.monotonic() + tel.track_function_call("fn", "mod") + elapsed = time.monotonic() - start + + assert elapsed < 0.1 + assert init_posted.wait(1) + def test_network_failure_is_silent(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_function_call From b593cbe1625304655a6c3599796ee607ffe2f7a2 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:40:15 +0200 Subject: [PATCH 4/5] test: harden telemetry nonblocking assertions --- tests/test_telemetry.py | 67 ++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index d4033ad8..209131b5 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -244,38 +244,64 @@ class TestNonBlocking: def test_send_event_returns_immediately(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event - # Make urlopen block - mock_urlopen.side_effect = lambda *a, **k: time.sleep(10) + release_network = threading.Event() - start = time.monotonic() - _send_event("test", {"k": "v"}) - elapsed = time.monotonic() - start + def block_until_released(*args, **kwargs): + release_network.wait(5) - # Should return in <100ms even though urlopen blocks for 10s - assert elapsed < 0.1 + mock_urlopen.side_effect = block_until_released + + call_done = threading.Event() + caller = threading.Thread( + target=lambda: (_send_event("test", {"k": "v"}), call_done.set()) + ) + + try: + caller.start() + assert call_done.wait(1) + assert not release_network.is_set() + finally: + release_network.set() + caller.join(1) def test_track_function_call_returns_immediately( self, mock_urlopen, enable_telemetry ): from datafog.telemetry import track_function_call - mock_urlopen.side_effect = lambda *a, **k: time.sleep(10) + release_network = threading.Event() - start = time.monotonic() - track_function_call("fn", "mod") - elapsed = time.monotonic() - start + def block_until_released(*args, **kwargs): + release_network.wait(5) + + mock_urlopen.side_effect = block_until_released - assert elapsed < 0.1 + call_done = threading.Event() + caller = threading.Thread( + target=lambda: (track_function_call("fn", "mod"), call_done.set()) + ) + + try: + caller.start() + assert call_done.wait(1) + assert not release_network.is_set() + finally: + release_network.set() + caller.join(1) def test_track_function_call_does_not_wait_for_init_metadata( self, monkeypatch, enable_telemetry ): import datafog.telemetry as tel + call_done = threading.Event() + detect_started = threading.Event() + release_detect = threading.Event() init_posted = threading.Event() def slow_detect_installed_extras(): - time.sleep(0.3) + detect_started.set() + release_detect.wait(5) return [] def fake_post_event(event_name, properties): @@ -287,11 +313,18 @@ def fake_post_event(event_name, properties): ) monkeypatch.setattr(tel, "_post_event", fake_post_event) - start = time.monotonic() - tel.track_function_call("fn", "mod") - elapsed = time.monotonic() - start + caller = threading.Thread( + target=lambda: (tel.track_function_call("fn", "mod"), call_done.set()) + ) + + try: + 
caller.start() + assert detect_started.wait(1) + assert call_done.wait(1) + finally: + release_detect.set() + caller.join(1) - assert elapsed < 0.1 assert init_posted.wait(1) def test_network_failure_is_silent(self, mock_urlopen, enable_telemetry): From dbdbed5814cd6d325aea9b12c0e818001be285f7 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:47:47 +0200 Subject: [PATCH 5/5] test: cover optional dependency fallback paths --- tests/test_runtime_dependency_safety.py | 34 ++++++++++++++++++++++ tests/test_telemetry.py | 38 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py index 9e03697a..9410ddc6 100644 --- a/tests/test_runtime_dependency_safety.py +++ b/tests/test_runtime_dependency_safety.py @@ -62,6 +62,40 @@ def load(_model_name): engine._get_spacy_annotator.cache_clear() +def test_spacy_engine_missing_module_surfaces_import_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from datafog import engine + from datafog.exceptions import EngineNotAvailable + + module_name = "datafog.processing.text_processing.spacy_pii_annotator" + monkeypatch.setitem(sys.modules, module_name, None) + + engine._get_spacy_annotator.cache_clear() + try: + with pytest.raises(EngineNotAvailable, match="spacy_pii_annotator"): + engine.scan("Jane Doe", engine="spacy") + finally: + engine._get_spacy_annotator.cache_clear() + + +def test_gliner_engine_missing_module_surfaces_import_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from datafog import engine + from datafog.exceptions import EngineNotAvailable + + module_name = "datafog.processing.text_processing.gliner_annotator" + monkeypatch.setitem(sys.modules, module_name, None) + + engine._get_gliner_annotator.cache_clear() + try: + with pytest.raises(EngineNotAvailable, match="gliner_annotator"): + engine.scan("Jane Doe", engine="gliner") + finally: + engine._get_gliner_annotator.cache_clear() + + def test_spacy_helper_does_not_require_rich(monkeypatch: pytest.MonkeyPatch) -> None: module_name = "datafog.models.spacy_nlp" monkeypatch.delitem(sys.modules, module_name, raising=False) diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 209131b5..ce5651a0 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -2,6 +2,7 @@ import builtins import json +import sys import threading import time from pathlib import Path @@ -648,6 +649,43 @@ def guarded_import(name, *args, **kwargs): result = _detect_installed_extras() assert isinstance(result, list) + def test_detect_installed_extras_handles_probe_errors(self, monkeypatch): + import datafog.telemetry as tel + + optional_modules = {"spacy", "gliner", "pytesseract", "typer", "pyspark"} + for module_name in optional_modules: + monkeypatch.delitem(sys.modules, module_name, raising=False) + + def broken_find_spec(_module_name): + raise ValueError("invalid module state") + + monkeypatch.setattr(tel.importlib.util, "find_spec", broken_find_spec) + + assert tel._detect_installed_extras() == [] + + def test_send_init_event_uses_unknown_version_when_about_import_fails( + self, monkeypatch + ): + import datafog.telemetry as tel + + init_posted = threading.Event() + captured = {} + + monkeypatch.setitem(sys.modules, "datafog.__about__", None) + + def fake_post_event(event_name, properties): + captured["event_name"] = event_name + captured["properties"] = properties + init_posted.set() + + monkeypatch.setattr(tel, 
"_post_event", fake_post_event) + + tel._send_init_event() + + assert init_posted.wait(1) + assert captured["event_name"] == "datafog_init" + assert captured["properties"]["package_version"] == "unknown" + def test_services_init_does_not_require_aiohttp(self): """TextService should be importable without aiohttp/PIL (services/__init__.py fix).""" from datafog.services.text_service import TextService