From 03e5f2b1a345ea39e1b2d0b6ab1841576dce73ba Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:16:11 +0200 Subject: [PATCH 1/5] test: add no-network and install profile gates --- .github/workflows/ci.yml | 41 ++++++ datafog/engine.py | 26 ++-- datafog/models/spacy_nlp.py | 3 +- docs/v5-model-requirements.md | 159 ++++++++++++++++++++++++ setup.py | 2 + tests/test_install_profiles.py | 67 ++++++++++ tests/test_no_network_core.py | 96 ++++++++++++++ tests/test_runtime_dependency_safety.py | 38 ++++++ 8 files changed, 412 insertions(+), 20 deletions(-) create mode 100644 docs/v5-model-requirements.md create mode 100644 tests/test_install_profiles.py create mode 100644 tests/test_no_network_core.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2daa81ba..a3020390 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -154,6 +154,47 @@ jobs: flags: ${{ matrix.install-profile }}-py${{ matrix.python-version }} token: ${{ secrets.CODECOV_TOKEN }} + profile-smoke: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + install-profile: + - core + - cli + - nlp + - nlp-advanced + - ocr + - distributed + - web + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Upgrade pip + run: | + python -m pip install --upgrade pip + + - name: Install dependencies (core) + if: matrix.install-profile == 'core' + run: | + pip install -e ".[test]" + + - name: Install dependencies (profile) + if: matrix.install-profile != 'core' + run: | + pip install -e ".[test,${{ matrix.install-profile }}]" + + - name: Run install profile smoke test + env: + DATAFOG_INSTALL_PROFILE: ${{ matrix.install-profile }} + run: | + pytest tests/test_install_profiles.py -q + wheel-size: runs-on: ubuntu-latest steps: diff --git a/datafog/engine.py b/datafog/engine.py index 6687c24e..1a94e634 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -171,17 +171,13 @@ def _gliner_entities(text: str) -> list[Entity]: def _get_spacy_annotator(): try: from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator - except ImportError: - return _UnavailableAnnotator( - "SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) try: return SpacyPIIAnnotator.create() - except ImportError: - return _UnavailableAnnotator( - "SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) except Exception as exc: return _UnavailableAnnotator( f"SpaCy engine initialization failed: {type(exc).__name__}: {exc}" @@ -192,19 +188,13 @@ def _get_spacy_annotator(): def _get_gliner_annotator(): try: from .processing.text_processing.gliner_annotator import GLiNERAnnotator - except ImportError: - return _UnavailableAnnotator( - "GLiNER engine requires the nlp-advanced extra. " - "Install with: pip install datafog[nlp-advanced]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) try: annotator = GLiNERAnnotator.create() - except ImportError: - return _UnavailableAnnotator( - "GLiNER engine requires the nlp-advanced extra. 
" - "Install with: pip install datafog[nlp-advanced]" - ) + except ImportError as exc: + return _UnavailableAnnotator(str(exc)) except Exception as exc: return _UnavailableAnnotator( f"GLiNER engine initialization failed: {type(exc).__name__}: {exc}" diff --git a/datafog/models/spacy_nlp.py b/datafog/models/spacy_nlp.py index 5257ba3d..15ca8868 100644 --- a/datafog/models/spacy_nlp.py +++ b/datafog/models/spacy_nlp.py @@ -9,7 +9,6 @@ from uuid import uuid4 import spacy -from rich.progress import track from .annotator import AnnotationResult, AnnotatorRequest @@ -53,7 +52,7 @@ def annotate_text(self, text: str, language: str = "en") -> List[AnnotationResul ) doc = self.nlp(annotator_request.text) results = [] - for ent in track(doc.ents, description="Processing entities"): + for ent in doc.ents: result = AnnotationResult( start=ent.start_char, end=ent.end_char, diff --git a/docs/v5-model-requirements.md b/docs/v5-model-requirements.md new file mode 100644 index 00000000..ab8d496f --- /dev/null +++ b/docs/v5-model-requirements.md @@ -0,0 +1,159 @@ +# v5 Model Selection Requirements + +This sheet defines requirements for revisiting DataFog's optional model stack before +locking the v5 core API around specific NLP/OCR backends. It is intentionally a +requirements document, not a model recommendation list. + +## Decision Goals + +- Pick models that improve adoption by making the first successful result easy, + trustworthy, and local by default. +- Keep the core SDK fast and lightweight; model-backed engines remain optional. +- Make model behavior explicit enough that users can defend it in privacy, + security, and compliance reviews. +- Preserve a clean path for future backend swaps without breaking the top-level + v5 API. + +## Must-Haves + +### Runtime And Packaging + +- No model downloads during import, install, or ordinary SDK calls. +- All model downloads must be explicit CLI/API actions or user-provided local + paths. +- The core install must not require ML, OCR, Torch, TensorFlow, Java, Spark, or + system OCR binaries. +- Optional extras must map cleanly to real imports: + - `nlp` for lightweight NLP engines. + - `nlp-advanced` for heavier ML NER engines. + - `ocr` for local image/OCR processing. + - `distributed` for Spark-style processing. +- Missing dependency and missing model errors must explain the exact install or + download command. +- Python 3.10, 3.11, and 3.12 must be supported for advertised optional model + profiles. Python 3.13 support should be advertised only after explicit profile + validation. +- Models must work in offline mode after explicit download/cache preparation. + +### Privacy And Trust + +- No network access during inference. +- No telemetry, remote callbacks, model hub lookups, or license checks during + inference. +- No raw PII should be written to logs, cache names, telemetry, exceptions, or + debug traces by default. +- Model metadata exposed by DataFog should identify model name/version/source + without storing detected raw PII. +- Reversible workflows must be opt-in and clearly separated from ordinary + redaction. + +### Detection Contract + +- Model outputs must include enough structure for the public result contract: + entity type, text/span, start/end offsets, confidence when available, and + engine/source. +- Spans must be deterministic for the same model, text, and settings. +- Entity labels must be mappable into DataFog's canonical entity taxonomy without + surprising users. 
+- Model-backed engines must compose with regex detection without duplicating or + overwriting high-confidence structured entities. +- Failure modes must be predictable: unsupported language, missing model, missing + optional dependency, and low-confidence results should all be distinguishable. + +### Quality Gates + +- Candidate models must be benchmarked on DataFog's target corpora before + adoption. +- Benchmarks must include precision/recall by entity type, not only aggregate F1. +- Structured PII such as email, phone, IP address, SSN, credit cards, dates, and + ZIP/postal codes should remain regex/validator-first unless a model clearly + improves quality. +- NER-style entities such as person, organization, location, address, and + domain-specific identifiers need regression tests with realistic app/log data. +- OCR models must be evaluated separately for text extraction quality and PII + extraction quality after OCR. + +### Operational Fit + +- CPU inference must be acceptable for the default advertised workflow. +- GPU-only models are not acceptable as default engines. +- Model size, cold-start time, memory use, and cache footprint must be measured. +- The model must have a usable open license for commercial SDK users. +- The model or provider must have credible maintenance signals and versioned + artifacts. + +## Nice-To-Haves + +- Strong multilingual support with per-language quality reporting. +- Quantized or small variants that keep local inference practical. +- ONNX or other portable runtime support for future non-Torch deployments. +- Streaming/chunked inference support or predictable behavior across chunk + boundaries. +- Custom entity hints or user-provided label sets. +- Confidence calibration good enough to expose threshold controls. +- Batch inference APIs for logs, CSV, and JSONL workflows. +- Clear model cards with training data notes, limitations, and intended use. +- Support for local cache directories that can be controlled by environment + variable or explicit config. +- Graceful operation on Apple Silicon and common Linux CI runners. + +## Disqualifiers + +- Requires network access for inference. +- Downloads weights implicitly from ordinary SDK calls. +- License is unclear, non-commercial, or incompatible with SDK distribution. +- Requires a hosted API for core value. +- Requires GPU for reasonable first-use behavior. +- Cannot return stable spans or forces only label-level output. +- Emits raw text or entities through logging, telemetry, or callbacks. +- Adds heavyweight dependencies to the core install. +- Breaks Python version support we already advertise. 
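
As a concrete reference point for the detection contract and span-stability requirements above, the sketch below shows the per-entity structure a candidate backend would need to be able to populate. The class and field names are illustrative only and do not describe the v5 public API.

```python
from dataclasses import dataclass


@dataclass(frozen=True)
class DetectedEntity:
    """Hypothetical per-entity shape for the detection contract; not the v5 API."""

    entity_type: str            # canonical DataFog label, e.g. "EMAIL" or "PERSON"
    text: str                   # matched span text; must never leak into logs by default
    start: int                  # character offset into the original input
    end: int                    # exclusive end offset
    engine: str                 # backend that produced the span, e.g. "regex" or "spacy"
    confidence: float | None = None  # populated only when the backend reports one


def spans_are_stable(source: str, entities: list[DetectedEntity]) -> bool:
    """Deterministic-span check: every reported offset range must reproduce its text."""
    return all(source[e.start:e.end] == e.text for e in entities)
```

A backend that cannot fill `start`/`end` offsets satisfying a check like `spans_are_stable` fails the stable-span requirement and falls under the disqualifiers above.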
+ +## Evaluation Matrix + +Each candidate backend should be scored before adoption: + +| Area | Required Evidence | +| --- | --- | +| Install footprint | Extra name, package deps, wheel size impact, system deps | +| Runtime footprint | Cold start, warm latency, memory, CPU/GPU requirements | +| Offline behavior | Explicit download path, local cache path, no-network test | +| Quality | Precision/recall by entity type on DataFog corpora | +| Span quality | Offset correctness and deduplication behavior | +| Privacy | No raw PII logs/cache/telemetry, safe error messages | +| Licensing | Model license, dependency licenses, commercial use notes | +| Maintenance | Release cadence, Python compatibility, issue activity | +| API fit | Entity taxonomy mapping, confidence support, batch/chunk support | +| Docs fit | Model card, limitations, user-facing setup instructions | + +## Candidate Backend Categories To Evaluate + +- Regex plus validators for structured PII and secrets. +- Lightweight NLP NER for person, organization, location, and address entities. +- Advanced local NER models for broader entity coverage and multilingual support. +- OCR text extraction engines for local images/PDF-derived images. +- Document understanding models only if they beat OCR plus text PII extraction + enough to justify their footprint. +- User-provided backend hooks for teams that already have a preferred model. + +## Recommended Selection Policy + +- Default v5 behavior should remain regex/validator-first. +- Model-backed engines should be opt-in by engine, policy, or extra. +- DataFog should prefer smaller, reliable local models over maximum leaderboard + scores if they improve install success and first-use latency. +- Model choices should be version-pinned in docs and CI once advertised. +- A model can be experimental in docs/examples before it becomes part of the + supported contract. + +## Open Questions + +- Do we want one recommended advanced NER model, or a pluggable registry with a + default? +- Should OCR stay Tesseract-first, or should v5 introduce a newer local OCR + default after benchmarking? +- How much multilingual quality is required for v5.0.0 versus a later release? +- Should Python 3.13 optional-profile support be a v4.5 compatibility release, + a v5 launch requirement, or both? +- What maximum model download size is acceptable for the default recommended + advanced profile? 
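
The "no-network test" evidence in the offline-behavior row above can be gathered by running each candidate under a guard that fails loudly on outbound connections, mirroring the approach tests/test_no_network_core.py takes later in this series. A minimal sketch, where the helper name and the harness calls in the usage comment are illustrative and not part of the SDK:

```python
import socket
import urllib.request
from contextlib import contextmanager


@contextmanager
def forbid_network():
    """Coarse guard: fail if evaluated code reaches the common network entry points."""

    def blocked(*_args, **_kwargs):
        raise AssertionError("network access is blocked during offline evaluation")

    saved_create_connection = socket.create_connection
    saved_urlopen = urllib.request.urlopen
    socket.create_connection = blocked
    urllib.request.urlopen = blocked
    try:
        yield
    finally:
        socket.create_connection = saved_create_connection
        urllib.request.urlopen = saved_urlopen


# Usage against a locally cached candidate (placeholder names, not real DataFog calls):
# with forbid_network():
#     model = load_candidate_model("/path/to/local/cache")
#     results = [model.predict(text) for text in sample_texts]
```

This only intercepts the most common entry points; a fully hermetic offline check still belongs in CI with real network isolation.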
diff --git a/setup.py b/setup.py index 7c3b7992..f84c241a 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ ] ocr_deps = [ + "numpy>=1.24.0", "pytesseract>=0.3.0", "Pillow>=12.2.0", "sentencepiece>=0.2.0", @@ -51,6 +52,7 @@ web_deps = [ "fastapi>=0.100.0", "aiohttp>=3.13.4", + "certifi>=2025.4.26", "requests>=2.33.0", ] diff --git a/tests/test_install_profiles.py b/tests/test_install_profiles.py new file mode 100644 index 00000000..e17261be --- /dev/null +++ b/tests/test_install_profiles.py @@ -0,0 +1,67 @@ +import os + +import pytest + +pytestmark = pytest.mark.skipif( + not os.environ.get("DATAFOG_INSTALL_PROFILE"), + reason="install profile smoke tests run only in profile-specific CI jobs", +) + + +def test_install_profile_import_surface() -> None: + profile = os.environ["DATAFOG_INSTALL_PROFILE"] + + if profile == "core": + import datafog + + assert datafog.scan("Email jane@example.com").entities + assert datafog.redact("Email jane@example.com").redacted_text + elif profile == "cli": + from datafog.client import app + + assert app is not None + elif profile == "nlp": + import spacy # noqa: F401 + + from datafog.models.spacy_nlp import SpacyAnnotator + from datafog.processing.text_processing.spacy_pii_annotator import ( + SpacyPIIAnnotator, + ) + + assert SpacyAnnotator is not None + assert SpacyPIIAnnotator is not None + elif profile == "nlp-advanced": + import gliner # noqa: F401 + import torch # noqa: F401 + import transformers # noqa: F401 + + from datafog.processing.text_processing.gliner_annotator import GLiNERAnnotator + + assert GLiNERAnnotator is not None + elif profile == "ocr": + import numpy # noqa: F401 + import pytesseract # noqa: F401 + from PIL import Image # noqa: F401 + + from datafog.processing.image_processing.donut_processor import DonutProcessor + from datafog.processing.image_processing.pytesseract_processor import ( + PytesseractProcessor, + ) + from datafog.services.image_service import ImageService + + assert DonutProcessor is not None + assert ImageService is not None + assert PytesseractProcessor is not None + elif profile == "distributed": + from datafog.processing.spark_processing import pyspark_udfs + from datafog.services.spark_service import SparkService + + pyspark_udfs.ensure_installed("pyspark") + assert SparkService is not None + elif profile == "web": + import aiohttp # noqa: F401 + import certifi # noqa: F401 + import fastapi # noqa: F401 + import requests # noqa: F401 + else: + raise AssertionError(f"unknown DATAFOG_INSTALL_PROFILE: {profile}") diff --git a/tests/test_no_network_core.py b/tests/test_no_network_core.py new file mode 100644 index 00000000..905984f4 --- /dev/null +++ b/tests/test_no_network_core.py @@ -0,0 +1,96 @@ +import os +import subprocess +import sys +from pathlib import Path + + +def _run_isolated_python(script: str) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env["PYTHONPATH"] = str(Path.cwd()) + env.pop("DATAFOG_TELEMETRY", None) + env["DATAFOG_NO_TELEMETRY"] = "1" + env["DO_NOT_TRACK"] = "1" + return subprocess.run( + [sys.executable, "-c", script], + check=True, + env=env, + text=True, + capture_output=True, + ) + + +def test_import_scan_and_redact_do_not_open_network() -> None: + _run_isolated_python( + """ +import socket +import urllib.request + +def blocked(*_args, **_kwargs): + raise AssertionError("network access is blocked in this test") + +socket.create_connection = blocked +urllib.request.urlopen = blocked + +import datafog + +scan_result = datafog.scan("Email jane@example.com or call 
415-555-1212") +assert {entity.type for entity in scan_result.entities} >= {"EMAIL", "PHONE"} + +redact_result = datafog.redact("Email jane@example.com or call 415-555-1212") +assert "jane@example.com" not in redact_result.redacted_text +assert "415-555-1212" not in redact_result.redacted_text +""" + ) + + +def test_core_defaults_do_not_initialize_optional_engines(monkeypatch) -> None: + import datafog + import datafog.engine as engine + + def fail_optional_engine_probe(): + raise AssertionError("core defaults should not initialize optional engines") + + monkeypatch.setattr(engine, "_get_spacy_annotator", fail_optional_engine_probe) + monkeypatch.setattr(engine, "_get_gliner_annotator", fail_optional_engine_probe) + + scan_result = datafog.scan("Email jane@example.com") + assert [entity.type for entity in scan_result.entities] == ["EMAIL"] + + redact_result = datafog.redact("Email jane@example.com") + assert redact_result.redacted_text == "Email [EMAIL_1]" + + guardrail = datafog.protect() + guarded = guardrail.filter("Email jane@example.com") + assert guarded.redacted_text == "Email [EMAIL_1]" + + +def test_import_probes_do_not_load_optional_models() -> None: + _run_isolated_python( + """ +import sys +import types + +def blocked_model_load(*_args, **_kwargs): + raise AssertionError("import should not load optional models") + +spacy = types.ModuleType("spacy") +spacy.load = blocked_model_load +spacy.cli = types.SimpleNamespace(download=blocked_model_load) +spacy.util = types.SimpleNamespace(get_installed_models=lambda: []) +sys.modules["spacy"] = spacy + +gliner = types.ModuleType("gliner") + +class GLiNER: + @staticmethod + def from_pretrained(*_args, **_kwargs): + blocked_model_load() + +gliner.GLiNER = GLiNER +sys.modules["gliner"] = gliner + +import datafog + +assert datafog.scan("Email jane@example.com").entities +""" + ) diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py index adc787ff..9e03697a 100644 --- a/tests/test_runtime_dependency_safety.py +++ b/tests/test_runtime_dependency_safety.py @@ -1,4 +1,6 @@ +import importlib import sys +import types from pathlib import Path import pytest @@ -39,6 +41,42 @@ def load(_model_name): SpacyPIIAnnotator.create() +def test_spacy_engine_missing_model_surfaces_download_guidance( + monkeypatch: pytest.MonkeyPatch, +) -> None: + class FakeSpacy: + @staticmethod + def load(_model_name): + raise OSError("model not installed") + + monkeypatch.setitem(sys.modules, "spacy", FakeSpacy()) + + from datafog import engine + from datafog.exceptions import EngineNotAvailable + + engine._get_spacy_annotator.cache_clear() + try: + with pytest.raises(EngineNotAvailable, match="Download it explicitly"): + engine.scan("Jane Doe", engine="spacy") + finally: + engine._get_spacy_annotator.cache_clear() + + +def test_spacy_helper_does_not_require_rich(monkeypatch: pytest.MonkeyPatch) -> None: + module_name = "datafog.models.spacy_nlp" + monkeypatch.delitem(sys.modules, module_name, raising=False) + + fake_spacy = types.ModuleType("spacy") + fake_spacy.load = lambda _model_name: None + fake_spacy.cli = types.SimpleNamespace(download=lambda _model_name: None) + fake_spacy.util = types.SimpleNamespace(get_installed_models=lambda: []) + monkeypatch.setitem(sys.modules, "spacy", fake_spacy) + + module = importlib.import_module(module_name) + + assert module.SpacyAnnotator is not None + + def test_spark_missing_dependency_requires_explicit_install( monkeypatch: pytest.MonkeyPatch, ) -> None: From 
3fa6e2001a83f489d3c9f0f5a662d85d70e5c881 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:23:53 +0200 Subject: [PATCH 2/5] fix: avoid importing optional modules in telemetry probes --- datafog/telemetry.py | 60 ++++++++++++++++------------------------- tests/test_telemetry.py | 19 +++++++++++++ 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/datafog/telemetry.py b/datafog/telemetry.py index 6b3885a3..7cb7a703 100644 --- a/datafog/telemetry.py +++ b/datafog/telemetry.py @@ -13,9 +13,11 @@ """ import hashlib +import importlib.util import json import os import platform +import sys import threading import time import urllib.request @@ -114,44 +116,28 @@ def _get_duration_bucket(duration_ms: float) -> str: def _detect_installed_extras() -> list: """Probe which optional extras are installed.""" - extras = [] - try: - import spacy # noqa: F401 - - extras.append("nlp") - except ImportError: - pass - - try: - import gliner # noqa: F401 - - extras.append("nlp-advanced") - except ImportError: - pass - - try: - import pytesseract # noqa: F401 - - extras.append("ocr") - except ImportError: - pass - - try: - import typer # noqa: F401 - - extras.append("cli") - except ImportError: - pass - - try: - import pyspark # noqa: F401 - - extras.append("distributed") - except ImportError: - pass - - return extras + def _module_available(module_name: str) -> bool: + module = sys.modules.get(module_name) + if module is not None and getattr(module, "__spec__", None) is None: + return True + try: + return importlib.util.find_spec(module_name) is not None + except (ImportError, ValueError): + return False + + module_to_extra = { + "spacy": "nlp", + "gliner": "nlp-advanced", + "pytesseract": "ocr", + "typer": "cli", + "pyspark": "distributed", + } + return [ + extra + for module_name, extra in module_to_extra.items() + if _module_available(module_name) + ] def _detect_ci() -> bool: diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 3886a1dc..41dcf688 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -1,5 +1,6 @@ """Tests for datafog.telemetry module.""" +import builtins import json import threading import time @@ -569,6 +570,24 @@ def test_detect_installed_extras_returns_list(self): result = _detect_installed_extras() assert isinstance(result, list) + def test_detect_installed_extras_does_not_import_optional_modules( + self, monkeypatch + ): + from datafog.telemetry import _detect_installed_extras + + real_import = builtins.__import__ + optional_modules = {"spacy", "gliner", "pytesseract", "typer", "pyspark"} + + def guarded_import(name, *args, **kwargs): + if name.split(".", 1)[0] in optional_modules: + raise AssertionError(f"imported optional module {name}") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", guarded_import) + + result = _detect_installed_extras() + assert isinstance(result, list) + def test_services_init_does_not_require_aiohttp(self): """TextService should be importable without aiohttp/PIL (services/__init__.py fix).""" from datafog.services.text_service import TextService From 7514af4c47dcbde81169fe016fd01ac3c0d30e16 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:31:06 +0200 Subject: [PATCH 3/5] fix: make telemetry init fully non-blocking --- datafog/telemetry.py | 101 +++++++++++++++++++++++----------------- tests/test_telemetry.py | 27 +++++++++++ 2 files changed, 84 
insertions(+), 44 deletions(-) diff --git a/datafog/telemetry.py b/datafog/telemetry.py index 7cb7a703..5b7201df 100644 --- a/datafog/telemetry.py +++ b/datafog/telemetry.py @@ -156,39 +156,69 @@ def _detect_ci() -> bool: return any(os.environ.get(v) for v in ci_vars) -def _send_event(event_name: str, properties: dict) -> None: - """POST event to PostHog /capture/ endpoint in a daemon thread. +def _post_event(event_name: str, properties: dict) -> None: + """POST event to PostHog /capture/ endpoint. - Fire-and-forget: failures are silently ignored. + Fire-and-forget callers run this in daemon threads. Failures are silently + ignored so telemetry can never affect SDK behavior. """ + try: + payload = json.dumps( + { + "api_key": _POSTHOG_API_KEY, + "event": event_name, + "properties": { + "distinct_id": _get_anonymous_id(), + **properties, + }, + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()), + } + ).encode("utf-8") + + req = urllib.request.Request( + f"{_POSTHOG_HOST}/capture/", + data=payload, + headers={"Content-Type": "application/json"}, + method="POST", + ) + urllib.request.urlopen(req, timeout=5) + except Exception: + pass + + +def _send_event(event_name: str, properties: dict) -> None: + """POST event to PostHog /capture/ endpoint in a daemon thread.""" if not _is_telemetry_enabled(): return - def _post(): + t = threading.Thread(target=_post_event, args=(event_name, properties), daemon=True) + t.start() + + +def _send_init_event() -> None: + """Build and send the process init event without blocking API calls.""" + + def _post_init(): try: - payload = json.dumps( - { - "api_key": _POSTHOG_API_KEY, - "event": event_name, - "properties": { - "distinct_id": _get_anonymous_id(), - **properties, - }, - "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S.000Z", time.gmtime()), - } - ).encode("utf-8") - - req = urllib.request.Request( - f"{_POSTHOG_HOST}/capture/", - data=payload, - headers={"Content-Type": "application/json"}, - method="POST", - ) - urllib.request.urlopen(req, timeout=5) + from .__about__ import __version__ except Exception: - pass - - t = threading.Thread(target=_post, daemon=True) + __version__ = "unknown" + + uname = platform.uname() + _post_event( + "datafog_init", + { + "package_version": __version__, + "python_version": platform.python_version(), + "os": uname.system, + "os_version": uname.release, + "arch": uname.machine, + "installed_extras": _detect_installed_extras(), + "is_ci": _detect_ci(), + }, + ) + + t = threading.Thread(target=_post_init, daemon=True) t.start() @@ -206,24 +236,7 @@ def _ensure_initialized() -> None: if not _is_telemetry_enabled(): return - try: - from .__about__ import __version__ - except Exception: - __version__ = "unknown" - - uname = platform.uname() - _send_event( - "datafog_init", - { - "package_version": __version__, - "python_version": platform.python_version(), - "os": uname.system, - "os_version": uname.release, - "arch": uname.machine, - "installed_extras": _detect_installed_extras(), - "is_ci": _detect_ci(), - }, - ) + _send_init_event() def track_function_call(function_name: str, module: str, **kwargs) -> None: diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 41dcf688..d4033ad8 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -267,6 +267,33 @@ def test_track_function_call_returns_immediately( assert elapsed < 0.1 + def test_track_function_call_does_not_wait_for_init_metadata( + self, monkeypatch, enable_telemetry + ): + import datafog.telemetry as tel + + init_posted = 
threading.Event() + + def slow_detect_installed_extras(): + time.sleep(0.3) + return [] + + def fake_post_event(event_name, properties): + if event_name == "datafog_init": + init_posted.set() + + monkeypatch.setattr( + tel, "_detect_installed_extras", slow_detect_installed_extras + ) + monkeypatch.setattr(tel, "_post_event", fake_post_event) + + start = time.monotonic() + tel.track_function_call("fn", "mod") + elapsed = time.monotonic() - start + + assert elapsed < 0.1 + assert init_posted.wait(1) + def test_network_failure_is_silent(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_function_call From b593cbe1625304655a6c3599796ee607ffe2f7a2 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:40:15 +0200 Subject: [PATCH 4/5] test: harden telemetry nonblocking assertions --- tests/test_telemetry.py | 67 ++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 17 deletions(-) diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index d4033ad8..209131b5 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -244,38 +244,64 @@ class TestNonBlocking: def test_send_event_returns_immediately(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event - # Make urlopen block - mock_urlopen.side_effect = lambda *a, **k: time.sleep(10) + release_network = threading.Event() - start = time.monotonic() - _send_event("test", {"k": "v"}) - elapsed = time.monotonic() - start + def block_until_released(*args, **kwargs): + release_network.wait(5) - # Should return in <100ms even though urlopen blocks for 10s - assert elapsed < 0.1 + mock_urlopen.side_effect = block_until_released + + call_done = threading.Event() + caller = threading.Thread( + target=lambda: (_send_event("test", {"k": "v"}), call_done.set()) + ) + + try: + caller.start() + assert call_done.wait(1) + assert not release_network.is_set() + finally: + release_network.set() + caller.join(1) def test_track_function_call_returns_immediately( self, mock_urlopen, enable_telemetry ): from datafog.telemetry import track_function_call - mock_urlopen.side_effect = lambda *a, **k: time.sleep(10) + release_network = threading.Event() - start = time.monotonic() - track_function_call("fn", "mod") - elapsed = time.monotonic() - start + def block_until_released(*args, **kwargs): + release_network.wait(5) + + mock_urlopen.side_effect = block_until_released - assert elapsed < 0.1 + call_done = threading.Event() + caller = threading.Thread( + target=lambda: (track_function_call("fn", "mod"), call_done.set()) + ) + + try: + caller.start() + assert call_done.wait(1) + assert not release_network.is_set() + finally: + release_network.set() + caller.join(1) def test_track_function_call_does_not_wait_for_init_metadata( self, monkeypatch, enable_telemetry ): import datafog.telemetry as tel + call_done = threading.Event() + detect_started = threading.Event() + release_detect = threading.Event() init_posted = threading.Event() def slow_detect_installed_extras(): - time.sleep(0.3) + detect_started.set() + release_detect.wait(5) return [] def fake_post_event(event_name, properties): @@ -287,11 +313,18 @@ def fake_post_event(event_name, properties): ) monkeypatch.setattr(tel, "_post_event", fake_post_event) - start = time.monotonic() - tel.track_function_call("fn", "mod") - elapsed = time.monotonic() - start + caller = threading.Thread( + target=lambda: (tel.track_function_call("fn", "mod"), call_done.set()) + ) + + try: + 
caller.start() + assert detect_started.wait(1) + assert call_done.wait(1) + finally: + release_detect.set() + caller.join(1) - assert elapsed < 0.1 assert init_posted.wait(1) def test_network_failure_is_silent(self, mock_urlopen, enable_telemetry): From dbdbed5814cd6d325aea9b12c0e818001be285f7 Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Mon, 27 Apr 2026 04:47:47 +0200 Subject: [PATCH 5/5] test: cover optional dependency fallback paths --- tests/test_runtime_dependency_safety.py | 34 ++++++++++++++++++++++ tests/test_telemetry.py | 38 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/tests/test_runtime_dependency_safety.py b/tests/test_runtime_dependency_safety.py index 9e03697a..9410ddc6 100644 --- a/tests/test_runtime_dependency_safety.py +++ b/tests/test_runtime_dependency_safety.py @@ -62,6 +62,40 @@ def load(_model_name): engine._get_spacy_annotator.cache_clear() +def test_spacy_engine_missing_module_surfaces_import_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from datafog import engine + from datafog.exceptions import EngineNotAvailable + + module_name = "datafog.processing.text_processing.spacy_pii_annotator" + monkeypatch.setitem(sys.modules, module_name, None) + + engine._get_spacy_annotator.cache_clear() + try: + with pytest.raises(EngineNotAvailable, match="spacy_pii_annotator"): + engine.scan("Jane Doe", engine="spacy") + finally: + engine._get_spacy_annotator.cache_clear() + + +def test_gliner_engine_missing_module_surfaces_import_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + from datafog import engine + from datafog.exceptions import EngineNotAvailable + + module_name = "datafog.processing.text_processing.gliner_annotator" + monkeypatch.setitem(sys.modules, module_name, None) + + engine._get_gliner_annotator.cache_clear() + try: + with pytest.raises(EngineNotAvailable, match="gliner_annotator"): + engine.scan("Jane Doe", engine="gliner") + finally: + engine._get_gliner_annotator.cache_clear() + + def test_spacy_helper_does_not_require_rich(monkeypatch: pytest.MonkeyPatch) -> None: module_name = "datafog.models.spacy_nlp" monkeypatch.delitem(sys.modules, module_name, raising=False) diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 209131b5..ce5651a0 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -2,6 +2,7 @@ import builtins import json +import sys import threading import time from pathlib import Path @@ -648,6 +649,43 @@ def guarded_import(name, *args, **kwargs): result = _detect_installed_extras() assert isinstance(result, list) + def test_detect_installed_extras_handles_probe_errors(self, monkeypatch): + import datafog.telemetry as tel + + optional_modules = {"spacy", "gliner", "pytesseract", "typer", "pyspark"} + for module_name in optional_modules: + monkeypatch.delitem(sys.modules, module_name, raising=False) + + def broken_find_spec(_module_name): + raise ValueError("invalid module state") + + monkeypatch.setattr(tel.importlib.util, "find_spec", broken_find_spec) + + assert tel._detect_installed_extras() == [] + + def test_send_init_event_uses_unknown_version_when_about_import_fails( + self, monkeypatch + ): + import datafog.telemetry as tel + + init_posted = threading.Event() + captured = {} + + monkeypatch.setitem(sys.modules, "datafog.__about__", None) + + def fake_post_event(event_name, properties): + captured["event_name"] = event_name + captured["properties"] = properties + init_posted.set() + + monkeypatch.setattr(tel, 
"_post_event", fake_post_event) + + tel._send_init_event() + + assert init_posted.wait(1) + assert captured["event_name"] == "datafog_init" + assert captured["properties"]["package_version"] == "unknown" + def test_services_init_does_not_require_aiohttp(self): """TextService should be importable without aiohttp/PIL (services/__init__.py fix).""" from datafog.services.text_service import TextService