From bd305913b843d8ab8735cd65828e180ad58233af Mon Sep 17 00:00:00 2001 From: sidmohan0 <61345237+sidmohan0@users.noreply.github.com> Date: Sun, 26 Apr 2026 23:35:11 +0200 Subject: [PATCH] Make telemetry opt-in for v4.4 --- README.md | 14 ++++- datafog/telemetry.py | 20 +++++-- docs/v5-compatibility-matrix.rst | 4 +- scripts/generate_changelog.py | 37 +++++++++++++ tests/test_telemetry.py | 95 +++++++++++++++++++++++--------- 5 files changed, 135 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 794defcb..e3a211b3 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,10 @@ pip install datafog[nlp-advanced] pip install datafog[all] ``` +Python 3.13 support is certified for the core SDK and CLI. Optional extras such +as `nlp`, `nlp-advanced`, `ocr`, `distributed`, and `all` are available but not +yet certified on Python 3.13. + ## Quick Start ```python @@ -132,9 +136,15 @@ datafog hash-text "john@example.com" ## Telemetry -DataFog includes anonymous telemetry by default. +DataFog telemetry is disabled by default. + +To opt in: + +```bash +export DATAFOG_TELEMETRY=1 +``` -To opt out: +To force telemetry off: ```bash export DATAFOG_NO_TELEMETRY=1 diff --git a/datafog/telemetry.py b/datafog/telemetry.py index fb7e3137..6b3885a3 100644 --- a/datafog/telemetry.py +++ b/datafog/telemetry.py @@ -1,10 +1,13 @@ """ -Anonymous, opt-out usage telemetry for DataFog. +Anonymous, opt-in usage telemetry for DataFog. Collects anonymous usage data to help the DataFog team understand which engines, functions, and features are actually used. No text content is ever sent. -Opt out by setting either environment variable: +Telemetry is disabled by default. Opt in by setting: + DATAFOG_TELEMETRY=1 + +Force telemetry off by setting either environment variable: DATAFOG_NO_TELEMETRY=1 DO_NOT_TRACK=1 """ @@ -29,13 +32,18 @@ _scope = threading.local() +def _env_truthy(name: str) -> bool: + """Return True when an environment variable explicitly opts in/out.""" + return os.environ.get(name, "").strip().lower() in {"1", "true", "yes", "on"} + + def _is_telemetry_enabled() -> bool: - """Check if telemetry is enabled (opt-out via env vars).""" - if os.environ.get("DATAFOG_NO_TELEMETRY", "").strip() == "1": + """Check if telemetry is enabled (opt-in, with opt-out overrides).""" + if _env_truthy("DATAFOG_NO_TELEMETRY"): return False - if os.environ.get("DO_NOT_TRACK", "").strip() == "1": + if _env_truthy("DO_NOT_TRACK"): return False - return True + return _env_truthy("DATAFOG_TELEMETRY") def _get_anonymous_id() -> str: diff --git a/docs/v5-compatibility-matrix.rst b/docs/v5-compatibility-matrix.rst index 41c9e339..b95483e6 100644 --- a/docs/v5-compatibility-matrix.rst +++ b/docs/v5-compatibility-matrix.rst @@ -127,9 +127,9 @@ Compatibility Matrix - Deferred for v5.1+ overhaul. - Keep compatibility where practical; no runtime package installs. * - Telemetry - - Default-on opt-out telemetry. + - Opt-in telemetry. - Trust-critical behavior change. - - Make opt-in or no-network-by-default. + - Keep no-network-by-default. * - ``*_lean`` and ``*_original`` modules - Parallel historical implementations. - Remove or make private after migration path. diff --git a/scripts/generate_changelog.py b/scripts/generate_changelog.py index 293ac5b8..23babcf8 100755 --- a/scripts/generate_changelog.py +++ b/scripts/generate_changelog.py @@ -7,6 +7,16 @@ from datetime import datetime +def get_current_version(): + """Read the current package version from datafog/__about__.py.""" + try: + with open("datafog/__about__.py") as f: + match = re.search(r'^__version__ = "([^"]+)"', f.read(), re.M) + return match.group(1) if match else None + except OSError: + return None + + def get_latest_tag(): """Get the latest git tag.""" try: @@ -65,6 +75,7 @@ def generate_changelog(beta=False, alpha=False): """Generate changelog content.""" latest_tag = get_latest_tag() commits = get_commits_since_tag(latest_tag) + current_version = get_current_version() if not commits: return "No changes since last release." @@ -85,6 +96,32 @@ def generate_changelog(beta=False, alpha=False): changelog = "# What's New\n\n" changelog += f"*Released: {datetime.now().strftime('%Y-%m-%d')}*\n\n" + if not alpha and not beta and current_version == "4.4.0": + changelog += "## Python 3.13 Support Scope\n\n" + changelog += ( + "Python 3.13 support is certified for the core SDK and CLI: " + "`pip install datafog` and `pip install datafog[cli]`.\n\n" + ) + changelog += ( + "Optional extras including `nlp`, `nlp-advanced`, `ocr`, " + "`distributed`, and `all` are available but not yet certified on " + "Python 3.13. They will be validated separately based on user " + "demand.\n\n" + ) + changelog += "## v5 Migration Bridge\n\n" + changelog += ( + "This release adds the v5-preview top-level APIs `datafog.scan`, " + "`datafog.redact`, and `datafog.protect` while keeping the legacy " + "`datafog.detect` and `datafog.process` APIs working with targeted " + "migration warnings.\n\n" + ) + changelog += "## Privacy Defaults\n\n" + changelog += ( + "Telemetry is now opt-in. DataFog does not send telemetry unless " + "`DATAFOG_TELEMETRY=1` is explicitly set. `DATAFOG_NO_TELEMETRY=1` " + "and `DO_NOT_TRACK=1` continue to force telemetry off.\n\n" + ) + if categories["features"]: changelog += "## 🚀 New Features\n" for commit in categories["features"]: diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 9c69e178..3886a1dc 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -28,13 +28,20 @@ def _reset_telemetry_state(): def _clean_state(monkeypatch): """Ensure clean telemetry state for every test and disable network.""" _reset_telemetry_state() - # Default: telemetry enabled but network mocked + # Default: telemetry disabled unless a test opts in explicitly. + monkeypatch.delenv("DATAFOG_TELEMETRY", raising=False) monkeypatch.delenv("DATAFOG_NO_TELEMETRY", raising=False) monkeypatch.delenv("DO_NOT_TRACK", raising=False) yield _reset_telemetry_state() +@pytest.fixture +def enable_telemetry(monkeypatch): + """Opt telemetry in for tests that assert payload behavior.""" + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") + + @pytest.fixture def mock_urlopen(): """Mock urllib.request.urlopen to capture payloads without network.""" @@ -51,26 +58,62 @@ class TestOptOut: def test_datafog_no_telemetry_disables(self, monkeypatch): from datafog.telemetry import _is_telemetry_enabled + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") monkeypatch.setenv("DATAFOG_NO_TELEMETRY", "1") assert _is_telemetry_enabled() is False def test_do_not_track_disables(self, monkeypatch): from datafog.telemetry import _is_telemetry_enabled + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") monkeypatch.setenv("DO_NOT_TRACK", "1") assert _is_telemetry_enabled() is False - def test_enabled_by_default(self): + def test_disabled_by_default(self): + from datafog.telemetry import _is_telemetry_enabled + + assert _is_telemetry_enabled() is False + + def test_datafog_telemetry_enables(self, monkeypatch): from datafog.telemetry import _is_telemetry_enabled + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") assert _is_telemetry_enabled() is True - def test_non_one_value_does_not_disable(self, monkeypatch): + @pytest.mark.parametrize("value", ["true", "yes", "on"]) + def test_truthy_values_enable(self, monkeypatch, value): from datafog.telemetry import _is_telemetry_enabled - monkeypatch.setenv("DATAFOG_NO_TELEMETRY", "true") + monkeypatch.setenv("DATAFOG_TELEMETRY", value) assert _is_telemetry_enabled() is True + def test_falsey_value_does_not_enable(self, monkeypatch): + from datafog.telemetry import _is_telemetry_enabled + + monkeypatch.setenv("DATAFOG_TELEMETRY", "0") + assert _is_telemetry_enabled() is False + + def test_truthy_opt_out_overrides_opt_in(self, monkeypatch): + from datafog.telemetry import _is_telemetry_enabled + + monkeypatch.setenv("DATAFOG_TELEMETRY", "1") + monkeypatch.setenv("DATAFOG_NO_TELEMETRY", "true") + assert _is_telemetry_enabled() is False + + def test_send_event_noop_by_default(self, mock_urlopen): + from datafog.telemetry import _send_event + + _send_event("test_event", {"key": "value"}) + time.sleep(0.1) + mock_urlopen.assert_not_called() + + def test_track_function_call_noop_by_default(self, mock_urlopen): + from datafog.telemetry import track_function_call + + track_function_call("test_fn", "test_module") + time.sleep(0.1) + mock_urlopen.assert_not_called() + def test_send_event_noop_when_disabled(self, monkeypatch, mock_urlopen): from datafog.telemetry import _send_event @@ -166,7 +209,7 @@ def test_anonymous_id_persisted(self, tmp_path, monkeypatch): id2 = tel._get_anonymous_id() assert id1 == id2 - def test_payload_never_contains_text_content(self, mock_urlopen): + def test_payload_never_contains_text_content(self, mock_urlopen, enable_telemetry): """Verify that tracked events don't leak text content.""" from datafog.telemetry import track_function_call @@ -197,7 +240,7 @@ def test_payload_never_contains_text_content(self, mock_urlopen): class TestNonBlocking: - def test_send_event_returns_immediately(self, mock_urlopen): + def test_send_event_returns_immediately(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event # Make urlopen block @@ -210,7 +253,9 @@ def test_send_event_returns_immediately(self, mock_urlopen): # Should return in <100ms even though urlopen blocks for 10s assert elapsed < 0.1 - def test_track_function_call_returns_immediately(self, mock_urlopen): + def test_track_function_call_returns_immediately( + self, mock_urlopen, enable_telemetry + ): from datafog.telemetry import track_function_call mock_urlopen.side_effect = lambda *a, **k: time.sleep(10) @@ -221,7 +266,7 @@ def test_track_function_call_returns_immediately(self, mock_urlopen): assert elapsed < 0.1 - def test_network_failure_is_silent(self, mock_urlopen): + def test_network_failure_is_silent(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_function_call mock_urlopen.side_effect = Exception("Network down") @@ -229,7 +274,7 @@ def test_network_failure_is_silent(self, mock_urlopen): track_function_call("fn", "mod") time.sleep(0.3) - def test_urlopen_timeout_is_bounded(self, mock_urlopen): + def test_urlopen_timeout_is_bounded(self, mock_urlopen, enable_telemetry): """Verify we pass a timeout to urlopen.""" from datafog.telemetry import _send_event @@ -248,7 +293,7 @@ def test_urlopen_timeout_is_bounded(self, mock_urlopen): class TestPayloadCorrectness: - def test_init_event_sent_once(self, mock_urlopen): + def test_init_event_sent_once(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _ensure_initialized _ensure_initialized() @@ -259,7 +304,7 @@ def test_init_event_sent_once(self, mock_urlopen): # Should only create one thread/call for init assert mock_urlopen.call_count <= 1 - def test_init_event_has_required_properties(self, mock_urlopen): + def test_init_event_has_required_properties(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _ensure_initialized _ensure_initialized() @@ -281,7 +326,7 @@ def test_init_event_has_required_properties(self, mock_urlopen): assert "is_ci" in props assert "distinct_id" in props - def test_function_call_event_properties(self, mock_urlopen): + def test_function_call_event_properties(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_function_call track_function_call( @@ -308,7 +353,7 @@ def test_function_call_event_properties(self, mock_urlopen): found = True assert found, "datafog_function_called event not found" - def test_error_event_properties(self, mock_urlopen): + def test_error_event_properties(self, mock_urlopen, enable_telemetry): from datafog.telemetry import track_error track_error("detect", "ValueError", engine="regex") @@ -326,7 +371,7 @@ def test_error_event_properties(self, mock_urlopen): found = True assert found, "datafog_error event not found" - def test_posthog_endpoint_url(self, mock_urlopen): + def test_posthog_endpoint_url(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event _send_event("test_event", {"k": "v"}) @@ -336,7 +381,7 @@ def test_posthog_endpoint_url(self, mock_urlopen): req = mock_urlopen.call_args[0][0] assert req.full_url == "https://us.i.posthog.com/capture/" - def test_content_type_is_json(self, mock_urlopen): + def test_content_type_is_json(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _send_event _send_event("test_event", {"k": "v"}) @@ -353,7 +398,7 @@ def test_content_type_is_json(self, mock_urlopen): class TestIntegration: - def test_detect_triggers_telemetry(self, mock_urlopen): + def test_detect_triggers_telemetry(self, mock_urlopen, enable_telemetry): from datafog import detect with pytest.warns(FutureWarning, match=r"Use datafog\.scan\(\) instead"): @@ -367,7 +412,7 @@ def test_detect_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_process_triggers_telemetry(self, mock_urlopen): + def test_process_triggers_telemetry(self, mock_urlopen, enable_telemetry): from datafog import process with pytest.warns( @@ -384,7 +429,7 @@ def test_process_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_datafog_class_triggers_telemetry(self, mock_urlopen): + def test_datafog_class_triggers_telemetry(self, mock_urlopen, enable_telemetry): from datafog.main import DataFog df = DataFog() @@ -398,7 +443,7 @@ def test_datafog_class_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_text_service_triggers_telemetry(self, mock_urlopen): + def test_text_service_triggers_telemetry(self, mock_urlopen, enable_telemetry): try: from datafog.services.text_service import TextService except ImportError: @@ -415,7 +460,7 @@ def test_text_service_triggers_telemetry(self, mock_urlopen): events.append(body["event"]) assert "datafog_function_called" in events - def test_core_detect_pii_triggers_telemetry(self, mock_urlopen): + def test_core_detect_pii_triggers_telemetry(self, mock_urlopen, enable_telemetry): try: from datafog.core import detect_pii @@ -440,7 +485,7 @@ def test_core_detect_pii_triggers_telemetry(self, mock_urlopen): class TestEdgeCases: - def test_empty_text(self, mock_urlopen): + def test_empty_text(self, mock_urlopen, enable_telemetry): from datafog.telemetry import _get_text_length_bucket, track_function_call track_function_call( @@ -456,7 +501,7 @@ def test_large_text_bucket(self, mock_urlopen): assert _get_text_length_bucket(10_000_000) == "100k+" - def test_concurrent_init(self, mock_urlopen): + def test_concurrent_init(self, mock_urlopen, enable_telemetry): """Multiple threads calling _ensure_initialized should only init once.""" from datafog.telemetry import _ensure_initialized @@ -492,7 +537,7 @@ def fake_home(): anon_id = tel._get_anonymous_id() assert len(anon_id) == 64 - def test_dedup_nested_calls(self, mock_urlopen): + def test_dedup_nested_calls(self, mock_urlopen, enable_telemetry): """Nested track_function_call should only record the outer call.""" from datafog.telemetry import track_function_call @@ -531,7 +576,7 @@ def test_services_init_does_not_require_aiohttp(self): ts = TextService(engine="regex") assert ts.engine == "regex" - def test_track_error_sent_on_exception(self, mock_urlopen): + def test_track_error_sent_on_exception(self, mock_urlopen, enable_telemetry): """track_error should fire a datafog_error event.""" from datafog.telemetry import track_error @@ -550,7 +595,7 @@ def test_track_error_sent_on_exception(self, mock_urlopen): assert error_events[0]["error_type"] == "ValueError" assert error_events[0]["engine"] == "regex" - def test_pipeline_error_triggers_track_error(self, mock_urlopen): + def test_pipeline_error_triggers_track_error(self, mock_urlopen, enable_telemetry): """DataFog.run_text_pipeline_sync should fire datafog_error on failure.""" from datafog.main import DataFog