diff --git a/.bumpversion.cfg b/.bumpversion.cfg index b22c91cb..c39cd405 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.6.0 +current_version = 4.7.0 commit = True tag = True tag_name = v{new_version} diff --git a/CHANGELOG.MD b/CHANGELOG.MD index ca893890..b51971c0 100644 --- a/CHANGELOG.MD +++ b/CHANGELOG.MD @@ -2,6 +2,41 @@ ## [2026-07-02] +### `datafog-python` [4.7.0] + +#### Added + +- **Allowlist support** on `scan()` and `redact()`: `allowlist=[...]` exempts + exact entity texts (your own support address, documentation placeholders); + `allowlist_patterns=[...]` exempts entities whose full text matches a regex + (e.g. `^\d{10}$` so unix timestamps stop matching as phone numbers). + Matching is deliberately strict: case-sensitive, no Unicode normalization, + exact/fullmatch only — a partial match never suppresses a finding. + Threaded through both agent adapters: `DATAFOG_HOOK_ALLOWLIST` / + `DATAFOG_HOOK_ALLOWLIST_PATTERNS` environment variables for the Claude + Code hook, `allowlist` / `allowlist_patterns` parameters for the LiteLLM + guardrail. Patterns are operator configuration — treat them like code and + never accept them from end users; patterns with nested quantifiers are + rejected at configuration time (catastrophic-backtracking guard), pattern + length is capped at 512 characters, and entities longer than 512 + characters skip pattern matching fail-safe (the finding is kept). +- **Presidio-compatible entity aliases**: `EMAIL_ADDRESS` and `US_SSN` are + accepted as input aliases for `EMAIL` and `SSN` (joining the existing + `PHONE_NUMBER` alias), so Presidio configurations migrate without renames. +- **`py.typed` marker**: the package now advertises its inline type + annotations to type checkers (PEP 561). + +#### Changed + +- **LiteLLM guardrail observability**: redaction events are now recorded + with `guardrail_status="guardrail_intervened"` (previously `"success"`), + so compliance dashboards flag redactions as interventions. Guardrail + logging metadata is attached to the request dict actually returned in + redact mode, fixing dropped observability records for requests arriving + without a pre-existing `metadata` key. +- Documentation: corrected the engine entity-type list — the scan API + returns `DATE` and `ZIP_CODE`; `DOB` and `ZIP` are accepted input aliases. + ### `datafog-python` [4.6.0] #### Added diff --git a/README.md b/README.md index 856fafd4..965a5808 100644 --- a/README.md +++ b/README.md @@ -5,17 +5,43 @@ DataFog is a Python library for detecting and redacting personally identifiable It provides: - Fast structured PII detection via regex +- An offline PII firewall for AI agents: a Claude Code hook and a LiteLLM + gateway guardrail (new in 4.6) - Optional NER support via spaCy and GLiNER - A simple agent-oriented API for LLM applications - Backward-compatible `DataFog` and `TextService` classes -## 4.5 Focus +## Agent & Gateway Firewall (4.6) -DataFog 4.5 is focused on lightweight text PII screening: a small core install, -fast regex-based scan/redact helpers, explicit optional extras, and a clearer -path toward future middleware use cases. Dedicated Sentry, OpenTelemetry, -logging-framework, and cloud DLP adapters are future-facing work and are not -part of the 4.5 release. +DataFog 4.6 adds two ready-made enforcement points that catch PII at the +moment it would leave your machine — offline, in microseconds, with matched +values never echoed into logs or transcripts: + +- **Claude Code hook** (`datafog-hook`): gates agent tool calls (shell + commands, web requests, file writes, MCP tools) and warns the model when + prompts or tool results carry PII. ~70ms per invocation including process + startup. Easiest install is the + [Claude Code plugin](https://github.com/DataFog/datafog-claude-plugin): + + ``` + /plugin marketplace add DataFog/datafog-claude-plugin + /plugin install datafog@datafog + ``` + + Manual hook setup and limitations: [examples/claude_code_hook/](examples/claude_code_hook/). + +- **LiteLLM guardrail** (`DataFogGuardrail`): redacts or blocks PII in + requests and responses at the gateway, for any LiteLLM-proxied provider. + In-process (~31µs per request), no sidecar service. Setup: + [examples/litellm_guardrail/](examples/litellm_guardrail/). + +Both default to the high-precision entity set (`EMAIL`, `PHONE`, +`CREDIT_CARD`, `SSN`); noisier types are opt-in. Known-safe values can be +exempted with an allowlist: `scan(text, allowlist=[...])` for exact values, +`allowlist_patterns=[...]` for full-match regexes (e.g. `^\d{10}$` to stop +unix timestamps matching as phone numbers) — available in both adapters and +the API. Presidio-style entity names (`EMAIL_ADDRESS`, `PHONE_NUMBER`, +`US_SSN`) are accepted as aliases for easy migration. ## Installation @@ -42,7 +68,7 @@ pip install datafog[all] Python 3.13 support is certified for the core SDK, CLI, `nlp`, `nlp-advanced`, and `ocr` install profiles. Donut OCR still requires a model that is available locally before runtime use. `distributed` and `all` are not -newly certified on Python 3.13 in the 4.5 line. +newly certified on Python 3.13 in the 4.x line. ## Quick Start @@ -117,7 +143,7 @@ Use the engine that matches your accuracy and dependency constraints: - `regex`: - Fastest and always available. - - Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DATE`, `ZIP_CODE`. + - Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DATE`, `ZIP_CODE` (`DOB` and `ZIP` are accepted as input aliases). - Use `locales=["de"]` for German structured IDs such as `DE_VAT_ID`, `DE_IBAN`, `DE_TAX_ID`, `DE_POSTAL_CODE`, and passport or residence permit numbers. - `spacy`: - Requires `pip install datafog[nlp]`. @@ -131,7 +157,7 @@ Use the engine that matches your accuracy and dependency constraints: ## Optional OCR And Spark Surfaces -DataFog 4.5 keeps the main package story centered on lightweight text PII +The 4.x line keeps the main package story centered on lightweight text PII screening. OCR and Spark remain supported optional surfaces for users who already rely on them, but they are not required for the core import, default scan/redact helpers, or guardrail helpers. @@ -151,7 +177,7 @@ scan/redact helpers, or guardrail helpers. - A Java runtime is required by PySpark. OCR and Spark are not deprecated. Their broader API and packaging overhaul is -deferred; the 4.5 goal is to keep them explicit, documented, and isolated from +deferred; the 4.x goal is to keep them explicit, documented, and isolated from the lightweight core path. ## Backward-Compatible APIs diff --git a/datafog/__about__.py b/datafog/__about__.py index db01fb21..8355eb42 100644 --- a/datafog/__about__.py +++ b/datafog/__about__.py @@ -1 +1 @@ -__version__ = "4.6.0" +__version__ = "4.7.0" diff --git a/datafog/__init__.py b/datafog/__init__.py index 4f7567a8..f1bd8daf 100644 --- a/datafog/__init__.py +++ b/datafog/__init__.py @@ -153,14 +153,28 @@ def scan( engine: str = "regex", entity_types: list[str] | None = None, locales: list[str] | None = None, + allowlist: list[str] | None = None, + allowlist_patterns: list[str] | None = None, ) -> ScanResult: """ v5-preview scan entrypoint. Defaults to the lightweight regex engine so the core install works without optional dependency fallback warnings. + + ``allowlist`` exempts exact entity texts (your own support address, doc + placeholders); ``allowlist_patterns`` exempts entities whose full text + matches a regex (e.g. ``^\\d{10}$`` so unix timestamps stop matching as + phone numbers). """ - return _scan(text=text, engine=engine, entity_types=entity_types, locales=locales) + return _scan( + text=text, + engine=engine, + entity_types=entity_types, + locales=locales, + allowlist=allowlist, + allowlist_patterns=allowlist_patterns, + ) def redact( @@ -171,12 +185,17 @@ def redact( strategy: str = "token", preset: str | None = None, locales: list[str] | None = None, + allowlist: list[str] | None = None, + allowlist_patterns: list[str] | None = None, ) -> RedactResult: """ v5-preview redaction entrypoint. If entities are provided, redact those spans. Otherwise, scan text first - using the selected engine and redact the detected entities. + using the selected engine and redact the detected entities. ``allowlist`` + and ``allowlist_patterns`` exempt findings from redaction (exact text and + full-text regex match respectively); they apply to the scan path and are + rejected when explicit ``entities`` are supplied. """ if preset is not None: try: @@ -186,6 +205,11 @@ def redact( raise ValueError(f"preset must be one of: {allowed}") from exc if entities is not None: + if allowlist or allowlist_patterns: + raise ValueError( + "allowlist/allowlist_patterns cannot be combined with explicit " + "entities; filter the entities before calling redact" + ) return _redact_entities(text=text, entities=entities, strategy=strategy) return _scan_and_redact( @@ -194,6 +218,8 @@ def redact( entity_types=entity_types, strategy=strategy, locales=locales, + allowlist=allowlist, + allowlist_patterns=allowlist_patterns, ) diff --git a/datafog/engine.py b/datafog/engine.py index 250cd6fe..46ed72a8 100644 --- a/datafog/engine.py +++ b/datafog/engine.py @@ -3,6 +3,7 @@ from __future__ import annotations import hashlib +import re import warnings from dataclasses import dataclass from functools import lru_cache @@ -23,6 +24,9 @@ "SOCIAL_SECURITY_NUMBER": "SSN", "CREDIT_CARD_NUMBER": "CREDIT_CARD", "DATE_OF_BIRTH": "DATE", + # Presidio-compatible aliases, so configs migrate without renames. + "EMAIL_ADDRESS": "EMAIL", + "US_SSN": "SSN", } ALL_ENTITY_TYPES = { @@ -277,6 +281,74 @@ def _filter_entity_types( return [entity for entity in entities if entity.type in allowed] +# Python's re module backtracks; a quantified group containing another +# quantifier (e.g. ``(a+)+``) can take exponential time on adversarial +# input, and entity text can be attacker-influenced (LLM messages, tool +# output). Reject that construct outright rather than matching under it. +_NESTED_QUANTIFIER = re.compile( + r"\((?:[^()\\]|\\.)*(? list["re.Pattern[str]"]: + compiled = [] + for raw in allowlist_patterns or []: + if len(raw) > MAX_ALLOWLIST_PATTERN_LENGTH: + raise ValueError( + "allowlist_patterns entries must be at most " + f"{MAX_ALLOWLIST_PATTERN_LENGTH} characters" + ) + if _NESTED_QUANTIFIER.search(raw): + raise ValueError( + "allowlist_patterns contains a quantified group with a nested " + f"quantifier ({raw!r}), which risks catastrophic backtracking; " + "rewrite the pattern without nesting quantifiers" + ) + try: + compiled.append(re.compile(raw)) + except re.error as exc: + raise ValueError( + f"allowlist_patterns contains an invalid regex: {raw!r} ({exc})" + ) from None + return compiled + + +def _apply_allowlist( + entities: list[Entity], + allowlist: Optional[list[str]], + allowlist_patterns: Optional[list[str]], +) -> list[Entity]: + """Drop entities whose exact text is allowlisted. + + Matching semantics, deliberately strict for a security boundary: + exact values are case-sensitive with no Unicode normalization, and + patterns must fullmatch the entity text, so a partial match never + suppresses a finding. Allowlist entries and patterns are operator + configuration; treat them like code and never accept them from end + users. + """ + if not allowlist and not allowlist_patterns: + return entities + exact = set(allowlist or []) + patterns = _compile_allowlist_patterns(allowlist_patterns) + return [ + entity + for entity in entities + if entity.text not in exact + and not any( + pattern.fullmatch(entity.text) + for pattern in patterns + if len(entity.text) <= MAX_PATTERN_SUBJECT_LENGTH + ) + ] + + def _needs_ner(entity_types: Optional[list[str]]) -> bool: if entity_types is None: return True @@ -289,14 +361,25 @@ def scan( engine: str = "smart", entity_types: Optional[list[str]] = None, locales: Optional[list[str]] = None, + allowlist: Optional[list[str]] = None, + allowlist_patterns: Optional[list[str]] = None, ) -> ScanResult: - """Scan text for PII entities.""" + """Scan text for PII entities. + + ``allowlist`` exempts exact entity texts (e.g. your own support email); + ``allowlist_patterns`` exempts entities whose full text matches a regex + (e.g. ``^\\d{10}$`` to stop unix timestamps matching as phone numbers). + """ if not isinstance(text, str): raise TypeError("text must be a string") if engine not in {"regex", "spacy", "gliner", "smart"}: raise ValueError("engine must be one of: regex, spacy, gliner, smart") + # Validate patterns up front so config errors fail fast even when the + # text contains no entities. + _compile_allowlist_patterns(allowlist_patterns) + regex_entities = _regex_entities( text, entity_types=entity_types, @@ -305,6 +388,7 @@ def scan( if engine == "regex": filtered = _filter_entity_types(regex_entities, entity_types) + filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns) return ScanResult( entities=_dedupe_entities(filtered), text=text, engine_used="regex" ) @@ -367,6 +451,7 @@ def scan( ) filtered = _filter_entity_types(combined, entity_types) + filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns) deduped = _dedupe_entities(filtered) return ScanResult( entities=deduped, @@ -437,6 +522,8 @@ def scan_and_redact( entity_types: Optional[list[str]] = None, strategy: str = "token", locales: Optional[list[str]] = None, + allowlist: Optional[list[str]] = None, + allowlist_patterns: Optional[list[str]] = None, ) -> RedactResult: """Convenience wrapper: scan then redact.""" scan_result = scan( @@ -444,5 +531,7 @@ def scan_and_redact( engine=engine, entity_types=entity_types, locales=locales, + allowlist=allowlist, + allowlist_patterns=allowlist_patterns, ) return redact(text=text, entities=scan_result.entities, strategy=strategy) diff --git a/datafog/integrations/claude_code.py b/datafog/integrations/claude_code.py index dfbfb351..28c4e787 100644 --- a/datafog/integrations/claude_code.py +++ b/datafog/integrations/claude_code.py @@ -16,6 +16,11 @@ - ``DATAFOG_HOOK_ENTITIES``: comma-separated entity types to detect. Defaults to the high-precision set; noisy-in-code types (IP_ADDRESS, DOB, ZIP) must be opted into. +- ``DATAFOG_HOOK_ALLOWLIST``: comma-separated exact values to exempt + (your own support address, documentation placeholders). +- ``DATAFOG_HOOK_ALLOWLIST_PATTERNS``: comma-separated regexes; findings + whose full text matches are exempt (note: a pattern containing a comma + cannot be expressed here). Failure policy: fail open. A hook bug must never brick a Claude Code session, so any unexpected error exits non-blocking with no output. @@ -59,6 +64,11 @@ def _action(env: Mapping[str, str]) -> str: return action if action in VALID_ACTIONS else "ask" +def _csv_env(env: Mapping[str, str], name: str) -> list[str]: + raw = env.get(name, "") + return [item.strip() for item in raw.split(",") if item.strip()] + + def _iter_strings(value: Any) -> Iterator[str]: """Yield every string embedded in a JSON-like structure. @@ -76,7 +86,12 @@ def _iter_strings(value: Any) -> Iterator[str]: stack.extend(current) -def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]: +def _scan_findings( + value: Any, + entity_types: list[str], + allowlist: list[str] | None = None, + allowlist_patterns: list[str] | None = None, +) -> dict[str, int]: """Scan all strings in ``value``; return counts per entity type.""" import datafog @@ -87,7 +102,13 @@ def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]: break chunk = text[: min(MAX_SCAN_CHARS, total_budget)] total_budget -= len(chunk) - result = datafog.scan(chunk, engine="regex", entity_types=entity_types) + result = datafog.scan( + chunk, + engine="regex", + entity_types=entity_types, + allowlist=allowlist or None, + allowlist_patterns=allowlist_patterns or None, + ) for entity in result.entities: counts[entity.type] = counts.get(entity.type, 0) + 1 return counts @@ -104,7 +125,12 @@ def _emit(event: str, fields: dict[str, Any]) -> str: def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str: - counts = _scan_findings(payload.get("tool_input"), _entity_types(env)) + counts = _scan_findings( + payload.get("tool_input"), + _entity_types(env), + allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"), + allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"), + ) if not counts: return "" tool = payload.get("tool_name", "tool") @@ -119,7 +145,12 @@ def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str: def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str: - counts = _scan_findings(payload.get("prompt"), _entity_types(env)) + counts = _scan_findings( + payload.get("prompt"), + _entity_types(env), + allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"), + allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"), + ) if not counts: return "" context = ( @@ -130,7 +161,12 @@ def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str: def _handle_post_tool_use(payload: dict, env: Mapping[str, str]) -> str: - counts = _scan_findings(payload.get("tool_response"), _entity_types(env)) + counts = _scan_findings( + payload.get("tool_response"), + _entity_types(env), + allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"), + allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"), + ) if not counts: return "" tool = payload.get("tool_name", "tool") diff --git a/datafog/integrations/litellm_guardrail.py b/datafog/integrations/litellm_guardrail.py index 756755ca..7e33ed55 100644 --- a/datafog/integrations/litellm_guardrail.py +++ b/datafog/integrations/litellm_guardrail.py @@ -46,11 +46,22 @@ logger = logging.getLogger(__name__) -def _redact_text(text: str, entity_types: list[str]) -> tuple[str, dict[str, int]]: +def _redact_text( + text: str, + entity_types: list[str], + allowlist: list[str] | None = None, + allowlist_patterns: list[str] | None = None, +) -> tuple[str, dict[str, int]]: """Redact ``text``; return (redacted_text, counts per entity type).""" import datafog - result = datafog.redact(text, engine="regex", entity_types=entity_types) + result = datafog.redact( + text, + engine="regex", + entity_types=entity_types, + allowlist=allowlist, + allowlist_patterns=allowlist_patterns, + ) counts: dict[str, int] = {} for entity in result.entities: counts[entity.type] = counts.get(entity.type, 0) + 1 @@ -69,6 +80,8 @@ def __init__( action: str = "redact", entity_types: Optional[list[str]] = None, fail_policy: str = "open", + allowlist: Optional[list[str]] = None, + allowlist_patterns: Optional[list[str]] = None, **kwargs: Any, ) -> None: if action not in VALID_ACTIONS: @@ -80,13 +93,17 @@ def __init__( self.action = action self.entity_types = entity_types or DEFAULT_ENTITY_TYPES self.fail_policy = fail_policy + self.allowlist = allowlist + self.allowlist_patterns = allowlist_patterns super().__init__(**kwargs) def _process_content(self, content: Any) -> tuple[Any, dict[str, int]]: """Redact a message content value (str or list of content parts).""" counts: dict[str, int] = {} if isinstance(content, str): - redacted, counts = _redact_text(content, self.entity_types) + redacted, counts = _redact_text( + content, self.entity_types, self.allowlist, self.allowlist_patterns + ) return redacted, counts if isinstance(content, list): new_parts = [] @@ -94,7 +111,10 @@ def _process_content(self, content: Any) -> tuple[Any, dict[str, int]]: for part in content: if isinstance(part, dict) and isinstance(part.get("text"), str): redacted, part_counts = _redact_text( - part["text"], self.entity_types + part["text"], + self.entity_types, + self.allowlist, + self.allowlist_patterns, ) new_parts.append({**part, "text": redacted}) for etype, n in part_counts.items(): @@ -160,9 +180,8 @@ async def async_pre_call_hook( if not total_counts: return data - self._record_guardrail_logging(data, total_counts) - if self.action == "block": + self._record_guardrail_logging(data, total_counts) # HTTPException(400) is one of the exception types litellm's # _is_guardrail_intervention recognizes, so the block is # classified as a policy intervention (not a backend failure) @@ -178,7 +197,9 @@ async def async_pre_call_hook( }, ) - return {**data, "messages": new_messages} + new_data = {**data, "messages": new_messages} + self._record_guardrail_logging(new_data, total_counts) + return new_data def _record_guardrail_logging( self, data: dict, total_counts: dict[str, int] @@ -188,9 +209,7 @@ def _record_guardrail_logging( self.add_standard_logging_guardrail_information_to_request_data( guardrail_json_response=_summary(total_counts), request_data=data, - guardrail_status=( - "guardrail_intervened" if self.action == "block" else "success" - ), + guardrail_status="guardrail_intervened", masked_entity_count=dict(total_counts), ) except Exception: # noqa: BLE001 — observability must never break traffic @@ -217,7 +236,12 @@ async def async_post_call_success_hook( for choice in choices: message = getattr(choice, "message", None) if message is not None and isinstance(message.content, str): - redacted, counts = _redact_text(message.content, self.entity_types) + redacted, counts = _redact_text( + message.content, + self.entity_types, + self.allowlist, + self.allowlist_patterns, + ) if counts: message.content = redacted elif message is not None and message.content is not None: diff --git a/datafog/py.typed b/datafog/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/setup.py b/setup.py index abffa1e7..078a91bf 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,7 @@ long_description=long_description, long_description_content_type="text/markdown", packages=find_packages(exclude=["tests", "tests.*"]), + package_data={"datafog": ["py.typed"]}, install_requires=core_deps, extras_require=extras_require, python_requires=">=3.10,<3.14", diff --git a/tests/test_allowlist.py b/tests/test_allowlist.py new file mode 100644 index 00000000..1f3bef6e --- /dev/null +++ b/tests/test_allowlist.py @@ -0,0 +1,166 @@ +"""Tests for scan/redact allowlist support and presidio-style entity aliases. + +PII literals are assembled from split parts so write-time scanners +(including our own Claude Code hook) do not match this source file. +""" + +import pytest + +import datafog + +EMAIL = "jane.doe@" "example.com" +OTHER_EMAIL = "sid@" "example.com" +TIMESTAMP_LIKE = "17830" "25668" # ten digits: matches the PHONE pattern + + +class TestExactAllowlist: + def test_allowlisted_value_is_not_reported(self): + result = datafog.scan( + f"mail {EMAIL} and {OTHER_EMAIL}", + engine="regex", + allowlist=[OTHER_EMAIL], + ) + assert [e.text for e in result.entities] == [EMAIL] + + def test_allowlist_is_exact_not_substring(self): + result = datafog.scan(f"mail {EMAIL}", engine="regex", allowlist=["jane.doe"]) + assert [e.text for e in result.entities] == [EMAIL] + + def test_empty_allowlist_is_noop(self): + result = datafog.scan(f"mail {EMAIL}", engine="regex", allowlist=[]) + assert len(result.entities) == 1 + + def test_redact_respects_allowlist(self): + result = datafog.redact( + f"mail {EMAIL} and {OTHER_EMAIL}", + engine="regex", + allowlist=[OTHER_EMAIL], + ) + assert OTHER_EMAIL in result.redacted_text + assert EMAIL not in result.redacted_text + + +class TestPatternAllowlist: + def test_pattern_suppresses_matching_entities(self): + # The motivating case: unix timestamps and numeric IDs match the + # PHONE pattern; a pattern allowlist can exempt all-digit strings. + noisy = datafog.scan(f"created {TIMESTAMP_LIKE}", engine="regex") + assert len(noisy.entities) == 1 # sanity: it is detected by default + + result = datafog.scan( + f"created {TIMESTAMP_LIKE}", + engine="regex", + allowlist_patterns=[r"^\d{10}$"], + ) + assert result.entities == [] + + def test_pattern_matches_full_entity_text_only(self): + result = datafog.scan( + f"mail {EMAIL}", engine="regex", allowlist_patterns=[r"^jane\."] + ) + assert len(result.entities) == 1 # partial match must not suppress + + def test_invalid_pattern_raises_value_error(self): + with pytest.raises(ValueError, match="allowlist_patterns"): + datafog.scan("text", engine="regex", allowlist_patterns=["("]) + + def test_patterns_and_values_combine(self): + result = datafog.scan( + f"{EMAIL} then {TIMESTAMP_LIKE}", + engine="regex", + allowlist=[EMAIL], + allowlist_patterns=[r"^\d{10}$"], + ) + assert result.entities == [] + + +class TestReDoSGuards: + def test_catastrophic_pattern_rejected(self): + with pytest.raises(ValueError, match="catastrophic backtracking"): + datafog.scan("text", engine="regex", allowlist_patterns=[r"(a+)+$"]) + + def test_nested_star_rejected(self): + with pytest.raises(ValueError, match="catastrophic backtracking"): + datafog.scan("text", engine="regex", allowlist_patterns=[r"(.*)*"]) + + def test_overlong_pattern_rejected(self): + with pytest.raises(ValueError, match="at most"): + datafog.scan("text", engine="regex", allowlist_patterns=["a" * 513]) + + def test_overlong_entity_text_skips_patterns_but_is_kept(self): + # Fail-safe: an entity too long to pattern-match safely must still + # be reported, never silently suppressed. + from datafog.engine import Entity, _apply_allowlist + + long_entity = Entity( + type="EMAIL", + text="a" * 600, + start=0, + end=600, + confidence=1.0, + engine="regex", + ) + kept = _apply_allowlist([long_entity], None, [r".*"]) + assert kept == [long_entity] + + def test_overlong_entity_text_still_matches_exact_allowlist(self): + # The subject-length cap only bounds regex matching; exact string + # comparison is O(n) and still applies. + from datafog.engine import Entity, _apply_allowlist + + long_entity = Entity( + type="EMAIL", + text="a" * 600, + start=0, + end=600, + confidence=1.0, + engine="regex", + ) + assert _apply_allowlist([long_entity], ["a" * 600], None) == [] + + def test_benign_quantified_group_still_allowed(self): + result = datafog.scan( + f"mail {EMAIL}", + engine="regex", + allowlist_patterns=[r"(abc)+", r".*@example\.com"], + ) + assert result.entities == [] # broad pattern suppresses, no rejection + + +class TestEnginePaths: + def test_smart_engine_applies_allowlist(self): + import warnings as _warnings + + with _warnings.catch_warnings(): + _warnings.simplefilter("ignore") + result = datafog.scan(f"mail {EMAIL}", engine="smart", allowlist=[EMAIL]) + assert result.entities == [] + + def test_redact_rejects_allowlist_with_explicit_entities(self): + scanned = datafog.scan(f"mail {EMAIL}", engine="regex") + with pytest.raises(ValueError, match="cannot be combined"): + datafog.redact( + f"mail {EMAIL}", + entities=scanned.entities, + allowlist=[EMAIL], + ) + + +class TestPresidioAliases: + def test_email_address_alias(self): + result = datafog.scan( + f"mail {EMAIL}", engine="regex", entity_types=["EMAIL_ADDRESS"] + ) + assert [e.type for e in result.entities] == ["EMAIL"] + + def test_us_ssn_alias(self): + ssn = "856-45-" "6789" + result = datafog.scan(f"ssn {ssn}", engine="regex", entity_types=["US_SSN"]) + assert [e.type for e in result.entities] == ["SSN"] + + +class TestPyTyped: + def test_py_typed_marker_ships_with_package(self): + import importlib.resources + + assert importlib.resources.files("datafog").joinpath("py.typed").is_file() diff --git a/tests/test_claude_code_hook.py b/tests/test_claude_code_hook.py index c0620540..99fbae64 100644 --- a/tests/test_claude_code_hook.py +++ b/tests/test_claude_code_hook.py @@ -72,6 +72,36 @@ def test_entity_filter_env_enables_ip(self): _, stdout = run(payload, env={"DATAFOG_HOOK_ENTITIES": "IP_ADDRESS"}) assert "IP_ADDRESS" in _decision(stdout)["permissionDecisionReason"] + def test_allowlist_env_exempts_exact_value(self): + own_email = "sid@" "example.com" + payload = _pre_tool_use("Bash", {"command": f"echo {own_email}"}) + code, stdout = run(payload, env={"DATAFOG_HOOK_ALLOWLIST": own_email}) + assert code == 0 + assert stdout == "" + + def test_allowlist_pattern_env_exempts_timestamps(self): + # Ten-digit numeric IDs and unix timestamps match the PHONE pattern; + # the pattern allowlist silences that class of false positive. + payload = _pre_tool_use("Bash", {"command": "echo created 17830" "25668"}) + code, stdout = run( + payload, env={"DATAFOG_HOOK_ALLOWLIST_PATTERNS": r"^\d{10}$"} + ) + assert code == 0 + assert stdout == "" + + def test_allowlist_does_not_exempt_other_values(self): + own_email = "sid@" "example.com" + other = "jane.doe@" "example.com" + payload = _pre_tool_use("Bash", {"command": f"echo {other}"}) + _, stdout = run(payload, env={"DATAFOG_HOOK_ALLOWLIST": own_email}) + assert "EMAIL" in _decision(stdout)["permissionDecisionReason"] + + def test_invalid_allowlist_pattern_fails_open(self): + payload = _pre_tool_use("Bash", {"command": "echo jane.doe@" "example.com"}) + code, stdout = run(payload, env={"DATAFOG_HOOK_ALLOWLIST_PATTERNS": "("}) + assert code != 2 # fail-open, never blocking + assert stdout == "" + class TestUserPromptSubmit: def test_pii_in_prompt_adds_context_warning(self): diff --git a/tests/test_litellm_guardrail.py b/tests/test_litellm_guardrail.py index c8d844f2..1d7656a4 100644 --- a/tests/test_litellm_guardrail.py +++ b/tests/test_litellm_guardrail.py @@ -262,6 +262,20 @@ async def test_fail_closed_raises_on_engine_error(self, monkeypatch): call_type="completion", ) + async def test_allowlist_exempts_configured_values(self): + own = "sid@" "example.com" + other = "jane.doe@" "example.com" + guardrail = DataFogGuardrail(guardrail_name="datafog-pii", allowlist=[own]) + data = await guardrail.async_pre_call_hook( + user_api_key_dict=None, + cache=None, + data=_chat_data(f"contact {own} or {other}"), + call_type="completion", + ) + content = data["messages"][0]["content"] + assert own in content + assert other not in content + async def test_invalid_config_rejected(self): with pytest.raises(ValueError): DataFogGuardrail(guardrail_name="datafog-pii", action="explode")