From 8ad51fe5468b086ec84b85abd8e92da159cfe3d5 Mon Sep 17 00:00:00 2001
From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:21:05 -0700
Subject: [PATCH 1/3] feat: allowlist support, presidio entity aliases,
 py.typed

Adds allowlist (exact values) and allowlist_patterns (full-match
regexes) to scan/redact and threads them through both agent adapters:
DATAFOG_HOOK_ALLOWLIST / DATAFOG_HOOK_ALLOWLIST_PATTERNS env vars for
the Claude Code hook, allowlist/allowlist_patterns params for the
LiteLLM guardrail. Motivated by a day of dogfooding: unix timestamps
and numeric IDs match the PHONE pattern, and intentional identifiers
(own support email, doc placeholders) should be exemptable.

Accepts Presidio-style entity names (EMAIL_ADDRESS, US_SSN) as input
aliases via the existing canonical type map. Ships a py.typed marker
so downstream type checkers see our annotations. Backports the
upstream-review fixes to the in-repo litellm adapter: guardrail spans
are recorded on the returned dict, and redaction is reported as an
intervention.

Also corrects an entity-name documentation error introduced in #156:
the scan API returns DATE and ZIP_CODE (DOB/ZIP are input aliases).
---
 README.md                                 | 10 ++-
 datafog/__init__.py                       | 30 ++++++-
 datafog/engine.py                         | 59 +++++++++++++-
 datafog/integrations/claude_code.py       | 46 +++++++++--
 datafog/integrations/litellm_guardrail.py | 47 ++++++++---
 datafog/py.typed                          |  0
 setup.py                                  |  1 +
 tests/test_allowlist.py                   | 98 +++++++++++++++++++++++
 tests/test_claude_code_hook.py            | 30 +++++++
 tests/test_litellm_guardrail.py           | 14 ++++
 10 files changed, 314 insertions(+), 21 deletions(-)
 create mode 100644 datafog/py.typed
 create mode 100644 tests/test_allowlist.py

diff --git a/README.md b/README.md
index aeff5264..965a5808 100644
--- a/README.md
+++ b/README.md
@@ -29,13 +29,19 @@ values never echoed into logs or transcripts:
   ```
 
   Manual hook setup and limitations: [examples/claude_code_hook/](examples/claude_code_hook/).
+
 - **LiteLLM guardrail** (`DataFogGuardrail`): redacts or blocks PII in
   requests and responses at the gateway, for any LiteLLM-proxied provider.
   In-process (~31µs per request), no sidecar service. Setup:
   [examples/litellm_guardrail/](examples/litellm_guardrail/).
 
 Both default to the high-precision entity set (`EMAIL`, `PHONE`,
-`CREDIT_CARD`, `SSN`); noisier types are opt-in.
+`CREDIT_CARD`, `SSN`); noisier types are opt-in. Known-safe values can be
+exempted with an allowlist: `scan(text, allowlist=[...])` for exact values,
+`allowlist_patterns=[...]` for full-match regexes (e.g. `^\d{10}$` to stop
+unix timestamps matching as phone numbers) — available in both adapters and
+the API. Presidio-style entity names (`EMAIL_ADDRESS`, `PHONE_NUMBER`,
+`US_SSN`) are accepted as aliases for easy migration.
 
 ## Installation
 
@@ -137,7 +143,7 @@ Use the engine that matches your accuracy and dependency constraints:
 
 - `regex`:
   - Fastest and always available.
-  - Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DOB`, `ZIP`.
+  - Best for default structured entities: `EMAIL`, `PHONE`, `SSN`, `CREDIT_CARD`, `IP_ADDRESS`, `DATE`, `ZIP_CODE` (`DOB` and `ZIP` are accepted as input aliases).
   - Use `locales=["de"]` for German structured IDs such as `DE_VAT_ID`, `DE_IBAN`, `DE_TAX_ID`, `DE_POSTAL_CODE`, and passport or residence permit numbers.
 - `spacy`:
   - Requires `pip install datafog[nlp]`.
diff --git a/datafog/__init__.py b/datafog/__init__.py
index 4f7567a8..f1bd8daf 100644
--- a/datafog/__init__.py
+++ b/datafog/__init__.py
@@ -153,14 +153,28 @@ def scan(
     engine: str = "regex",
     entity_types: list[str] | None = None,
     locales: list[str] | None = None,
+    allowlist: list[str] | None = None,
+    allowlist_patterns: list[str] | None = None,
 ) -> ScanResult:
     """
     v5-preview scan entrypoint.
 
     Defaults to the lightweight regex engine so the core install works without
     optional dependency fallback warnings.
+
+    ``allowlist`` exempts exact entity texts (your own support address, doc
+    placeholders); ``allowlist_patterns`` exempts entities whose full text
+    matches a regex (e.g. ``^\\d{10}$`` so unix timestamps stop matching as
+    phone numbers).
     """
-    return _scan(text=text, engine=engine, entity_types=entity_types, locales=locales)
+    return _scan(
+        text=text,
+        engine=engine,
+        entity_types=entity_types,
+        locales=locales,
+        allowlist=allowlist,
+        allowlist_patterns=allowlist_patterns,
+    )
 
 
 def redact(
@@ -171,12 +185,17 @@ def redact(
     strategy: str = "token",
     preset: str | None = None,
     locales: list[str] | None = None,
+    allowlist: list[str] | None = None,
+    allowlist_patterns: list[str] | None = None,
 ) -> RedactResult:
     """
     v5-preview redaction entrypoint.
 
     If entities are provided, redact those spans. Otherwise, scan text first
-    using the selected engine and redact the detected entities.
+    using the selected engine and redact the detected entities. ``allowlist``
+    and ``allowlist_patterns`` exempt findings from redaction (exact text and
+    full-text regex match respectively); they apply to the scan path and are
+    rejected when explicit ``entities`` are supplied.
     """
     if preset is not None:
         try:
@@ -186,6 +205,11 @@ def redact(
             raise ValueError(f"preset must be one of: {allowed}") from exc
 
     if entities is not None:
+        if allowlist or allowlist_patterns:
+            raise ValueError(
+                "allowlist/allowlist_patterns cannot be combined with explicit "
+                "entities; filter the entities before calling redact"
+            )
         return _redact_entities(text=text, entities=entities, strategy=strategy)
 
     return _scan_and_redact(
@@ -194,6 +218,8 @@ def redact(
         entity_types=entity_types,
         strategy=strategy,
         locales=locales,
+        allowlist=allowlist,
+        allowlist_patterns=allowlist_patterns,
     )
 
 
diff --git a/datafog/engine.py b/datafog/engine.py
index 250cd6fe..43621257 100644
--- a/datafog/engine.py
+++ b/datafog/engine.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import hashlib
+import re
 import warnings
 from dataclasses import dataclass
 from functools import lru_cache
@@ -23,6 +24,9 @@
     "SOCIAL_SECURITY_NUMBER": "SSN",
     "CREDIT_CARD_NUMBER": "CREDIT_CARD",
     "DATE_OF_BIRTH": "DATE",
+    # Presidio-compatible aliases, so configs migrate without renames.
+    "EMAIL_ADDRESS": "EMAIL",
+    "US_SSN": "SSN",
 }
 
 ALL_ENTITY_TYPES = {
@@ -277,6 +281,42 @@ def _filter_entity_types(
     return [entity for entity in entities if entity.type in allowed]
 
 
+def _compile_allowlist_patterns(
+    allowlist_patterns: Optional[list[str]],
+) -> list["re.Pattern[str]"]:
+    compiled = []
+    for raw in allowlist_patterns or []:
+        try:
+            compiled.append(re.compile(raw))
+        except re.error as exc:
+            raise ValueError(
+                f"allowlist_patterns contains an invalid regex: {raw!r} ({exc})"
+            ) from None
+    return compiled
+
+
+def _apply_allowlist(
+    entities: list[Entity],
+    allowlist: Optional[list[str]],
+    allowlist_patterns: Optional[list[str]],
+) -> list[Entity]:
+    """Drop entities whose exact text is allowlisted.
+
+    Exact values match the full entity text; patterns must fullmatch it,
+    so a partial match never suppresses a finding.
+    """
+    if not allowlist and not allowlist_patterns:
+        return entities
+    exact = set(allowlist or [])
+    patterns = _compile_allowlist_patterns(allowlist_patterns)
+    return [
+        entity
+        for entity in entities
+        if entity.text not in exact
+        and not any(pattern.fullmatch(entity.text) for pattern in patterns)
+    ]
+
+
 def _needs_ner(entity_types: Optional[list[str]]) -> bool:
     if entity_types is None:
         return True
@@ -289,14 +329,25 @@ def scan(
     engine: str = "smart",
     entity_types: Optional[list[str]] = None,
     locales: Optional[list[str]] = None,
+    allowlist: Optional[list[str]] = None,
+    allowlist_patterns: Optional[list[str]] = None,
 ) -> ScanResult:
-    """Scan text for PII entities."""
+    """Scan text for PII entities.
+
+    ``allowlist`` exempts exact entity texts (e.g. your own support email);
+    ``allowlist_patterns`` exempts entities whose full text matches a regex
+    (e.g. ``^\\d{10}$`` to stop unix timestamps matching as phone numbers).
+    """
     if not isinstance(text, str):
         raise TypeError("text must be a string")
 
     if engine not in {"regex", "spacy", "gliner", "smart"}:
         raise ValueError("engine must be one of: regex, spacy, gliner, smart")
 
+    # Validate patterns up front so config errors fail fast even when the
+    # text contains no entities.
+    _compile_allowlist_patterns(allowlist_patterns)
+
     regex_entities = _regex_entities(
         text,
         entity_types=entity_types,
@@ -305,6 +356,7 @@ def scan(
 
     if engine == "regex":
         filtered = _filter_entity_types(regex_entities, entity_types)
+        filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns)
         return ScanResult(
             entities=_dedupe_entities(filtered), text=text, engine_used="regex"
         )
@@ -367,6 +419,7 @@ def scan(
                 )
 
     filtered = _filter_entity_types(combined, entity_types)
+    filtered = _apply_allowlist(filtered, allowlist, allowlist_patterns)
     deduped = _dedupe_entities(filtered)
     return ScanResult(
         entities=deduped,
@@ -437,6 +490,8 @@ def scan_and_redact(
     entity_types: Optional[list[str]] = None,
     strategy: str = "token",
     locales: Optional[list[str]] = None,
+    allowlist: Optional[list[str]] = None,
+    allowlist_patterns: Optional[list[str]] = None,
 ) -> RedactResult:
     """Convenience wrapper: scan then redact."""
     scan_result = scan(
@@ -444,5 +499,7 @@ def scan_and_redact(
         engine=engine,
         entity_types=entity_types,
         locales=locales,
+        allowlist=allowlist,
+        allowlist_patterns=allowlist_patterns,
     )
     return redact(text=text, entities=scan_result.entities, strategy=strategy)
diff --git a/datafog/integrations/claude_code.py b/datafog/integrations/claude_code.py
index dfbfb351..28c4e787 100644
--- a/datafog/integrations/claude_code.py
+++ b/datafog/integrations/claude_code.py
@@ -16,6 +16,11 @@
 - ``DATAFOG_HOOK_ENTITIES``: comma-separated entity types to detect.
   Defaults to the high-precision set; noisy-in-code types (IP_ADDRESS,
   DOB, ZIP) must be opted into.
+- ``DATAFOG_HOOK_ALLOWLIST``: comma-separated exact values to exempt
+  (your own support address, documentation placeholders).
+- ``DATAFOG_HOOK_ALLOWLIST_PATTERNS``: comma-separated regexes; findings
+  whose full text matches are exempt (note: a pattern containing a comma
+  cannot be expressed here).
 
 Failure policy: fail open. A hook bug must never brick a Claude Code
 session, so any unexpected error exits non-blocking with no output.
@@ -59,6 +64,11 @@ def _action(env: Mapping[str, str]) -> str:
     return action if action in VALID_ACTIONS else "ask"
 
 
+def _csv_env(env: Mapping[str, str], name: str) -> list[str]:
+    raw = env.get(name, "")
+    return [item.strip() for item in raw.split(",") if item.strip()]
+
+
 def _iter_strings(value: Any) -> Iterator[str]:
     """Yield every string embedded in a JSON-like structure.
 
@@ -76,7 +86,12 @@ def _iter_strings(value: Any) -> Iterator[str]:
             stack.extend(current)
 
 
-def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]:
+def _scan_findings(
+    value: Any,
+    entity_types: list[str],
+    allowlist: list[str] | None = None,
+    allowlist_patterns: list[str] | None = None,
+) -> dict[str, int]:
     """Scan all strings in ``value``; return counts per entity type."""
     import datafog
 
@@ -87,7 +102,13 @@ def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]:
             break
         chunk = text[: min(MAX_SCAN_CHARS, total_budget)]
         total_budget -= len(chunk)
-        result = datafog.scan(chunk, engine="regex", entity_types=entity_types)
+        result = datafog.scan(
+            chunk,
+            engine="regex",
+            entity_types=entity_types,
+            allowlist=allowlist or None,
+            allowlist_patterns=allowlist_patterns or None,
+        )
         for entity in result.entities:
             counts[entity.type] = counts.get(entity.type, 0) + 1
     return counts
@@ -104,7 +125,12 @@ def _emit(event: str, fields: dict[str, Any]) -> str:
 
 
 def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str:
-    counts = _scan_findings(payload.get("tool_input"), _entity_types(env))
+    counts = _scan_findings(
+        payload.get("tool_input"),
+        _entity_types(env),
+        allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
+        allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
+    )
     if not counts:
         return ""
     tool = payload.get("tool_name", "tool")
@@ -119,7 +145,12 @@ def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str:
 
 
 def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str:
-    counts = _scan_findings(payload.get("prompt"), _entity_types(env))
+    counts = _scan_findings(
+        payload.get("prompt"),
+        _entity_types(env),
+        allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
+        allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
+    )
     if not counts:
         return ""
     context = (
@@ -130,7 +161,12 @@ def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str:
 
 
 def _handle_post_tool_use(payload: dict, env: Mapping[str, str]) -> str:
-    counts = _scan_findings(payload.get("tool_response"), _entity_types(env))
+    counts = _scan_findings(
+        payload.get("tool_response"),
+        _entity_types(env),
+        allowlist=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST"),
+        allowlist_patterns=_csv_env(env, "DATAFOG_HOOK_ALLOWLIST_PATTERNS"),
+    )
     if not counts:
         return ""
     tool = payload.get("tool_name", "tool")
diff --git a/datafog/integrations/litellm_guardrail.py b/datafog/integrations/litellm_guardrail.py
index 756755ca..eb717d97 100644
--- a/datafog/integrations/litellm_guardrail.py
+++ b/datafog/integrations/litellm_guardrail.py
@@ -46,11 +46,22 @@
 logger = logging.getLogger(__name__)
 
 
-def _redact_text(text: str, entity_types: list[str]) -> tuple[str, dict[str, int]]:
+def _redact_text(
+    text: str,
+    entity_types: list[str],
+    allowlist: list[str] | None = None,
+    allowlist_patterns: list[str] | None = None,
+) -> tuple[str, dict[str, int]]:
     """Redact ``text``; return (redacted_text, counts per entity type)."""
     import datafog
 
-    result = datafog.redact(text, engine="regex", entity_types=entity_types)
+    result = datafog.redact(
+        text,
+        engine="regex",
+        entity_types=entity_types,
+        allowlist=allowlist,
+        allowlist_patterns=allowlist_patterns,
+    )
     counts: dict[str, int] = {}
     for entity in result.entities:
         counts[entity.type] = counts.get(entity.type, 0) + 1
@@ -69,6 +80,8 @@ def __init__(
         action: str = "redact",
         entity_types: Optional[list[str]] = None,
         fail_policy: str = "open",
+        allowlist: Optional[list[str]] = None,
+        allowlist_patterns: Optional[list[str]] = None,
         **kwargs: Any,
     ) -> None:
         if action not in VALID_ACTIONS:
@@ -80,13 +93,17 @@ def __init__(
         self.action = action
         self.entity_types = entity_types or DEFAULT_ENTITY_TYPES
         self.fail_policy = fail_policy
+        self.allowlist = allowlist
+        self.allowlist_patterns = allowlist_patterns
         super().__init__(**kwargs)
 
     def _process_content(self, content: Any) -> tuple[Any, dict[str, int]]:
         """Redact a message content value (str or list of content parts)."""
         counts: dict[str, int] = {}
         if isinstance(content, str):
-            redacted, counts = _redact_text(content, self.entity_types)
+            redacted, counts = _redact_text(
+                content, self.entity_types, self.allowlist, self.allowlist_patterns
+            )
             return redacted, counts
         if isinstance(content, list):
             new_parts = []
@@ -94,7 +111,10 @@ def _process_content(self, content: Any) -> tuple[Any, dict[str, int]]:
             for part in content:
                 if isinstance(part, dict) and isinstance(part.get("text"), str):
                     redacted, part_counts = _redact_text(
-                        part["text"], self.entity_types
+                        part["text"],
+                        self.entity_types,
+                        self.allowlist,
+                        self.allowlist_patterns,
                     )
                     new_parts.append({**part, "text": redacted})
                     for etype, n in part_counts.items():
@@ -160,9 +180,8 @@ async def async_pre_call_hook(
         if not total_counts:
             return data
 
-        self._record_guardrail_logging(data, total_counts)
-
         if self.action == "block":
+            self._record_guardrail_logging(data, total_counts)
             # HTTPException(400) is one of the exception types litellm's
             # _is_guardrail_intervention recognizes, so the block is
             # classified as a policy intervention (not a backend failure)
@@ -178,7 +197,10 @@ async def async_pre_call_hook(
                 },
             )
 
-        return {**data, "messages": new_messages}
+        self._record_guardrail_logging(
+            new_data_final := {**data, "messages": new_messages}, total_counts
+        )
+        return new_data_final
 
     def _record_guardrail_logging(
         self, data: dict, total_counts: dict[str, int]
@@ -188,9 +210,7 @@ def _record_guardrail_logging(
             self.add_standard_logging_guardrail_information_to_request_data(
                 guardrail_json_response=_summary(total_counts),
                 request_data=data,
-                guardrail_status=(
-                    "guardrail_intervened" if self.action == "block" else "success"
-                ),
+                guardrail_status="guardrail_intervened",
                 masked_entity_count=dict(total_counts),
             )
         except Exception:  # noqa: BLE001 — observability must never break traffic
@@ -217,7 +237,12 @@ async def async_post_call_success_hook(
             for choice in choices:
                 message = getattr(choice, "message", None)
                 if message is not None and isinstance(message.content, str):
-                    redacted, counts = _redact_text(message.content, self.entity_types)
+                    redacted, counts = _redact_text(
+                        message.content,
+                        self.entity_types,
+                        self.allowlist,
+                        self.allowlist_patterns,
+                    )
                     if counts:
                         message.content = redacted
                 elif message is not None and message.content is not None:
diff --git a/datafog/py.typed b/datafog/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/setup.py b/setup.py
index abffa1e7..078a91bf 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,7 @@
     long_description=long_description,
     long_description_content_type="text/markdown",
     packages=find_packages(exclude=["tests", "tests.*"]),
+    package_data={"datafog": ["py.typed"]},
     install_requires=core_deps,
     extras_require=extras_require,
     python_requires=">=3.10,<3.14",
diff --git a/tests/test_allowlist.py b/tests/test_allowlist.py
new file mode 100644
index 00000000..44b24d67
--- /dev/null
+++ b/tests/test_allowlist.py
@@ -0,0 +1,98 @@
+"""Tests for scan/redact allowlist support and presidio-style entity aliases.
+
+PII literals are assembled from split parts so write-time scanners
+(including our own Claude Code hook) do not match this source file.
+"""
+
+import pytest
+
+import datafog
+
+EMAIL = "jane.doe@" "example.com"
+OTHER_EMAIL = "sid@" "example.com"
+TIMESTAMP_LIKE = "17830" "25668"  # ten digits: matches the PHONE pattern
+
+
+class TestExactAllowlist:
+    def test_allowlisted_value_is_not_reported(self):
+        result = datafog.scan(
+            f"mail {EMAIL} and {OTHER_EMAIL}",
+            engine="regex",
+            allowlist=[OTHER_EMAIL],
+        )
+        assert [e.text for e in result.entities] == [EMAIL]
+
+    def test_allowlist_is_exact_not_substring(self):
+        result = datafog.scan(
+            f"mail {EMAIL}", engine="regex", allowlist=["jane.doe"]
+        )
+        assert [e.text for e in result.entities] == [EMAIL]
+
+    def test_empty_allowlist_is_noop(self):
+        result = datafog.scan(f"mail {EMAIL}", engine="regex", allowlist=[])
+        assert len(result.entities) == 1
+
+    def test_redact_respects_allowlist(self):
+        result = datafog.redact(
+            f"mail {EMAIL} and {OTHER_EMAIL}",
+            engine="regex",
+            allowlist=[OTHER_EMAIL],
+        )
+        assert OTHER_EMAIL in result.redacted_text
+        assert EMAIL not in result.redacted_text
+
+
+class TestPatternAllowlist:
+    def test_pattern_suppresses_matching_entities(self):
+        # The motivating case: unix timestamps and numeric IDs match the
+        # PHONE pattern; a pattern allowlist can exempt all-digit strings.
+        noisy = datafog.scan(f"created {TIMESTAMP_LIKE}", engine="regex")
+        assert len(noisy.entities) == 1  # sanity: it is detected by default
+
+        result = datafog.scan(
+            f"created {TIMESTAMP_LIKE}",
+            engine="regex",
+            allowlist_patterns=[r"^\d{10}$"],
+        )
+        assert result.entities == []
+
+    def test_pattern_matches_full_entity_text_only(self):
+        result = datafog.scan(
+            f"mail {EMAIL}", engine="regex", allowlist_patterns=[r"^jane\."]
+        )
+        assert len(result.entities) == 1  # partial match must not suppress
+
+    def test_invalid_pattern_raises_value_error(self):
+        with pytest.raises(ValueError, match="allowlist_patterns"):
+            datafog.scan("text", engine="regex", allowlist_patterns=["("])
+
+    def test_patterns_and_values_combine(self):
+        result = datafog.scan(
+            f"{EMAIL} then {TIMESTAMP_LIKE}",
+            engine="regex",
+            allowlist=[EMAIL],
+            allowlist_patterns=[r"^\d{10}$"],
+        )
+        assert result.entities == []
+
+
+class TestPresidioAliases:
+    def test_email_address_alias(self):
+        result = datafog.scan(
+            f"mail {EMAIL}", engine="regex", entity_types=["EMAIL_ADDRESS"]
+        )
+        assert [e.type for e in result.entities] == ["EMAIL"]
+
+    def test_us_ssn_alias(self):
+        ssn = "856-45-" "6789"
+        result = datafog.scan(
+            f"ssn {ssn}", engine="regex", entity_types=["US_SSN"]
+        )
+        assert [e.type for e in result.entities] == ["SSN"]
+
+
+class TestPyTyped:
+    def test_py_typed_marker_ships_with_package(self):
+        import importlib.resources
+
+        assert importlib.resources.files("datafog").joinpath("py.typed").is_file()
diff --git a/tests/test_claude_code_hook.py b/tests/test_claude_code_hook.py
index c0620540..99fbae64 100644
--- a/tests/test_claude_code_hook.py
+++ b/tests/test_claude_code_hook.py
@@ -72,6 +72,36 @@ def test_entity_filter_env_enables_ip(self):
         _, stdout = run(payload, env={"DATAFOG_HOOK_ENTITIES": "IP_ADDRESS"})
         assert "IP_ADDRESS" in _decision(stdout)["permissionDecisionReason"]
 
+    def test_allowlist_env_exempts_exact_value(self):
+        own_email = "sid@" "example.com"
+        payload = _pre_tool_use("Bash", {"command": f"echo {own_email}"})
+        code, stdout = run(payload, env={"DATAFOG_HOOK_ALLOWLIST": own_email})
+        assert code == 0
+        assert stdout == ""
+
+    def test_allowlist_pattern_env_exempts_timestamps(self):
+        # Ten-digit numeric IDs and unix timestamps match the PHONE pattern;
+        # the pattern allowlist silences that class of false positive.
+        payload = _pre_tool_use("Bash", {"command": "echo created 17830" "25668"})
+        code, stdout = run(
+            payload, env={"DATAFOG_HOOK_ALLOWLIST_PATTERNS": r"^\d{10}$"}
+        )
+        assert code == 0
+        assert stdout == ""
+
+    def test_allowlist_does_not_exempt_other_values(self):
+        own_email = "sid@" "example.com"
+        other = "jane.doe@" "example.com"
+        payload = _pre_tool_use("Bash", {"command": f"echo {other}"})
+        _, stdout = run(payload, env={"DATAFOG_HOOK_ALLOWLIST": own_email})
+        assert "EMAIL" in _decision(stdout)["permissionDecisionReason"]
+
+    def test_invalid_allowlist_pattern_fails_open(self):
+        payload = _pre_tool_use("Bash", {"command": "echo jane.doe@" "example.com"})
+        code, stdout = run(payload, env={"DATAFOG_HOOK_ALLOWLIST_PATTERNS": "("})
+        assert code != 2  # fail-open, never blocking
+        assert stdout == ""
+
 
 class TestUserPromptSubmit:
     def test_pii_in_prompt_adds_context_warning(self):
diff --git a/tests/test_litellm_guardrail.py b/tests/test_litellm_guardrail.py
index c8d844f2..1d7656a4 100644
--- a/tests/test_litellm_guardrail.py
+++ b/tests/test_litellm_guardrail.py
@@ -262,6 +262,20 @@ async def test_fail_closed_raises_on_engine_error(self, monkeypatch):
                 call_type="completion",
             )
 
+    async def test_allowlist_exempts_configured_values(self):
+        own = "sid@" "example.com"
+        other = "jane.doe@" "example.com"
+        guardrail = DataFogGuardrail(guardrail_name="datafog-pii", allowlist=[own])
+        data = await guardrail.async_pre_call_hook(
+            user_api_key_dict=None,
+            cache=None,
+            data=_chat_data(f"contact {own} or {other}"),
+            call_type="completion",
+        )
+        content = data["messages"][0]["content"]
+        assert own in content
+        assert other not in content
+
     async def test_invalid_config_rejected(self):
         with pytest.raises(ValueError):
             DataFogGuardrail(guardrail_name="datafog-pii", action="explode")

From a6cf3fd9ed626c18bf1ea5585c9990c03255b291 Mon Sep 17 00:00:00 2001
From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:27:24 -0700
Subject: [PATCH 2/3] fix: harden allowlist patterns against ReDoS, document
 match semantics

Review findings: reject quantified groups containing nested quantifiers
at compile time (catastrophic backtracking on attacker-influenced entity
text), cap pattern length at 512 chars, and skip pattern matching for
entities longer than 512 chars (fail-safe: the finding is kept). Match
semantics documented as case-sensitive with no Unicode normalization;
allowlist entries are operator configuration, never end-user input.
Adds regression tests for the rejection heuristic, the smart-engine
path, and the redact(entities=..., allowlist=...) guard. Replaces a
walrus assignment with a plain one in the litellm adapter.
---
 datafog/engine.py                         | 38 ++++++++++++++++--
 datafog/integrations/litellm_guardrail.py |  7 ++--
 tests/test_allowlist.py                   | 49 ++++++++++++++++++++---
 3 files changed, 81 insertions(+), 13 deletions(-)

diff --git a/datafog/engine.py b/datafog/engine.py
index 43621257..46ed72a8 100644
--- a/datafog/engine.py
+++ b/datafog/engine.py
@@ -281,11 +281,35 @@ def _filter_entity_types(
     return [entity for entity in entities if entity.type in allowed]
 
 
+# Python's re module backtracks; a quantified group containing another
+# quantifier (e.g. ``(a+)+``) can take exponential time on adversarial
+# input, and entity text can be attacker-influenced (LLM messages, tool
+# output). Reject that construct outright rather than matching under it.
+_NESTED_QUANTIFIER = re.compile(
+    r"\((?:[^()\\]|\\.)*(?<!\\)[+*}](?:[^()\\]|\\.)*\)\s*[+*{]"
+)
+MAX_ALLOWLIST_PATTERN_LENGTH = 512
+# Entities longer than this skip pattern matching (fail-safe: the finding
+# is kept, never suppressed) so match time stays bounded.
+MAX_PATTERN_SUBJECT_LENGTH = 512
+
+
 def _compile_allowlist_patterns(
     allowlist_patterns: Optional[list[str]],
 ) -> list["re.Pattern[str]"]:
     compiled = []
     for raw in allowlist_patterns or []:
+        if len(raw) > MAX_ALLOWLIST_PATTERN_LENGTH:
+            raise ValueError(
+                "allowlist_patterns entries must be at most "
+                f"{MAX_ALLOWLIST_PATTERN_LENGTH} characters"
+            )
+        if _NESTED_QUANTIFIER.search(raw):
+            raise ValueError(
+                "allowlist_patterns contains a quantified group with a nested "
+                f"quantifier ({raw!r}), which risks catastrophic backtracking; "
+                "rewrite the pattern without nesting quantifiers"
+            )
         try:
             compiled.append(re.compile(raw))
         except re.error as exc:
@@ -302,8 +326,12 @@ def _apply_allowlist(
 ) -> list[Entity]:
     """Drop entities whose exact text is allowlisted.
 
-    Exact values match the full entity text; patterns must fullmatch it,
-    so a partial match never suppresses a finding.
+    Matching semantics, deliberately strict for a security boundary:
+    exact values are case-sensitive with no Unicode normalization, and
+    patterns must fullmatch the entity text, so a partial match never
+    suppresses a finding. Allowlist entries and patterns are operator
+    configuration; treat them like code and never accept them from end
+    users.
     """
     if not allowlist and not allowlist_patterns:
         return entities
@@ -313,7 +341,11 @@ def _apply_allowlist(
         entity
         for entity in entities
         if entity.text not in exact
-        and not any(pattern.fullmatch(entity.text) for pattern in patterns)
+        and not any(
+            pattern.fullmatch(entity.text)
+            for pattern in patterns
+            if len(entity.text) <= MAX_PATTERN_SUBJECT_LENGTH
+        )
     ]
 
 
diff --git a/datafog/integrations/litellm_guardrail.py b/datafog/integrations/litellm_guardrail.py
index eb717d97..7e33ed55 100644
--- a/datafog/integrations/litellm_guardrail.py
+++ b/datafog/integrations/litellm_guardrail.py
@@ -197,10 +197,9 @@ async def async_pre_call_hook(
                 },
             )
 
-        self._record_guardrail_logging(
-            new_data_final := {**data, "messages": new_messages}, total_counts
-        )
-        return new_data_final
+        new_data = {**data, "messages": new_messages}
+        self._record_guardrail_logging(new_data, total_counts)
+        return new_data
 
     def _record_guardrail_logging(
         self, data: dict, total_counts: dict[str, int]
diff --git a/tests/test_allowlist.py b/tests/test_allowlist.py
index 44b24d67..eb2e29c4 100644
--- a/tests/test_allowlist.py
+++ b/tests/test_allowlist.py
@@ -23,9 +23,7 @@ def test_allowlisted_value_is_not_reported(self):
         assert [e.text for e in result.entities] == [EMAIL]
 
     def test_allowlist_is_exact_not_substring(self):
-        result = datafog.scan(
-            f"mail {EMAIL}", engine="regex", allowlist=["jane.doe"]
-        )
+        result = datafog.scan(f"mail {EMAIL}", engine="regex", allowlist=["jane.doe"])
         assert [e.text for e in result.entities] == [EMAIL]
 
     def test_empty_allowlist_is_noop(self):
@@ -76,6 +74,55 @@ def test_patterns_and_values_combine(self):
         assert result.entities == []
 
 
+class TestReDoSGuards:
+    """Check that risky allowlist patterns raise ValueError.
+
+    Nested quantifiers and a 513-character pattern must be rejected; a
+    plain quantified group such as ``(abc)+`` must still be accepted.
+    """
+
+    def test_catastrophic_pattern_rejected(self):
+        with pytest.raises(ValueError, match="catastrophic backtracking"):
+            datafog.scan("text", engine="regex", allowlist_patterns=[r"(a+)+$"])
+
+    def test_nested_star_rejected(self):
+        with pytest.raises(ValueError, match="catastrophic backtracking"):
+            datafog.scan("text", engine="regex", allowlist_patterns=[r"(.*)*"])
+
+    def test_overlong_pattern_rejected(self):
+        with pytest.raises(ValueError, match="at most"):
+            datafog.scan("text", engine="regex", allowlist_patterns=["a" * 513])
+
+    def test_benign_quantified_group_still_allowed(self):
+        result = datafog.scan(
+            f"mail {EMAIL}",
+            engine="regex",
+            allowlist_patterns=[r"(abc)+", r".*@example\.com"],
+        )
+        assert result.entities == []  # broad pattern suppresses, no rejection
+
+
+class TestEnginePaths:
+    """Check allowlist handling with the smart engine and with redact()."""
+
+    def test_smart_engine_applies_allowlist(self):
+        import warnings as _warnings
+
+        with _warnings.catch_warnings():
+            _warnings.simplefilter("ignore")
+            result = datafog.scan(f"mail {EMAIL}", engine="smart", allowlist=[EMAIL])
+        assert result.entities == []
+
+    def test_redact_rejects_allowlist_with_explicit_entities(self):
+        scanned = datafog.scan(f"mail {EMAIL}", engine="regex")
+        with pytest.raises(ValueError, match="cannot be combined"):
+            datafog.redact(
+                f"mail {EMAIL}",
+                entities=scanned.entities,
+                allowlist=[EMAIL],
+            )
+
+
 class TestPresidioAliases:
     def test_email_address_alias(self):
         result = datafog.scan(
@@ -85,9 +124,7 @@ def test_email_address_alias(self):
 
     def test_us_ssn_alias(self):
         ssn = "856-45-" "6789"
-        result = datafog.scan(
-            f"ssn {ssn}", engine="regex", entity_types=["US_SSN"]
-        )
+        result = datafog.scan(f"ssn {ssn}", engine="regex", entity_types=["US_SSN"])
         assert [e.type for e in result.entities] == ["SSN"]
 
 

From 4894c8daef373976b84b8cfa673742bec4a1f137 Mon Sep 17 00:00:00 2001
From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com>
Date: Thu, 2 Jul 2026 15:29:19 -0700
Subject: [PATCH 3/3] test: cover subject-length cap fail-safe in allowlist
 pattern matching

---
 tests/test_allowlist.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tests/test_allowlist.py b/tests/test_allowlist.py
index eb2e29c4..1f3bef6e 100644
--- a/tests/test_allowlist.py
+++ b/tests/test_allowlist.py
@@ -87,6 +87,41 @@ def test_overlong_pattern_rejected(self):
         with pytest.raises(ValueError, match="at most"):
             datafog.scan("text", engine="regex", allowlist_patterns=["a" * 513])
 
+    def test_overlong_entity_text_skips_patterns_but_is_kept(self):
+        # Fail-safe: an entity too long to pattern-match safely must still
+        # be reported, never silently suppressed.
+        # NOTE(review): 600 is assumed to exceed MAX_PATTERN_SUBJECT_LENGTH;
+        # if the cap is raised to 600 or more, ".*" matches and this fails.
+        from datafog.engine import Entity, _apply_allowlist
+
+        long_entity = Entity(
+            type="EMAIL",
+            text="a" * 600,
+            start=0,
+            end=600,
+            confidence=1.0,
+            engine="regex",
+        )
+        kept = _apply_allowlist([long_entity], None, [r".*"])
+        assert kept == [long_entity]
+
+    def test_overlong_entity_text_still_matches_exact_allowlist(self):
+        # The subject-length cap only bounds regex matching; exact string
+        # comparison is O(n) and still applies.
+        # NOTE(review): assumes 600 > MAX_PATTERN_SUBJECT_LENGTH; with a larger
+        # cap this would pass without exercising the over-cap path.
+        from datafog.engine import Entity, _apply_allowlist
+
+        long_entity = Entity(
+            type="EMAIL",
+            text="a" * 600,
+            start=0,
+            end=600,
+            confidence=1.0,
+            engine="regex",
+        )
+        assert _apply_allowlist([long_entity], ["a" * 600], None) == []
+
     def test_benign_quantified_group_still_allowed(self):
         result = datafog.scan(
             f"mail {EMAIL}",