DataFog · sidmohan0 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 4.5.0b5
+current_version = 4.6.0
 commit = True
 tag = True
 tag_name = v{new_version}

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -58,6 +58,7 @@ jobs:
         if: matrix.install-profile == 'nlp-advanced'
         run: |
           pip install -e ".[test,cli,nlp,nlp-advanced]" -r requirements-test.txt
+          pip install "litellm>=1.90,<2" fastapi  # exercises the LiteLLM guardrail adapter tests (proxy deployments always have fastapi)
           python -m spacy download en_core_web_lg
           datafog download-model urchade/gliner_multi_pii-v1 --engine gliner
 

diff --git a/CHANGELOG.MD b/CHANGELOG.MD
@@ -2,6 +2,34 @@
 
 ## [2026-07-02]
 
+### `datafog-python` [4.6.0]
+
+#### Added
+
+- **Claude Code hook adapter** (`datafog-hook` console script,
+  `datafog/integrations/claude_code.py`): an offline PII firewall for
+  agent tool calls. `PreToolUse` gates egress tools (Bash, WebFetch,
+  Write/Edit, MCP tools) with `ask` or `deny` decisions when tool input
+  contains PII; `UserPromptSubmit` and `PostToolUse` inject non-blocking
+  warnings. Core-only dependencies, ~70ms per invocation including
+  process startup, fail-open by design, and findings are reported as
+  entity-type counts only — matched values are never echoed. See
+  `examples/claude_code_hook/` for setup, recommended `deny`
+  configuration (survives `--dangerously-skip-permissions`), and
+  documented limitations.
+- **LiteLLM guardrail adapter**
+  (`datafog.integrations.litellm_guardrail.DataFogGuardrail`): redact or
+  block PII at the gateway for any LiteLLM-proxied provider. `pre_call`
+  rewrites request messages in place (`[EMAIL_1]`-style tokens) before
+  egress or rejects with HTTP 400; `post_call` redacts model responses.
+  Configurable `fail_policy` (`open`/`closed`) and entity types;
+  in-process at ~31µs per request — no sidecar service. See
+  `examples/litellm_guardrail/`.
+
+Both adapters default to the high-precision entity set (`EMAIL`, `PHONE`,
+`CREDIT_CARD`, `SSN`); noisier types (`IP_ADDRESS`, `DOB`, `ZIP`) are
+opt-in. No changes to the core library or its dependencies.
+
 ### `datafog-python` [4.5.0]
 
 #### Behavior Changes Since 4.4.0

diff --git a/datafog/__about__.py b/datafog/__about__.py
@@ -1 +1 @@
-__version__ = "4.5.0"
+__version__ = "4.6.0"
diff --git a/datafog/integrations/__init__.py b/datafog/integrations/__init__.py
@@ -0,0 +1 @@
+"""Adapters that embed DataFog into agent harnesses and pipelines."""
diff --git a/datafog/integrations/claude_code.py b/datafog/integrations/claude_code.py
@@ -0,0 +1,183 @@
+"""Claude Code hook adapter: an offline PII firewall for agent tool calls.
+
+Speaks the Claude Code hooks protocol (JSON on stdin, JSON on stdout):
+
+- ``PreToolUse``   — gate outbound tool calls (Bash, WebFetch, Write, MCP
+  tools). PII in the tool input yields an ``ask`` (default) or ``deny``
+  permission decision, so data is stopped *before* it leaves the machine.
+- ``UserPromptSubmit`` — non-blocking: warns the model that the prompt
+  contains PII so it avoids repeating it in output or logs.
+- ``PostToolUse``  — non-blocking: warns when a tool result carries PII
+  into the conversation context.
+
+Configuration (environment variables):
+
+- ``DATAFOG_HOOK_ACTION``: ``ask`` (default) or ``deny`` for PreToolUse.
+- ``DATAFOG_HOOK_ENTITIES``: comma-separated entity types to detect.
+  Defaults to the high-precision set; noisy-in-code types (IP_ADDRESS,
+  DOB, ZIP) must be opted into.
+
+Failure policy: fail open. A hook bug must never brick a Claude Code session:
+any unexpected error exits non-blocking (message on stderr, empty stdout).
+"""
+
+import json
+import os
+import sys
+from typing import Any, Iterator, Mapping
+
+# High-precision defaults. IP_ADDRESS, DOB, and ZIP are deliberately
+# excluded: version strings, dates, and 5-digit numbers saturate coding
+# sessions and would make the firewall cry wolf (see DFPY-110).
+DEFAULT_ENTITY_TYPES = ["EMAIL", "PHONE", "CREDIT_CARD", "SSN"]
+
+VALID_ACTIONS = {"ask", "deny"}  # accepted DATAFOG_HOOK_ACTION values
+
+# Per-string scan cap, so a huge file write can't stall the hook. Applied
+# per string (not shared across the payload) so a padding field can't starve
+# the scan of later fields; TOTAL_SCAN_CHARS bounds the worst case overall.
+MAX_SCAN_CHARS = 1_000_000
+TOTAL_SCAN_CHARS = 8_000_000  # combined cap across all strings in one payload
+
+_EXIT_OK = 0  # normal exit, whether or not a finding was reported
+# Exit 1 is Claude Code's non-blocking error: stderr is shown to the user,
+# the tool call proceeds. Never exit 2 (blocking) on our own failures.
+_EXIT_FAIL_OPEN = 1
+
+
+def _entity_types(env: Mapping[str, str]) -> list[str]:
+    """Return the entity types to detect, from DATAFOG_HOOK_ENTITIES."""
+    raw = env.get("DATAFOG_HOOK_ENTITIES", "")
+    parsed = [t.strip().upper() for t in raw.split(",") if t.strip()]
+    # Empty parse (unset, or " , ") -> defaults; [] downstream would disable
+    # filtering. Copy so callers cannot mutate the module-level default list.
+    return parsed or list(DEFAULT_ENTITY_TYPES)
+
+
+def _action(env: Mapping[str, str]) -> str:
+    requested = env.get("DATAFOG_HOOK_ACTION", "ask").strip().lower()
+    return "ask" if requested not in VALID_ACTIONS else requested  # unknown -> ask
+
+
+def _iter_strings(value: Any) -> Iterator[str]:
+    """Yield every string in a JSON-like structure, dict keys included.
+
+    Keys are yielded too, so PII cannot hide in an object key. Iterative
+    (explicit stack): deep nesting cannot trigger ``RecursionError``.
+    """
+    pending = [value]
+    while pending:
+        node = pending.pop()
+        if isinstance(node, str):
+            yield node
+        elif isinstance(node, dict):
+            pending.extend([*node, *node.values()])  # keys, then values
+        elif isinstance(node, (list, tuple)):
+            pending.extend(node)
+
+
+def _scan_findings(value: Any, entity_types: list[str]) -> dict[str, int]:
+    """Count regex-engine findings per entity type over the strings in ``value``."""
+    import datafog
+
+    tally: dict[str, int] = {}
+    remaining = TOTAL_SCAN_CHARS  # character budget shared by the whole payload
+    for text in _iter_strings(value):
+        if remaining <= 0:
+            break
+        piece = text[: min(MAX_SCAN_CHARS, remaining)]  # per-string cap applies too
+        remaining -= len(piece)
+        found = datafog.scan(piece, engine="regex", entity_types=entity_types)
+        for entity in found.entities:
+            tally[entity.type] = 1 + tally.get(entity.type, 0)
+    return tally
+
+
+def _summary(counts: dict[str, int]) -> str:
+    """Format counts as ``TYPE xN`` items sorted by type; no matched text."""
+    ordered = sorted(counts.items())
+    return ", ".join(f"{etype} x{n}" for etype, n in ordered)
+
+
+def _emit(event: str, fields: dict[str, Any]) -> str:
+    return json.dumps(dict(hookSpecificOutput={"hookEventName": event, **fields}))
+
+
+def _handle_pre_tool_use(payload: dict, env: Mapping[str, str]) -> str:
+    """Return PreToolUse decision JSON if ``tool_input`` has findings, else empty."""
+    findings = _scan_findings(payload.get("tool_input"), _entity_types(env))
+    if not findings:
+        return ""
+    # The reason names entity types and counts only, never the matched text.
+    reason = (
+        f"DataFog PII firewall: {payload.get('tool_name', 'tool')} input contains "
+        f"{_summary(findings)}. "
+        "Redact or tokenize these values before sending them anywhere."
+    )
+    decision = {"permissionDecision": _action(env), "permissionDecisionReason": reason}
+    return _emit("PreToolUse", decision)
+
+
+def _handle_user_prompt_submit(payload: dict, env: Mapping[str, str]) -> str:
+    """Return a UserPromptSubmit context note if ``prompt`` has findings."""
+    findings = _scan_findings(payload.get("prompt"), _entity_types(env))
+    if not findings:
+        return ""
+    # Non-blocking: only additional context is emitted, no permission decision.
+    lead = f"DataFog PII firewall: the user's prompt contains {_summary(findings)}. "
+    advice = "Avoid repeating these values verbatim in responses, code, or files."
+    return _emit("UserPromptSubmit", {"additionalContext": lead + advice})
+
+
+def _handle_post_tool_use(payload: dict, env: Mapping[str, str]) -> str:
+    """Return a PostToolUse context note if ``tool_response`` has findings."""
+    findings = _scan_findings(payload.get("tool_response"), _entity_types(env))
+    if not findings:
+        return ""
+    # Non-blocking: only additional context is emitted, no permission decision.
+    tool_name = payload.get("tool_name", "tool")
+    lead = f"DataFog PII firewall: {tool_name} output contains {_summary(findings)}. "
+    advice = "Avoid repeating these values verbatim in responses, code, or files."
+    return _emit("PostToolUse", {"additionalContext": lead + advice})
+
+
+_HANDLERS = {  # hook_event_name -> handler(payload, env) returning stdout text
+    "PreToolUse": _handle_pre_tool_use,
+    "UserPromptSubmit": _handle_user_prompt_submit,
+    "PostToolUse": _handle_post_tool_use,
+}
+
+
+def run(payload: dict, env: Mapping[str, str]) -> tuple[int, str]:
+    """Dispatch one hook payload; return ``(exit_code, stdout)``. Fails open."""
+    try:
+        handle = _HANDLERS.get(payload.get("hook_event_name", ""))
+        # Events without a registered handler succeed with empty stdout.
+        output = "" if handle is None else handle(payload, env)
+        return _EXIT_OK, output
+    except Exception as exc:  # noqa: BLE001 — fail open by design
+        print(f"datafog-hook error (fail-open): {exc}", file=sys.stderr)
+        return _EXIT_FAIL_OPEN, ""
+
+
+def main() -> None:
+    """Console entry point: ``datafog-hook``."""
+    # Parsing is guarded as broadly as the handler: even RecursionError from
+    # json.load on an adversarially nested payload must fail open, because
+    # the contract covers the whole process.
+    try:
+        parsed = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001 — fail open by design
+        print(f"datafog-hook: invalid hook payload (fail-open): {exc}", file=sys.stderr)
+        sys.exit(_EXIT_FAIL_OPEN)
+    # Valid JSON that is not an object is handled as an empty payload.
+    payload = parsed if isinstance(parsed, dict) else {}
+
+    exit_code, output = run(payload, os.environ)
+    if output:
+        print(output)
+    sys.exit(exit_code)
+
+
+if __name__ == "__main__":
+    main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Adapters that embed DataFog into agent harnesses and pipelines."""