From 2dd51342121eb9765e6281bb035cad6a1e5b734f Mon Sep 17 00:00:00 2001
From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com>
Date: Fri, 3 Jul 2026 12:21:53 -0700
Subject: [PATCH 1/2] fix: stop no-dash SSN branch matching digit runs inside
 hex IDs

A bare nine-digit run embedded in a longer alphanumeric token (random
hex IDs, UUID segments) matched the SSN pattern because its boundaries
only excluded adjacent digits. In practice this let randomly generated
server IDs trip the Claude Code PII firewall hook and block entire
sessions.

Tighten the no-dash branch only: the run must not be followed by a
letter or digit, and must start at a non-alphanumeric boundary or right
after a two-letter token prefix (preserving v4.4.0 country-code parity,
e.g. DE123456789). The dashed branch keeps its existing digit-only
boundaries.
---
 .../regex_annotator/regex_annotator.py        | 12 ++++++++--
 tests/test_regex_annotator.py                 | 23 +++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
index 44dfa2e0..24ba8396 100644
--- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py
+++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
@@ -104,15 +104,23 @@ def __init__(
             # inside a DE_VAT_ID) are resolved by the engine's span-overlap
             # suppression, not here, so default (EN) detection keeps v4.4.0
             # behavior even when German labels are active.
+            # The no-dash branch has stricter boundaries than the dashed one:
+            # a nine-digit run embedded in a longer alphanumeric token (random
+            # hex IDs, UUID segments) is not an SSN. The run must not be
+            # followed by a letter, and must start either at a non-alphanumeric
+            # boundary or right after a two-letter token prefix (country codes
+            # like "DE123456789" — v4.4.0 parity).
             "SSN": re.compile(
                 r"""
-                (?<!\d)
                 (?:
+                    (?<!\d)
                     (?!000|666)\d{3}-(?!00)\d{2}-(?!0000)\d{4}
+                    (?!\d)
                     |
+                    (?:(?<![0-9A-Za-z])|(?<=\b[A-Za-z]{2}))
                     (?!000|666)\d{3}(?!00)\d{2}(?!0000)\d{4}
+                    (?![0-9A-Za-z])
                 )
-                (?!\d)
                 """,
                 re.IGNORECASE | re.MULTILINE | re.VERBOSE,
             ),
diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py
index ec0363c1..e12ea2e6 100644
--- a/tests/test_regex_annotator.py
+++ b/tests/test_regex_annotator.py
@@ -373,6 +373,29 @@ def test_annotation_result_format():
     assert ssn_spans[0].text == "123-45-6789"
 
 
+def test_ssn_no_dash_not_flagged_inside_alphanumeric_tokens():
+    """Regression guard: bare nine-digit runs embedded in longer
+    alphanumeric tokens (random hex IDs, UUID segments, blob names) must
+    not match SSN. A digit run counts as embedded when the token
+    continues with letters after it, or when it is preceded by a token
+    prefix that is not a bare two-letter country code (see
+    test_ssn_detection_keeps_v44_behavior_for_country_prefixed_digits)."""
+    annotator = RegexAnnotator()
+    for text in (
+        "serverId 3f2a123456789bc is up.",  # letters on both sides
+        "session 9c-3f2a123456789-77 restarted.",  # run ends a hex segment
+        "blob deadbeef123456789cafe stored.",
+        "id a1b2-123456789abc-x9",  # run starts a segment, letters after
+    ):
+        assert annotator.annotate(text)["SSN"] == [], text
+
+
+def test_ssn_no_dash_still_flagged_standalone():
+    """A bare nine-digit number that is its own token must keep matching."""
+    annotator = RegexAnnotator()
+    assert annotator.annotate("Case 219099999 assigned.")["SSN"] == ["219099999"]
+
+
 def test_ssn_detection_keeps_v44_behavior_for_country_prefixed_digits():
     """Regression guard: bare nine-digit runs after a country prefix must
     still match SSN when no locale is configured (v4.4.0 parity). The

From 8b6b9b5c9c3d27b0e8b1370750d6baf851a9b75b Mon Sep 17 00:00:00 2001
From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com>
Date: Fri, 3 Jul 2026 12:35:02 -0700
Subject: [PATCH 2/2] fix: scope no-dash SSN letter-prefix exception to
 uppercase DE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review found the two-letter-prefix exception matched any two-letter
token (ID, PO, ...), including lowercase "de" — a hex byte, which would
reopen the random-hex-ID false positive. A literal DE prefix alone would
not fix that, because the pattern is compiled with IGNORECASE. The
v4.4.0 parity test only pins uppercase DE (DE_VAT_ID overlap), so
restrict the lookbehind to a case-sensitive DE prefix, (?-i:DE), and
cover generic and lowercase prefixes in the regression test.
---
 .../text_processing/regex_annotator/regex_annotator.py |  8 +++++---
 tests/test_regex_annotator.py                          | 10 ++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
index 24ba8396..b2897560 100644
--- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py
+++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
@@ -108,8 +108,10 @@ def __init__(
             # a nine-digit run embedded in a longer alphanumeric token (random
             # hex IDs, UUID segments) is not an SSN. The run must not be
             # followed by a letter, and must start either at a non-alphanumeric
-            # boundary or right after a two-letter token prefix (country codes
-            # like "DE123456789" — v4.4.0 parity).
+            # boundary or right after an uppercase "DE" token prefix — the one
+            # letter-prefixed shape pinned for v4.4.0 DE_VAT_ID parity. The
+            # prefix is case-sensitive ((?-i:...)) because lowercase "de" is a
+            # hex byte and would reopen the random-hex-ID false positive.
             "SSN": re.compile(
                 r"""
                 (?:
@@ -117,7 +119,7 @@ def __init__(
                     (?!000|666)\d{3}-(?!00)\d{2}-(?!0000)\d{4}
                     (?!\d)
                     |
-                    (?:(?<![0-9A-Za-z])|(?<=\b[A-Za-z]{2}))
+                    (?:(?<![0-9A-Za-z])|(?<=\b(?-i:DE)))
                     (?!000|666)\d{3}(?!00)\d{2}(?!0000)\d{4}
                     (?![0-9A-Za-z])
                 )
diff --git a/tests/test_regex_annotator.py b/tests/test_regex_annotator.py
index e12ea2e6..5b2c5ce1 100644
--- a/tests/test_regex_annotator.py
+++ b/tests/test_regex_annotator.py
@@ -375,10 +375,9 @@ def test_annotation_result_format():
 
 def test_ssn_no_dash_not_flagged_inside_alphanumeric_tokens():
     """Regression guard: bare nine-digit runs embedded in longer
-    alphanumeric tokens (random hex IDs, UUID segments, blob names) must
-    not match SSN. A digit run counts as embedded when the token
-    continues with letters after it, or when it is preceded by a token
-    prefix that is not a bare two-letter country code (see
+    alphanumeric tokens (random hex IDs, UUID segments, blob names,
+    prefixed record IDs) must not match SSN. The only letter-prefixed
+    shape that still matches is an uppercase "DE" token prefix (see
     test_ssn_detection_keeps_v44_behavior_for_country_prefixed_digits)."""
     annotator = RegexAnnotator()
     for text in (
@@ -386,6 +385,9 @@ def test_ssn_no_dash_not_flagged_inside_alphanumeric_tokens():
         "session 9c-3f2a123456789-77 restarted.",  # run ends a hex segment
         "blob deadbeef123456789cafe stored.",
         "id a1b2-123456789abc-x9",  # run starts a segment, letters after
+        "order ID219099999 shipped.",  # generic two-letter record prefix
+        "ticket PO219099999 open.",
+        "hex de219099999 blob.",  # lowercase "de" is a hex byte, not a country code
     ):
         assert annotator.annotate(text)["SSN"] == [], text