From 2dd51342121eb9765e6281bb035cad6a1e5b734f Mon Sep 17 00:00:00 2001
From: Sid Mohan <61345237+sidmohan0@users.noreply.github.com>
Date: Fri, 3 Jul 2026 12:21:53 -0700
Subject: [PATCH 1/2] fix: stop no-dash SSN branch matching digit runs inside hex IDs

A bare nine-digit run embedded in a longer alphanumeric token (random
hex IDs, UUID segments) matched the SSN pattern because its boundaries
only excluded adjacent digits. In practice this let randomly generated
server IDs trip the Claude Code PII firewall hook and block entire
sessions.

Tighten the no-dash branch only: the run must not be followed by a
letter, and must start at a non-alphanumeric boundary or right after a
two-letter token prefix (preserving v4.4.0 country-code parity, e.g.
DE123456789). The dashed branch keeps its existing boundaries.
---
 .../regex_annotator/regex_annotator.py | 12 ++++++++--
 tests/test_regex_annotator.py | 23 +++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
index 44dfa2e0..24ba8396 100644
--- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py
+++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
@@ -104,15 +104,23 @@ def __init__(
 # inside a DE_VAT_ID) are resolved by the engine's span-overlap
 # suppression, not here, so default (EN) detection keeps v4.4.0
 # behavior even when German labels are active.
+ # The no-dash branch has stricter boundaries than the dashed one:
+ # a nine-digit run embedded in a longer alphanumeric token (random
+ # hex IDs, UUID segments) is not an SSN. The run must not be
+ # followed by a letter, and must start either at a non-alphanumeric
+ # boundary or right after a two-letter token prefix (country codes
+ # like "DE123456789" — v4.4.0 parity).
 "SSN": re.compile(
 r"""
 (?
Date: Fri, 3 Jul 2026 12:35:02 -0700
Subject: [PATCH 2/2] fix: scope no-dash SSN letter-prefix exception to uppercase DE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Review found the two-letter-prefix exception matched any two-letter
token (ID, PO, ...), and via IGNORECASE also lowercase "de" — a hex
byte, which would reopen the random-hex-ID false positive. The v4.4.0
parity test only pins uppercase DE (DE_VAT_ID overlap), so restrict the
lookbehind to a case-sensitive DE prefix and cover generic and lowercase
prefixes in the regression test.
---
 .../text_processing/regex_annotator/regex_annotator.py | 8 +++++---
 tests/test_regex_annotator.py | 10 ++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
index 24ba8396..b2897560 100644
--- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py
+++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py
@@ -108,8 +108,10 @@ def __init__(
 # a nine-digit run embedded in a longer alphanumeric token (random
 # hex IDs, UUID segments) is not an SSN. The run must not be
 # followed by a letter, and must start either at a non-alphanumeric
- # boundary or right after a two-letter token prefix (country codes
- # like "DE123456789" — v4.4.0 parity).
+ # boundary or right after an uppercase "DE" token prefix — the one
+ # letter-prefixed shape pinned for v4.4.0 DE_VAT_ID parity. The
+ # prefix is case-sensitive ((?-i:...)) because lowercase "de" is a
+ # hex byte and would reopen the random-hex-ID false positive.
 "SSN": re.compile(
 r"""
 (?:
@@ -117,7 +119,7 @@ def __init__(
 (?!000|666)\d{3}-(?!00)\d{2}-(?!0000)\d{4}
 (?!\d)
 |
- (?:(?