diff --git a/datafog/processing/text_processing/regex_annotator/regex_annotator.py b/datafog/processing/text_processing/regex_annotator/regex_annotator.py index 44dfa2e0..b2897560 100644 --- a/datafog/processing/text_processing/regex_annotator/regex_annotator.py +++ b/datafog/processing/text_processing/regex_annotator/regex_annotator.py @@ -104,15 +104,25 @@ def __init__( # inside a DE_VAT_ID) are resolved by the engine's span-overlap # suppression, not here, so default (EN) detection keeps v4.4.0 # behavior even when German labels are active. + # The no-dash branch has stricter boundaries than the dashed one: + # a nine-digit run embedded in a longer alphanumeric token (random + # hex IDs, UUID segments) is not an SSN. The run must not be + # followed by a letter, and must start either at a non-alphanumeric + # boundary or right after an uppercase "DE" token prefix — the one + # letter-prefixed shape pinned for v4.4.0 DE_VAT_ID parity. The + # prefix is case-sensitive ((?-i:...)) because lowercase "de" is a + # hex byte and would reopen the random-hex-ID false positive. "SSN": re.compile( r""" - (?