Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,25 @@ def __init__(
# inside a DE_VAT_ID) are resolved by the engine's span-overlap
# suppression, not here, so default (EN) detection keeps v4.4.0
# behavior even when German labels are active.
# The no-dash branch has stricter boundaries than the dashed one:
# a nine-digit run embedded in a longer alphanumeric token (random
# hex IDs, UUID segments) is not an SSN. The run must not be
# followed by a letter, and must start either at a non-alphanumeric
# boundary or right after an uppercase "DE" token prefix — the one
# letter-prefixed shape pinned for v4.4.0 DE_VAT_ID parity. The
# prefix is case-sensitive ((?-i:...)) because lowercase "de" is a
# hex byte and would reopen the random-hex-ID false positive.
"SSN": re.compile(
r"""
(?<!\d)
(?:
(?<!\d)
(?!000|666)\d{3}-(?!00)\d{2}-(?!0000)\d{4}
(?!\d)
|
(?:(?<![0-9A-Za-z])|(?<=\b(?-i:DE)))
(?!000|666)\d{3}(?!00)\d{2}(?!0000)\d{4}
(?![0-9A-Za-z])
)
(?!\d)
""",
re.IGNORECASE | re.MULTILINE | re.VERBOSE,
),
Expand Down
25 changes: 25 additions & 0 deletions tests/test_regex_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,31 @@ def test_annotation_result_format():
assert ssn_spans[0].text == "123-45-6789"


def test_ssn_no_dash_not_flagged_inside_alphanumeric_tokens():
    """Guard against SSN hits on nine-digit runs buried in larger tokens.

    A dashless nine-digit run that is part of a longer alphanumeric token
    (random hex IDs, UUID segments, blob names, prefixed record IDs) must
    yield no SSN annotation. The single letter-prefixed shape that is still
    expected to match is an uppercase "DE" token prefix, which is covered by
    test_ssn_detection_keeps_v44_behavior_for_country_prefixed_digits.
    """
    embedded_samples = (
        # Digits surrounded by letters on both sides.
        "serverId 3f2a123456789bc is up.",
        # The digit run closes out a hex segment.
        "session 9c-3f2a123456789-77 restarted.",
        "blob deadbeef123456789cafe stored.",
        # The digit run opens a segment and is followed by letters.
        "id a1b2-123456789abc-x9",
        # Generic two-letter record prefixes.
        "order ID219099999 shipped.",
        "ticket PO219099999 open.",
        # Lowercase "de" reads as a hex byte rather than a country code.
        "hex de219099999 blob.",
    )
    annotator = RegexAnnotator()
    for sample in embedded_samples:
        ssn_hits = annotator.annotate(sample)["SSN"]
        # The sample itself is the failure message so the offending input
        # is visible in the test report.
        assert ssn_hits == [], sample


def test_ssn_no_dash_still_flagged_standalone():
    """Check that a standalone dashless nine-digit token is still an SSN hit."""
    annotator = RegexAnnotator()
    annotations = annotator.annotate("Case 219099999 assigned.")
    # The number is delimited by spaces, so it forms its own token and
    # must be reported verbatim.
    assert annotations["SSN"] == ["219099999"]


def test_ssn_detection_keeps_v44_behavior_for_country_prefixed_digits():
"""Regression guard: bare nine-digit runs after a country prefix must
still match SSN when no locale is configured (v4.4.0 parity). The
Expand Down