Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 4.7.0
current_version = 4.7.1
commit = True
tag = True
tag_name = v{new_version}
Expand Down
22 changes: 22 additions & 0 deletions CHANGELOG.MD
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,28 @@

## [2026-07-02]

### `datafog-python` [4.7.1]

#### Behavior change (precision)

- **Bare digit runs are no longer treated as SSN or PHONE by default.**
A nine-digit run with no delimiter and a ten-digit run with no formatting
are no longer matched as `SSN`/`PHONE`. Hyphen-delimited SSNs
(`NNN-NN-NNNN`) and formatted phone numbers (separators, parentheses, or a
`+country` prefix) still match, and space-delimited SSNs (`NNN NN NNNN`)
are now accepted as well. This fixes the dominant false-positive
class in structured tool output — tab ids, database row ids, epoch
timestamps, ticket numbers — reported in
[#158](https://github.com/DataFog/datafog-python/issues/158).
- **Migration:** to restore the previous behavior (undelimited nine-digit
SSNs and bare ten-digit phone numbers), pass `strict_numeric=False` to
`scan()` / `redact()`. The Claude Code hook and LiteLLM guardrail run in
strict mode (`strict_numeric=True`); the hook additionally supports an
`^\d{9}$|^\d{10}$` allowlist pattern as an additional safeguard.
- Broader structural validation (SSA area/group ranges, NANP area/exchange
rules) is deferred to the v5 validator layer; this release only changes
the delimiter/formatting requirement.

### `datafog-python` [4.7.0]

#### Added
Expand Down
2 changes: 1 addition & 1 deletion datafog/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "4.7.0"
__version__ = "4.7.1"
13 changes: 11 additions & 2 deletions datafog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def scan(
locales: list[str] | None = None,
allowlist: list[str] | None = None,
allowlist_patterns: list[str] | None = None,
strict_numeric: bool = True,
) -> ScanResult:
"""
v5-preview scan entrypoint.
Expand All @@ -165,7 +166,10 @@ def scan(
``allowlist`` exempts exact entity texts (your own support address, doc
placeholders); ``allowlist_patterns`` exempts entities whose full text
matches a regex (e.g. ``^\\d{10}$`` so unix timestamps stop matching as
phone numbers).
phone numbers). ``strict_numeric`` (default True) requires SSNs to be
delimited and phone numbers to carry formatting cues, so bare digit runs
(tab ids, row ids, timestamps) are not flagged; set False to also detect
undelimited nine-digit SSNs and bare ten-digit phone numbers.
"""
return _scan(
text=text,
Expand All @@ -174,6 +178,7 @@ def scan(
locales=locales,
allowlist=allowlist,
allowlist_patterns=allowlist_patterns,
strict_numeric=strict_numeric,
)


Expand All @@ -187,6 +192,7 @@ def redact(
locales: list[str] | None = None,
allowlist: list[str] | None = None,
allowlist_patterns: list[str] | None = None,
strict_numeric: bool = True,
) -> RedactResult:
"""
v5-preview redaction entrypoint.
Expand All @@ -195,7 +201,9 @@ def redact(
using the selected engine and redact the detected entities. ``allowlist``
and ``allowlist_patterns`` exempt findings from redaction (exact text and
full-text regex match respectively); they apply to the scan path and are
rejected when explicit ``entities`` are supplied.
rejected when explicit ``entities`` are supplied. ``strict_numeric``
matches ``scan``: bare digit runs are not treated as SSN/PHONE unless it
is set False.
"""
if preset is not None:
try:
Expand All @@ -220,6 +228,7 @@ def redact(
locales=locales,
allowlist=allowlist,
allowlist_patterns=allowlist_patterns,
strict_numeric=strict_numeric,
)


Expand Down
9 changes: 2 additions & 7 deletions datafog/__init___lean.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,10 @@
"""

from .__about__ import __version__

# Core imports - always available
from .models.annotator import AnnotationResult, AnnotatorRequest
from .models.anonymizer import (
AnonymizationResult,
Anonymizer,
AnonymizerRequest,
AnonymizerType,
)
from .models.anonymizer import (AnonymizationResult, Anonymizer,
AnonymizerRequest, AnonymizerType)
from .models.common import EntityTypes
from .processing.text_processing.regex_annotator import RegexAnnotator

Expand Down
23 changes: 9 additions & 14 deletions datafog/__init___original.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,18 @@
from .client import app
from .config import OperationType, get_config
from .main import DataFog, TextPIIAnnotator
from .models.annotator import (
AnalysisExplanation,
AnnotationResult,
AnnotationResultWithAnaysisExplanation,
AnnotatorRequest,
)
from .models.anonymizer import (
AnonymizationResult,
Anonymizer,
AnonymizerRequest,
AnonymizerType,
)
from .models.common import AnnotatorMetadata, EntityTypes, Pattern, PatternRecognizer
from .models.annotator import (AnalysisExplanation, AnnotationResult,
AnnotationResultWithAnaysisExplanation,
AnnotatorRequest)
from .models.anonymizer import (AnonymizationResult, Anonymizer,
AnonymizerRequest, AnonymizerType)
from .models.common import (AnnotatorMetadata, EntityTypes, Pattern,
PatternRecognizer)
from .models.spacy_nlp import SpacyAnnotator
from .processing.image_processing.donut_processor import DonutProcessor
from .processing.image_processing.image_downloader import ImageDownloader
from .processing.image_processing.pytesseract_processor import PytesseractProcessor
from .processing.image_processing.pytesseract_processor import \
PytesseractProcessor
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
from .services.image_service import ImageService
from .services.spark_service import SparkService
Expand Down
10 changes: 4 additions & 6 deletions datafog/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,8 @@ def download_model(

elif engine == "gliner":
try:
from datafog.processing.text_processing.gliner_annotator import (
GLiNERAnnotator,
)
from datafog.processing.text_processing.gliner_annotator import \
GLiNERAnnotator

GLiNERAnnotator.download_model(model_name)
typer.echo(f"GLiNER model {model_name} downloaded and cached successfully.")
Expand Down Expand Up @@ -310,9 +309,8 @@ def list_entities():
typer.echo(annotator.list_entities())
except ModuleNotFoundError as e:
try:
from .processing.text_processing.spacy_pii_annotator import (
PII_ANNOTATION_LABELS,
)
from .processing.text_processing.spacy_pii_annotator import \
PII_ANNOTATION_LABELS

typer.echo(PII_ANNOTATION_LABELS)
except Exception:
Expand Down
16 changes: 6 additions & 10 deletions datafog/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,9 @@ def detect_pii(
pii_dict[entity.type].append(entity.text)

try:
from datafog.telemetry import (
_get_duration_bucket,
_get_text_length_bucket,
track_function_call,
)
from datafog.telemetry import (_get_duration_bucket,
_get_text_length_bucket,
track_function_call)

_duration = (_time.monotonic() - _start) * 1000
entity_count = sum(len(v) for v in pii_dict.values())
Expand Down Expand Up @@ -132,11 +130,9 @@ def anonymize_text(
)

try:
from datafog.telemetry import (
_get_duration_bucket,
_get_text_length_bucket,
track_function_call,
)
from datafog.telemetry import (_get_duration_bucket,
_get_text_length_bucket,
track_function_call)

_duration = (_time.monotonic() - _start) * 1000
track_function_call(
Expand Down
15 changes: 14 additions & 1 deletion datafog/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,13 @@ def _regex_entities(
text: str,
entity_types: Optional[list[str]] = None,
locales: Optional[list[str]] = None,
strict_numeric: bool = True,
) -> list[Entity]:
annotator = RegexAnnotator(locales=locales, enabled_labels=entity_types)
annotator = RegexAnnotator(
locales=locales,
enabled_labels=entity_types,
strict_numeric=strict_numeric,
)
_, structured = annotator.annotate_with_spans(text)
entities: list[Entity] = []
for span in structured.spans:
Expand Down Expand Up @@ -363,12 +368,17 @@ def scan(
locales: Optional[list[str]] = None,
allowlist: Optional[list[str]] = None,
allowlist_patterns: Optional[list[str]] = None,
strict_numeric: bool = True,
) -> ScanResult:
"""Scan text for PII entities.

``allowlist`` exempts exact entity texts (e.g. your own support email);
``allowlist_patterns`` exempts entities whose full text matches a regex
(e.g. ``^\\d{10}$`` to stop unix timestamps matching as phone numbers).
``strict_numeric`` (default True) requires SSNs to be delimited and phone
numbers to carry formatting cues, so bare digit runs are not matched; set
False to also detect undelimited nine-digit SSNs and bare ten-digit phone
numbers.
"""
if not isinstance(text, str):
raise TypeError("text must be a string")
Expand All @@ -384,6 +394,7 @@ def scan(
text,
entity_types=entity_types,
locales=locales,
strict_numeric=strict_numeric,
)

if engine == "regex":
Expand Down Expand Up @@ -524,6 +535,7 @@ def scan_and_redact(
locales: Optional[list[str]] = None,
allowlist: Optional[list[str]] = None,
allowlist_patterns: Optional[list[str]] = None,
strict_numeric: bool = True,
) -> RedactResult:
"""Convenience wrapper: scan then redact."""
scan_result = scan(
Expand All @@ -533,5 +545,6 @@ def scan_and_redact(
locales=locales,
allowlist=allowlist,
allowlist_patterns=allowlist_patterns,
strict_numeric=strict_numeric,
)
return redact(text=text, entities=scan_result.entities, strategy=strategy)
11 changes: 5 additions & 6 deletions datafog/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ def run_text_pipeline_sync(self, str_list: List[str]) -> List:
_pipeline_result = str_list

try:
from .telemetry import _get_duration_bucket, track_function_call
from .telemetry import (_get_duration_bucket,
track_function_call)

_duration = (_time.monotonic() - _start) * 1000
track_function_call(
Expand Down Expand Up @@ -194,11 +195,9 @@ def detect(self, text: str) -> dict:
result.setdefault(label, []).append(entity.text)

try:
from .telemetry import (
_get_duration_bucket,
_get_text_length_bucket,
track_function_call,
)
from .telemetry import (_get_duration_bucket,
_get_text_length_bucket,
track_function_call)

_duration = (_time.monotonic() - _start) * 1000
entity_count = sum(len(v) for v in result.values())
Expand Down
3 changes: 2 additions & 1 deletion datafog/processing/image_processing/donut_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ async def extract_text_from_image(self, image: "Image.Image") -> str:
raise ImportError(self._missing_dependency_message("torch")) from exc

try:
from transformers import DonutProcessor as TransformersDonutProcessor
from transformers import \
DonutProcessor as TransformersDonutProcessor
from transformers import VisionEncoderDecoderModel
except ImportError as e:
raise ImportError(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from datafog.processing.text_processing.regex_annotator.regex_annotator import (
AnnotationResult,
RegexAnnotator,
Span,
)
AnnotationResult, RegexAnnotator, Span)

__all__ = ["RegexAnnotator", "Span", "AnnotationResult"]
Loading
Loading