Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
3e2e84e
feat(version-scanner): support target list inputs via --targets
chalmerlowe Jun 16, 2026
d26715a
feat(version-scanner): simplify targets list input to accept only YAM…
chalmerlowe Jun 17, 2026
156f2b8
test(version-scanner): parametrize targets file failure tests
chalmerlowe Jun 17, 2026
f1c47bb
test(version-scanner): refactor formatting tests to use a shared samp…
chalmerlowe Jun 17, 2026
3c0b8fe
chore(version-scanner): move sample_match fixture to the top of test …
chalmerlowe Jun 17, 2026
00d409d
refactor(version-scanner): consolidate file reading and error handlin…
chalmerlowe Jun 17, 2026
437a13f
test(version-scanner): add parametrized unit tests for _safe_read_fil…
chalmerlowe Jun 17, 2026
5011c0b
chore(version-scanner): configure GHA to use targets file for multi-v…
chalmerlowe Jun 23, 2026
8fc7e61
chore(version-scanner): limit GHA workflow to scan only handwritten a…
chalmerlowe Jun 23, 2026
9f612da
refactor(version-scanner): rename targets file to matrix file to reso…
chalmerlowe Jun 23, 2026
d835491
refactor(version-scanner): rename package list file to example-list-n…
chalmerlowe Jun 23, 2026
66eabc1
fix(version-scanner): address reviewer feedback regarding encoding, f…
chalmerlowe Jun 23, 2026
055975c
feat(version-scanner): support globbing and subpath patterns in ignor…
chalmerlowe Jun 23, 2026
8ca523c
feat(version-scanner): fix root anchoring in ignores, parametrize tes…
chalmerlowe Jun 24, 2026
d2d2d6c
perf(version-scanner): optimize ignore logic and exclude caches/noise
chalmerlowe Jun 24, 2026
ea7eb92
Merge branch 'main' into feat/version-scanner-pr2
chalmerlowe Jun 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions scripts/version_scanner/.scannerignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,14 @@ repositories.bzl
*.png
*.gif
*.ico
*.pdf

# Ignore caches and temporary directories
.ruff_cache
.pytest_cache
.mypy_cache
.coverage
.htmlcov
htmlcov

# Ignore data files
*.csv
28 changes: 24 additions & 4 deletions scripts/version_scanner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,32 @@ pip install -r scripts/version_scanner/requirements.txt

The scanner uses a YAML configuration file (`regex_pattern_config.yaml`) to define rules and regex patterns.

## Ignoring Directories
## Matrix File Format

You can create a `.scannerignore` file in the directory you are scanning (usually the repo root) to list directories to skip, one per line.
When using `--matrix-file`, you must provide a YAML file specifying dependencies and versions.

## Known Issues & Future Investigations
- **Binary Ignores in `.scannerignore`**: Recursive wildcard ignores (e.g., `*.jpg`) currently do not effectively ignore deeply nested binary files. The scanner logic should be investigated to support robust globbing or full-path suffix matching.
### Example
```yaml
python:
- "3.10"
- "3.11"
protobuf: "4.25.8"
```

> [!IMPORTANT]
> **Versions must be specified as quoted strings** (e.g., `"3.10"`, not `3.10`). This prevents YAML parsers from converting them to floats (which would truncate `3.10` to `3.1`).

## Ignoring Directories and Files

To ignore files or entire directories, add ignore patterns, one per line, to the `.scannerignore` file located in the same directory as the script (`scripts/version_scanner/.scannerignore`).

### Features
- **Case-insensitive**: All patterns are matched case-insensitively.
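- **Globbing**: Supports `fnmatch`-style glob patterns (e.g., `*.jpg`, `test_*`). A pattern without a slash is matched against the file or directory name, so it applies at any depth. Note that, unlike in a shell, `*` also matches `/`.
- **Subpaths**: You can specify subpaths (e.g., `packages/pkg_a/.nox`). A subpath pattern matches at the root of the scan or beneath any parent directory.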
- **Root Anchoring**: Patterns starting with a slash `/` are anchored to the root of the scan (e.g., `/packages` ignores the `packages` directory at root, but not `some/other/packages`).

---

---

Expand Down
44 changes: 44 additions & 0 deletions scripts/version_scanner/tests/unit/test_version_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,50 @@ def test_scan_repository_ignores_version_scanner(tmp_path):
assert len(results) == 0


def test_scan_repository_wildcard_ignores(tmp_path):
    """A ``*.jpg`` ignore pattern drops matching files and nothing else.

    The scan is run twice over the same tree: once without ignores to prove
    the ``.jpg`` file is picked up at all, and once with ``*.jpg`` ignored to
    prove it disappears while the ``.py`` file is still reported.
    """
    from version_scanner import scan_repository

    # Create files: both contain text that at least one rule below matches.
    (tmp_path / "test.jpg").write_text("dummy version 3.7\n")
    (tmp_path / "test.py").write_text("python_requires = '>=3.7'\n")

    rules = [
        {"name": "python_requires_check", "pattern": "python_requires\\s*=\\s*['\"]>=3\\.7['\"]"},
        {"name": "explicit_version_string", "pattern": "3\\.7"},
    ]

    # Without ignore
    results = scan_repository(str(tmp_path), rules)
    assert len(results) >= 2
    # The baseline must actually contain the .jpg hit; otherwise the
    # "ignored" assertions below would pass without exercising the filter.
    assert any(match["file_path"].endswith("test.jpg") for match in results)

    # With wildcard ignore for *.jpg
    results_ignored = scan_repository(str(tmp_path), rules, ignore_dirs=["*.jpg"])
    # test.jpg should be ignored completely
    assert not any(
        match["file_path"].endswith("test.jpg") for match in results_ignored
    )
    # ...while unrelated files are still scanned (guards against an empty
    # result set satisfying the check above vacuously).
    assert any(match["file_path"].endswith("test.py") for match in results_ignored)


# Shared pattern set covering each pattern class except root-anchored ones:
# plain names, name globs, and a multi-segment subpath.
DEFAULT_IGNORE_PATTERNS = [".git", "*.jpg", "packages/pkg_a/.nox", "*.egg-info"]


@pytest.mark.parametrize(
    "rel_path, name, ignore_patterns, expected",
    [
        pytest.param(".git", ".git", DEFAULT_IGNORE_PATTERNS, True, id="exact_match"),
        pytest.param(".GIT", ".GIT", DEFAULT_IGNORE_PATTERNS, True, id="case_insensitive_match"),
        pytest.param("some/path/image.jpg", "image.jpg", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_subpath_match"),
        pytest.param("image.JPG", "image.JPG", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_case_insensitive_match"),
        pytest.param("packages/pkg_a/.nox", ".nox", DEFAULT_IGNORE_PATTERNS, True, id="subpath_exact_match"),
        # A subpath pattern also applies when it sits below a parent directory.
        pytest.param("vendor/packages/pkg_a/.nox", ".nox", DEFAULT_IGNORE_PATTERNS, True, id="subpath_nested_match"),
        # Same basename, different subpath: must not be ignored.
        pytest.param("packages/pkg_b/.nox", ".nox", DEFAULT_IGNORE_PATTERNS, False, id="subpath_other_package_no_match"),
        pytest.param("google_cloud_pubsub.egg-info", "google_cloud_pubsub.egg-info", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_directory_match"),
        pytest.param("setup.py", "setup.py", DEFAULT_IGNORE_PATTERNS, False, id="no_match"),
        pytest.param("setup.py", "setup.py", [], False, id="empty_patterns_no_match"),
        pytest.param("packages", "packages", ["/packages"], True, id="anchored_root_match"),
        pytest.param("some/other/packages", "packages", ["/packages"], False, id="anchored_root_nested_no_match"),
    ],
)
def test__should_ignore(rel_path, name, ignore_patterns, expected):
    """Run raw patterns through preprocessing and check the ignore decision."""
    from version_scanner import _should_ignore, _preprocess_ignore_patterns

    preprocessed = _preprocess_ignore_patterns(ignore_patterns)
    assert _should_ignore(rel_path, name, preprocessed) is expected


def test_load_ignore_file(tmp_path):
from version_scanner import load_ignore_file

Expand Down
80 changes: 75 additions & 5 deletions scripts/version_scanner/version_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import argparse
import csv
import datetime
import fnmatch
Comment thread
chalmerlowe marked this conversation as resolved.
import os
import re
import sys
Expand Down Expand Up @@ -498,6 +499,63 @@ def read_package_file(file_path: str) -> List[str]:
return packages


def _preprocess_ignore_patterns(ignore_patterns: List[str]) -> List[Tuple[str, str]]:
"""Preprocesses ignore patterns into a classified list for faster matching.

Args:
ignore_patterns: A list of raw ignore patterns from .scannerignore.

Returns:
A list of tuples (type, pattern) where type is 'anchored', 'subpath', or 'filename'.
"""
if not ignore_patterns:
return []

preprocessed = []
for pattern in ignore_patterns:
pattern_lower = pattern.lower()
if '/' in pattern:
if pattern_lower.startswith('/'):
preprocessed.append(('anchored', pattern_lower[1:]))
else:
preprocessed.append(('subpath', pattern_lower))
else:
preprocessed.append(('filename', pattern_lower))
return preprocessed


def _should_ignore(rel_path: str, name: str, preprocessed_patterns: List[Tuple[str, str]]) -> bool:
"""Check if a file or directory matches any of the ignore patterns.

Directories and files can be ignored by providing an ignore pattern in the
.scannerignore file.

Args:
rel_path: The relative path of the file or directory from the scan root.
name: The name of the file or directory (basename).
preprocessed_patterns: A list of preprocessed ignore patterns.

Returns:
True if the file or directory should be ignored, False otherwise.
"""
if not preprocessed_patterns:
return False
name_lower = name.lower()
rel_path_norm = rel_path.replace(os.sep, '/').lower()

for p_type, p_val in preprocessed_patterns:
if p_type == 'anchored':
if fnmatch.fnmatchcase(rel_path_norm, p_val):
return True
elif p_type == 'subpath':
if fnmatch.fnmatchcase(rel_path_norm, p_val) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p_val}"):
return True
elif p_type == 'filename':
if fnmatch.fnmatchcase(name_lower, p_val):
return True
return False


def scan_repository(
root_path: str,
rules: List[Dict[str, Any]],
Expand Down Expand Up @@ -528,7 +586,6 @@ def scan_repository(
Returns:
A list of dictionaries detailing each match.
"""
ignore_lower = {i.lower() for i in ignore_dirs} if ignore_dirs else set()
results = []

filename_targets = []
Expand All @@ -552,18 +609,31 @@ def scan_repository(
print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr)
continue

# Preprocess ignore patterns once
preprocessed_ignores = _preprocess_ignore_patterns(ignore_dirs)

print(f"\nScanning repository: {root_path}")
if target_packages:
print(f"Filtering for packages: {target_packages}")

for root, dirs, files in os.walk(root_path):
rel_root = os.path.relpath(root, root_path)

# Helper to construct relative path for ignore matching
def get_rel_path(name):
return name if rel_root == "." else os.path.join(rel_root, name)

# Prune ignore directories (case-insensitive)
dirs[:] = [d for d in dirs if d.lower() not in ignore_lower]
dirs[:] = [
d for d in dirs
if not _should_ignore(get_rel_path(d), d, preprocessed_ignores)
]

# Filter ignore files (case-insensitive)
files = [f for f in files if f.lower() not in ignore_lower]

rel_root = os.path.relpath(root, root_path)
files = [
f for f in files
if not _should_ignore(get_rel_path(f), f, preprocessed_ignores)
]
Comment thread
chalmerlowe marked this conversation as resolved.

# Layout-agnostic generic subdirectory filtering
if target_packages:
Expand Down
Loading