diff --git a/scripts/version_scanner/.scannerignore b/scripts/version_scanner/.scannerignore index e200930894d0..aa7406addb41 100644 --- a/scripts/version_scanner/.scannerignore +++ b/scripts/version_scanner/.scannerignore @@ -20,3 +20,14 @@ repositories.bzl *.png *.gif *.ico +*.pdf + +# Ignore caches and temporary directories +.ruff_cache +.pytest_cache +.mypy_cache +.coverage +.htmlcov + +# Ignore data files +*.csv diff --git a/scripts/version_scanner/README.md b/scripts/version_scanner/README.md index 4868087de1ec..f7fe06b17d71 100644 --- a/scripts/version_scanner/README.md +++ b/scripts/version_scanner/README.md @@ -45,12 +45,32 @@ pip install -r scripts/version_scanner/requirements.txt The scanner uses a YAML configuration file (`regex_pattern_config.yaml`) to define rules and regex patterns. -## Ignoring Directories +## Matrix File Format -You can create a `.scannerignore` file in the directory you are scanning (usually the repo root) to list directories to skip, one per line. +When using `--matrix-file`, you must provide a YAML file specifying dependencies and versions. -## Known Issues & Future Investigations -- **Binary Ignores in `.scannerignore`**: Recursive wildcard ignores (e.g., `*.jpg`) currently do not effectively ignore deeply nested binary files. The scanner logic should be investigated to support robust globbing or full-path suffix matching. +### Example +```yaml +python: + - "3.10" + - "3.11" +protobuf: "4.25.8" +``` + +> [!IMPORTANT] +> **Versions must be specified as quoted strings** (e.g., `"3.10"`, not `3.10`). This prevents YAML parsers from converting them to floats (which would truncate `3.10` to `3.1`). + +## Ignoring Directories and Files + +In order to ignore files OR entire directories, you can add ignore patterns to the `.scannerignore` file located in the same directory as the script (`scripts/version_scanner/.scannerignore`). Ignore patterns should be added one per line. + +### Features +- **Case-insensitive**: All patterns are matched case-insensitively. +- **Globbing**: Supports standard shell globbing patterns (e.g., `*.jpg`, `test_*`). +- **Subpaths**: You can specify subpaths (e.g., `packages/pkg_a/.nox`). +- **Root Anchoring**: Patterns starting with a slash `/` are anchored to the root of the scan (e.g., `/packages` ignores the `packages` directory at root, but not `some/other/packages`). + +--- --- diff --git a/scripts/version_scanner/tests/unit/test_version_scanner.py b/scripts/version_scanner/tests/unit/test_version_scanner.py index 223214dbd3ee..531ac7559e27 100644 --- a/scripts/version_scanner/tests/unit/test_version_scanner.py +++ b/scripts/version_scanner/tests/unit/test_version_scanner.py @@ -366,6 +366,50 @@ def test_scan_repository_ignores_version_scanner(tmp_path): assert len(results) == 0 +def test_scan_repository_wildcard_ignores(tmp_path): + # Create files + (tmp_path / "test.jpg").write_text("dummy version 3.7\n") + (tmp_path / "test.py").write_text("python_requires = '>=3.7'\n") + + rules = [ + {"name": "python_requires_check", "pattern": "python_requires\\s*=\\s*['\"]>=3\\.7['\"]"}, + {"name": "explicit_version_string", "pattern": "3\\.7"} + ] + + from version_scanner import scan_repository + # Without ignore + results = scan_repository(str(tmp_path), rules) + assert len(results) >= 2 + + # With wildcard ignore for *.jpg + results_ignored = scan_repository(str(tmp_path), rules, ignore_dirs=["*.jpg"]) + # test.jpg should be ignored completely + for match in results_ignored: + assert not match["file_path"].endswith("test.jpg") + + +DEFAULT_IGNORE_PATTERNS = [".git", "*.jpg", "packages/pkg_a/.nox", "*.egg-info"] + +@pytest.mark.parametrize( + "rel_path, name, ignore_patterns, expected", + [ + pytest.param(".git", ".git", DEFAULT_IGNORE_PATTERNS, True, id="exact_match"), + pytest.param(".GIT", ".GIT", DEFAULT_IGNORE_PATTERNS, True, id="case_insensitive_match"), + pytest.param("some/path/image.jpg", "image.jpg", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_subpath_match"), + pytest.param("image.JPG", "image.JPG", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_case_insensitive_match"), + pytest.param("packages/pkg_a/.nox", ".nox", DEFAULT_IGNORE_PATTERNS, True, id="subpath_exact_match"), + pytest.param("google_cloud_pubsub.egg-info", "google_cloud_pubsub.egg-info", DEFAULT_IGNORE_PATTERNS, True, id="wildcard_directory_match"), + pytest.param("setup.py", "setup.py", DEFAULT_IGNORE_PATTERNS, False, id="no_match"), + pytest.param("packages", "packages", ["/packages"], True, id="anchored_root_match"), + pytest.param("some/other/packages", "packages", ["/packages"], False, id="anchored_root_nested_no_match"), + ] +) +def test__should_ignore(rel_path, name, ignore_patterns, expected): + from version_scanner import _should_ignore, _preprocess_ignore_patterns + preprocessed = _preprocess_ignore_patterns(ignore_patterns) + assert _should_ignore(rel_path, name, preprocessed) is expected + + def test_load_ignore_file(tmp_path): from version_scanner import load_ignore_file diff --git a/scripts/version_scanner/version_scanner.py b/scripts/version_scanner/version_scanner.py index a20bad716dd8..9fb0cd068494 100644 --- a/scripts/version_scanner/version_scanner.py +++ b/scripts/version_scanner/version_scanner.py @@ -20,6 +20,7 @@ import argparse import csv import datetime +import fnmatch import os import re import sys @@ -498,6 +499,63 @@ def read_package_file(file_path: str) -> List[str]: return packages +def _preprocess_ignore_patterns(ignore_patterns: List[str]) -> List[Tuple[str, str]]: + """Preprocesses ignore patterns into a classified list for faster matching. + + Args: + ignore_patterns: A list of raw ignore patterns from .scannerignore. + + Returns: + A list of tuples (type, pattern) where type is 'anchored', 'subpath', or 'filename'. + """ + if not ignore_patterns: + return [] + + preprocessed = [] + for pattern in ignore_patterns: + pattern_lower = pattern.lower() + if '/' in pattern: + if pattern_lower.startswith('/'): + preprocessed.append(('anchored', pattern_lower[1:])) + else: + preprocessed.append(('subpath', pattern_lower)) + else: + preprocessed.append(('filename', pattern_lower)) + return preprocessed + + +def _should_ignore(rel_path: str, name: str, preprocessed_patterns: List[Tuple[str, str]]) -> bool: + """Check if a file or directory matches any of the ignore patterns. + + Directories and files can be ignored by providing an ignore pattern in the + .scannerignore file. + + Args: + rel_path: The relative path of the file or directory from the scan root. + name: The name of the file or directory (basename). + preprocessed_patterns: A list of preprocessed ignore patterns. + + Returns: + True if the file or directory should be ignored, False otherwise. + """ + if not preprocessed_patterns: + return False + name_lower = name.lower() + rel_path_norm = rel_path.replace(os.sep, '/').lower() + + for p_type, p_val in preprocessed_patterns: + if p_type == 'anchored': + if fnmatch.fnmatchcase(rel_path_norm, p_val): + return True + elif p_type == 'subpath': + if fnmatch.fnmatchcase(rel_path_norm, p_val) or fnmatch.fnmatchcase(rel_path_norm, f"*/{p_val}"): + return True + elif p_type == 'filename': + if fnmatch.fnmatchcase(name_lower, p_val): + return True + return False + + def scan_repository( root_path: str, rules: List[Dict[str, Any]], @@ -528,7 +586,6 @@ def scan_repository( Returns: A list of dictionaries detailing each match. """ - ignore_lower = {i.lower() for i in ignore_dirs} if ignore_dirs else set() results = [] filename_targets = [] @@ -552,18 +609,31 @@ def scan_repository( print(f"Error compiling regex for rule {rule['name']}: {e}", file=sys.stderr) continue + # Preprocess ignore patterns once + preprocessed_ignores = _preprocess_ignore_patterns(ignore_dirs) + print(f"\nScanning repository: {root_path}") if target_packages: print(f"Filtering for packages: {target_packages}") for root, dirs, files in os.walk(root_path): + rel_root = os.path.relpath(root, root_path) + + # Helper to construct relative path for ignore matching + def get_rel_path(name): + return name if rel_root == "." else os.path.join(rel_root, name) + # Prune ignore directories (case-insensitive) - dirs[:] = [d for d in dirs if d.lower() not in ignore_lower] + dirs[:] = [ + d for d in dirs + if not _should_ignore(get_rel_path(d), d, preprocessed_ignores) + ] # Filter ignore files (case-insensitive) - files = [f for f in files if f.lower() not in ignore_lower] - - rel_root = os.path.relpath(root, root_path) + files = [ + f for f in files + if not _should_ignore(get_rel_path(f), f, preprocessed_ignores) + ] # Layout-agnostic generic subdirectory filtering if target_packages: