diff --git a/README.md b/README.md index 85a20c22..386e1d84 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,21 @@ def split_video_into_scenes(video_path, threshold=27.0): See [the documentation](https://www.scenedetect.com/docs/latest/api.html) for more examples. +**Optional: Semantic Scene Labels**: + +PySceneDetect finds *where* the cuts are; if you also want a short description of *what* is in each scene, the optional `label_scenes` helper runs the detected scenes through the [TwelveLabs](https://twelvelabs.io) Pegasus video-understanding model. Install with `pip install scenedetect[twelvelabs]` and set `TWELVELABS_API_KEY`: + +```python +from scenedetect import detect, ContentDetector +from scenedetect.output import label_scenes + +scenes = detect("my_video.mp4", ContentDetector()) +for label in label_scenes(scenes, video_url="https://example.com/my_video.mp4"): + print(label.index, label.label) +``` + +This is entirely opt-in and never runs during normal detection. A free API key with a generous free tier is available at [twelvelabs.io](https://twelvelabs.io). + **Benchmark**: We evaluate the performance of different detectors in terms of accuracy and processing speed. See the [benchmark report](benchmark/README.md) for details. diff --git a/pyproject.toml b/pyproject.toml index 79ba94b2..9f050864 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,8 @@ dependencies = [ [project.optional-dependencies] pyav = ["av>=9.2"] moviepy = ["moviepy"] +# Optional semantic scene labelling via TwelveLabs Pegasus (see scenedetect.output.label_scenes). 
+twelvelabs = ["twelvelabs>=1.2.8"] dev = ["av>=9.2", "moviepy", "pytest>=7.0", "pytest-rerunfailures"] docs = ["Sphinx==7.0.1", "sphinx-copybutton==0.5.2"] website = ["mkdocs==1.5.2", "jinja2>=3.1.6"] diff --git a/scenedetect/__init__.py b/scenedetect/__init__.py index 395741e1..07390e65 100644 --- a/scenedetect/__init__.py +++ b/scenedetect/__init__.py @@ -51,9 +51,11 @@ is_mkvmerge_available, write_scene_list, write_scene_list_html, + label_scenes, PathFormatter, VideoMetadata, SceneMetadata, + SceneLabel, ) from scenedetect.detector import SceneDetector from scenedetect.detectors import ( diff --git a/scenedetect/output/__init__.py b/scenedetect/output/__init__.py index eef360af..453ddc72 100644 --- a/scenedetect/output/__init__.py +++ b/scenedetect/output/__init__.py @@ -40,6 +40,12 @@ # Commonly used classes/functions exported under the `scenedetect.output` namespace for brevity. from scenedetect.output.image import save_images +from scenedetect.output.labels import ( + DEFAULT_MODEL, + DEFAULT_PROMPT, + SceneLabel, + label_scenes, +) from scenedetect.output.video import ( PathFormatter, SceneMetadata, diff --git a/scenedetect/output/labels.py b/scenedetect/output/labels.py new file mode 100644 index 00000000..b7455c13 --- /dev/null +++ b/scenedetect/output/labels.py @@ -0,0 +1,217 @@ +# +# PySceneDetect: Python-Based Video Scene Detector +# ------------------------------------------------------------------- +# [ Site: https://scenedetect.com ] +# [ Docs: https://scenedetect.com/docs/ ] +# [ Github: https://github.com/Breakthrough/PySceneDetect/ ] +# +# Copyright (C) 2025 Brandon Castellano . +# PySceneDetect is licensed under the BSD 3-Clause License; see the +# included LICENSE file, or visit one of the above pages for details. +# + +"""The ``scenedetect.output.labels`` module adds *optional* semantic labels to the scenes found by +PySceneDetect. 
Pixel-based detectors locate the cuts; this helper sends each detected time range to +the `TwelveLabs <https://twelvelabs.io>`_ Pegasus video-understanding model and attaches a short +natural-language description to each scene. + +This integration is entirely opt-in: it is never invoked by detection and requires the optional +``twelvelabs`` dependency (``pip install scenedetect[twelvelabs]``) plus a TwelveLabs API key. A +free key with a generous free tier is available at https://twelvelabs.io. + +Per-scene labelling relies on Pegasus' ``start_time``/``end_time`` support, which currently requires +the ``pegasus1.5`` model with a ``video_url`` or an uploaded ``asset_id`` source. (``pegasus1.5`` +does *not* accept the ``video_id`` of an already-indexed video for per-scene analysis.) Use the same +video you ran detection on so the timecodes line up:: + + from scenedetect import detect, ContentDetector + from scenedetect.output import label_scenes + + scenes = detect("my_video.mp4", ContentDetector()) + labels = label_scenes(scenes, video_url="https://example.com/my_video.mp4") + for label in labels: + print(label.index, label.label) + +Pegasus 1.5 also requires each analysed window to span at least four seconds, so scenes shorter than +that are skipped (with a logged note) rather than aborting the whole pass; see +:data:`MIN_PEGASUS_SCENE_SECONDS`. +""" + +import logging +import os +import typing as ty +from dataclasses import dataclass + +from scenedetect.common import SceneList + +logger = logging.getLogger("pyscenedetect") + +DEFAULT_MODEL: str = "pegasus1.5" +"""Default TwelveLabs Pegasus model used for scene labelling.""" + +DEFAULT_PROMPT: str = ( + "Describe what happens in this part of the video in a single concise sentence." +) +"""Default prompt sent to Pegasus for each scene.""" + +MIN_PEGASUS_SCENE_SECONDS: float = 4.0 +"""Minimum scene duration Pegasus 1.5 will analyse.
The API rejects any window where +``end_time - start_time`` is below this with a 400 error, so shorter scenes are skipped.""" + + +@dataclass +class SceneLabel: + """A semantic label generated for a single detected scene. The list returned by + :func:`label_scenes` keeps input order but may omit scenes; map back via :attr:`index`.""" + + index: int + """0-based index of the scene this label describes.""" + start_time: float + """Scene start, in seconds from the beginning of the video.""" + end_time: float + """Scene end, in seconds from the beginning of the video.""" + label: str + """Natural-language description of the scene returned by Pegasus.""" + + +def label_scenes( + scene_list: SceneList, + asset_id: str | None = None, + *, + video_url: str | None = None, + video_id: str | None = None, + model_name: str = DEFAULT_MODEL, + prompt: str = DEFAULT_PROMPT, + max_tokens: int = 512, + api_key: str | None = None, + client: ty.Any | None = None, +) -> list[SceneLabel]: + """Generate a semantic label for each scene using the TwelveLabs Pegasus model. + + Each scene's start/end timecode is forwarded to Pegasus so the description covers only that + portion of the video. This is opt-in and does not affect detection in any way. + + Scenes shorter than :data:`MIN_PEGASUS_SCENE_SECONDS` are skipped with a logged note (Pegasus + 1.5 rejects windows under four seconds), and a per-scene API error is logged and skipped rather + than aborting the whole pass. Either case simply omits that scene from the result; use + :attr:`SceneLabel.index` to map a label back to its scene. + + Arguments: + scene_list: Scenes to label, as returned by + :meth:`SceneManager.get_scene_list() + <scenedetect.scene_manager.SceneManager.get_scene_list>` + or :func:`detect() <scenedetect.detect>`. + asset_id: ID of a video *asset* previously uploaded to TwelveLabs. One of `asset_id` or + `video_url` is required; they are mutually exclusive. + video_url: Public URL of the video. Mutually exclusive with `asset_id`. + video_id: ID of an already-*indexed* video.
Accepted for completeness but **not supported + by** ``pegasus1.5`` for per-scene analysis; passing it raises a clear ``ValueError`` + instead of letting the API return a confusing 400. Use `asset_id` or `video_url`. + model_name: TwelveLabs Pegasus model to use (default: ``"pegasus1.5"``). + prompt: Instruction sent to Pegasus for each scene. + max_tokens: Maximum number of tokens Pegasus may generate per scene. Note that + ``pegasus1.5`` requires this to be at least 512. + api_key: TwelveLabs API key. Defaults to the ``TWELVELABS_API_KEY`` environment variable. + Ignored when `client` is provided. + client: Pre-configured ``twelvelabs.TwelveLabs`` client. If omitted, one is created from + `api_key`. + + Returns: + A list of :class:`SceneLabel`, one per *successfully labelled* scene, in input order. + Scenes that are too short or that error out are omitted; each remaining label carries the + original scene's :attr:`~SceneLabel.index`. + + Raises: + ImportError: If the optional ``twelvelabs`` package is not installed. + ValueError: If not exactly one of `asset_id`/`video_url` is given, if `video_id` is passed + (unsupported by ``pegasus1.5``), or if no API key is available when constructing a + client. + """ + if video_id is not None: + raise ValueError( + "'video_id' (an already-indexed video) is not supported by pegasus1.5 for per-scene " + "analysis. Pass 'video_url=' (a public URL) or 'asset_id=' (an uploaded asset) instead." + ) + if (asset_id is None) == (video_url is None): + raise ValueError("Exactly one of 'asset_id' or 'video_url' must be provided.") + + if client is None: + client = _create_client(api_key) + + # Imported lazily so the module stays importable without the optional dependency. 
+ from twelvelabs.errors.bad_request_error import BadRequestError + + if video_url is not None: + from twelvelabs.types.video_context import VideoContext_Url + + video_context: ty.Any = VideoContext_Url(url=video_url) + else: + from twelvelabs.types.video_context import VideoContext_AssetId + + video_context = VideoContext_AssetId(asset_id=asset_id) + + labels: list[SceneLabel] = [] + for index, (start, end) in enumerate(scene_list): + start_seconds = start.seconds + end_seconds = end.seconds + if end_seconds - start_seconds < MIN_PEGASUS_SCENE_SECONDS: + logger.warning( + "Skipping scene %d [%.3fs-%.3fs]: under %.1fs, below the pegasus1.5 minimum.", + index, + start_seconds, + end_seconds, + MIN_PEGASUS_SCENE_SECONDS, + ) + continue + try: + response = client.analyze( + model_name=model_name, + video=video_context, + prompt=prompt, + max_tokens=max_tokens, + start_time=start_seconds, + end_time=end_seconds, + ) + except BadRequestError as ex: + # A single rejected scene must not abort the batch (e.g. the 4s window check we already + # guard above, but the API may reject other windows too). Auth/quota/server errors are + # deliberately *not* caught here so they still fail fast. + logger.warning( + "Skipping scene %d [%.3fs-%.3fs]: TwelveLabs rejected the request (%s).", + index, + start_seconds, + end_seconds, + ex, + ) + continue + labels.append( + SceneLabel( + index=index, + start_time=start_seconds, + end_time=end_seconds, + label=response.data.strip() if response.data else "", + ) + ) + logger.debug("Labelled scene %d [%.3fs-%.3fs]", index, start_seconds, end_seconds) + return labels + + +def _create_client(api_key: str | None) -> ty.Any: + """Create a ``twelvelabs.TwelveLabs`` client, surfacing a friendly error if the optional + dependency is missing or no API key is available.""" + try: + from twelvelabs import TwelveLabs + except ImportError as ex: + raise ImportError( + "The 'twelvelabs' package is required for scene labelling. 
Install it with:\n\n" + " pip install scenedetect[twelvelabs]\n\n" + "Get a free API key at https://twelvelabs.io." + ) from ex + + key = api_key if api_key is not None else os.environ.get("TWELVELABS_API_KEY") + if not key: + raise ValueError( + "No TwelveLabs API key provided. Pass 'api_key=' or set the TWELVELABS_API_KEY " + "environment variable. Get a free key at https://twelvelabs.io." + ) + return TwelveLabs(api_key=key) diff --git a/tests/test_labels.py b/tests/test_labels.py new file mode 100644 index 00000000..66851474 --- /dev/null +++ b/tests/test_labels.py @@ -0,0 +1,127 @@ +# +# PySceneDetect: Python-Based Video Scene Detector +# ------------------------------------------------------------------- +# [ Site: https://scenedetect.com ] +# [ Docs: https://scenedetect.com/docs/ ] +# [ Github: https://github.com/Breakthrough/PySceneDetect/ ] +# +# Copyright (C) 2025 Brandon Castellano . +# PySceneDetect is licensed under the BSD 3-Clause License; see the +# included LICENSE file, or visit one of the above pages for details. +# +"""Tests for scenedetect.output.labels (optional TwelveLabs scene labelling).""" + +import os +from fractions import Fraction + +import pytest + +from scenedetect import FrameTimecode +from scenedetect.output import SceneLabel, label_scenes +from scenedetect.output.labels import MIN_PEGASUS_SCENE_SECONDS + +FPS = Fraction(30) +# Two scenes, each well over the pegasus1.5 4s minimum (5s then 10s). +SCENE_LIST = [ + (FrameTimecode(0, FPS), FrameTimecode(150, FPS)), + (FrameTimecode(150, FPS), FrameTimecode(450, FPS)), +] + + +class _FakeAnalyzeResponse: + def __init__(self, data): + self.data = data + + +class _FakeClient: + """Records each analyze() call so wiring can be asserted without hitting the network.""" + + def __init__(self, error_on_call=None): + self.calls = [] + # If set, analyze() raises this exception on the matching (0-based) call index. 
+ self._error_on_call = error_on_call + + def analyze(self, **kwargs): + if self._error_on_call is not None and len(self.calls) == self._error_on_call: + self.calls.append(kwargs) + from twelvelabs.errors.bad_request_error import BadRequestError + + raise BadRequestError(body={"code": "parameter_invalid", "message": "boom"}) + self.calls.append(kwargs) + return _FakeAnalyzeResponse(f" scene at {kwargs['start_time']}s ") + + +def test_label_scenes_wires_per_scene_timecodes(): + client = _FakeClient() + labels = label_scenes(SCENE_LIST, video_url="https://example.com/v.mp4", client=client) + + assert [type(label) for label in labels] == [SceneLabel, SceneLabel] + # One Pegasus call per scene, with that scene's start/end in seconds. + assert [(c["start_time"], c["end_time"]) for c in client.calls] == [(0.0, 5.0), (5.0, 15.0)] + # The source is forwarded as a VideoContext, not a raw video_id (unsupported on pegasus1.5). + assert all("video_id" not in c for c in client.calls) + assert all(c["video"].url == "https://example.com/v.mp4" for c in client.calls) + # Response text is stripped and indices run parallel to the input scene list. + assert labels[0].index == 0 and labels[0].label == "scene at 0.0s" + assert labels[1].start_time == 5.0 and labels[1].end_time == 15.0 + + +def test_label_scenes_accepts_asset_id(): + client = _FakeClient() + label_scenes(SCENE_LIST, asset_id="asset-1", client=client) + # An asset_id is sent as a VideoContext_AssetId, which pegasus1.5 supports. + assert all(c["video"].asset_id == "asset-1" for c in client.calls) + + +def test_label_scenes_requires_exactly_one_source(): + client = _FakeClient() + with pytest.raises(ValueError): + label_scenes(SCENE_LIST, client=client) + with pytest.raises(ValueError): + label_scenes(SCENE_LIST, asset_id="a", video_url="http://x", client=client) + + +def test_label_scenes_rejects_video_id(): + # video_id is unsupported by pegasus1.5; we fail fast with a clear error instead of a raw 400. 
+ client = _FakeClient() + with pytest.raises(ValueError, match="video_id"): + label_scenes(SCENE_LIST, video_id="vid123", client=client) + + +def test_label_scenes_skips_short_scene_without_aborting(): + # A sub-4s scene sits between two valid ones; it must be skipped, not fatal. + assert MIN_PEGASUS_SCENE_SECONDS == 4.0 + scene_list = [ + (FrameTimecode(0, FPS), FrameTimecode(150, FPS)), # 5s, kept + (FrameTimecode(150, FPS), FrameTimecode(269, FPS)), # 119 frames ≈ 3.97s, skipped + (FrameTimecode(450, FPS), FrameTimecode(600, FPS)), # 5s, kept + ] + client = _FakeClient() + labels = label_scenes(scene_list, video_url="https://example.com/v.mp4", client=client) + + # The short scene never reached analyze(), and the run continued past it. + assert len(client.calls) == 2 + assert [label.index for label in labels] == [0, 2] + + +def test_label_scenes_skips_per_scene_api_error(): + # A BadRequestError on the first scene is logged and skipped; the batch keeps going. + client = _FakeClient(error_on_call=0) + labels = label_scenes(SCENE_LIST, video_url="https://example.com/v.mp4", client=client) + + assert len(client.calls) == 2 # both attempted + assert [label.index for label in labels] == [1] # only the second succeeded + + +@pytest.mark.skipif( + not os.environ.get("TWELVELABS_API_KEY"), + reason="requires TWELVELABS_API_KEY and a reachable video", +) +def test_label_scenes_integration(): + # Opt-in: needs a real key and a public video URL via TWELVELABS_TEST_VIDEO_URL. + video_url = os.environ.get("TWELVELABS_TEST_VIDEO_URL") + if not video_url: + pytest.skip("set TWELVELABS_TEST_VIDEO_URL to a public video to run this test") + labels = label_scenes(SCENE_LIST[:1], video_url=video_url) + assert len(labels) == 1 + assert isinstance(labels[0].label, str) and labels[0].label