From 0f2b97ae5f10a590adc21df03e608e1d1fa53b81 Mon Sep 17 00:00:00 2001
From: Mohit Varikuti
Date: Thu, 25 Jun 2026 13:38:57 -0700
Subject: [PATCH] [output] Add optional TwelveLabs scene labelling

Adds scenedetect.output.label_scenes, an opt-in helper that attaches a
short natural-language description to each detected scene using the
TwelveLabs Pegasus video-understanding model. Pixel-based detectors locate
the cuts; this forwards each scene's start/end timecode to Pegasus so the
description covers only that portion of the video. The integration is
gated behind the optional 'twelvelabs' extra and is never invoked during
normal detection.
---
 README.md                      |  15 +++
 pyproject.toml                 |   2 +
 scenedetect/__init__.py        |   2 +
 scenedetect/output/__init__.py |   6 ++
 scenedetect/output/labels.py   | 173 +++++++++++++++++++++++++++++++++
 tests/test_labels.py           |  77 +++++++++++++++
 6 files changed, 275 insertions(+)
 create mode 100644 scenedetect/output/labels.py
 create mode 100644 tests/test_labels.py

diff --git a/README.md b/README.md
index 85a20c22..386e1d84 100644
--- a/README.md
+++ b/README.md
@@ -100,6 +100,21 @@ def split_video_into_scenes(video_path, threshold=27.0):
 
 See [the documentation](https://www.scenedetect.com/docs/latest/api.html) for more examples.
 
+**Optional: Semantic Scene Labels**:
+
+PySceneDetect finds *where* the cuts are; if you also want a short description of *what* is in each scene, the optional `label_scenes` helper runs the detected scenes through the [TwelveLabs](https://twelvelabs.io) Pegasus video-understanding model. Install with `pip install scenedetect[twelvelabs]` and set `TWELVELABS_API_KEY`:
+
+```python
+from scenedetect import detect, ContentDetector
+from scenedetect.output import label_scenes
+
+scenes = detect("my_video.mp4", ContentDetector())
+for label in label_scenes(scenes, video_url="https://example.com/my_video.mp4"):
+    print(label.index, label.label)
+```
+
+This is entirely opt-in and never runs during normal detection. A free API key with a generous free tier is available at [twelvelabs.io](https://twelvelabs.io).
+
 **Benchmark**:
 
 We evaluate the performance of different detectors in terms of accuracy and processing speed. See the [benchmark report](benchmark/README.md) for details.
diff --git a/pyproject.toml b/pyproject.toml
index 79ba94b2..9f050864 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,8 @@ dependencies = [
 [project.optional-dependencies]
 pyav = ["av>=9.2"]
 moviepy = ["moviepy"]
+# Optional semantic scene labelling via TwelveLabs Pegasus (see scenedetect.output.label_scenes).
+twelvelabs = ["twelvelabs>=1.2.8"]
 dev = ["av>=9.2", "moviepy", "pytest>=7.0", "pytest-rerunfailures"]
 docs = ["Sphinx==7.0.1", "sphinx-copybutton==0.5.2"]
 website = ["mkdocs==1.5.2", "jinja2>=3.1.6"]
diff --git a/scenedetect/__init__.py b/scenedetect/__init__.py
index 395741e1..07390e65 100644
--- a/scenedetect/__init__.py
+++ b/scenedetect/__init__.py
@@ -51,9 +51,11 @@
     is_mkvmerge_available,
     write_scene_list,
     write_scene_list_html,
+    label_scenes,
     PathFormatter,
     VideoMetadata,
     SceneMetadata,
+    SceneLabel,
 )
 from scenedetect.detector import SceneDetector
 from scenedetect.detectors import (
diff --git a/scenedetect/output/__init__.py b/scenedetect/output/__init__.py
index eef360af..453ddc72 100644
--- a/scenedetect/output/__init__.py
+++ b/scenedetect/output/__init__.py
@@ -40,6 +40,12 @@
 
 # Commonly used classes/functions exported under the `scenedetect.output` namespace for brevity.
 from scenedetect.output.image import save_images
+from scenedetect.output.labels import (
+    DEFAULT_MODEL,
+    DEFAULT_PROMPT,
+    SceneLabel,
+    label_scenes,
+)
 from scenedetect.output.video import (
     PathFormatter,
     SceneMetadata,
diff --git a/scenedetect/output/labels.py b/scenedetect/output/labels.py
new file mode 100644
index 00000000..acf0ca6a
--- /dev/null
+++ b/scenedetect/output/labels.py
@@ -0,0 +1,173 @@
+#
+#            PySceneDetect: Python-Based Video Scene Detector
+#   -------------------------------------------------------------------
+#     [  Site:    https://scenedetect.com                           ]
+#     [  Docs:    https://scenedetect.com/docs/                     ]
+#     [  Github:  https://github.com/Breakthrough/PySceneDetect/    ]
+#
+# Copyright (C) 2025 Brandon Castellano <http://www.bcastell.com>.
+# PySceneDetect is licensed under the BSD 3-Clause License; see the
+# included LICENSE file, or visit one of the above pages for details.
+#
+
+"""The ``scenedetect.output.labels`` module adds *optional* semantic labels to the scenes found by
+PySceneDetect. Pixel-based detectors locate the cuts; this helper sends each detected time range to
+the `TwelveLabs <https://twelvelabs.io>`_ Pegasus video-understanding model and attaches a short
+natural-language description to each scene.
+
+This integration is entirely opt-in: it is never invoked by detection and requires the optional
+``twelvelabs`` dependency (``pip install scenedetect[twelvelabs]``) plus a TwelveLabs API key. A
+free key with a generous free tier is available at https://twelvelabs.io.
+
+Per-scene labelling relies on Pegasus' ``start_time``/``end_time`` support, which currently requires
+the ``pegasus1.5`` model with a ``video_url`` (or asset) source rather than a ``video_id``. Use the
+same video you ran detection on so the timecodes line up::
+
+    from scenedetect import detect, ContentDetector
+    from scenedetect.output import label_scenes
+
+    scenes = detect("my_video.mp4", ContentDetector())
+    labels = label_scenes(scenes, video_url="https://example.com/my_video.mp4")
+    for label in labels:
+        print(label.index, label.label)
+
+A ``video_id`` (already indexed) source is also accepted for models that support it.
+"""
+
+import logging
+import os
+import typing as ty
+from dataclasses import dataclass
+
+from scenedetect.common import SceneList
+
+logger = logging.getLogger("pyscenedetect")
+
+DEFAULT_MODEL: str = "pegasus1.5"
+"""Default TwelveLabs Pegasus model used for scene labelling."""
+
+DEFAULT_PROMPT: str = (
+    "Describe what happens in this part of the video in a single concise sentence."
+)
+"""Default prompt sent to Pegasus for each scene."""
+
+_MISSING_DEPENDENCY_MESSAGE: str = (
+    "The 'twelvelabs' package is required for scene labelling. Install it with:\n\n"
+    "    pip install scenedetect[twelvelabs]\n\n"
+    "Get a free API key at https://twelvelabs.io."
+)
+"""Error text raised whenever the optional ``twelvelabs`` package cannot be imported."""
+
+
+@dataclass
+class SceneLabel:
+    """A semantic label generated for a single detected scene. The list returned by
+    :func:`label_scenes` runs parallel to (and in the same order as) the input scene list."""
+
+    index: int
+    """0-based index of the scene this label describes."""
+    start_time: float
+    """Scene start, in seconds from the beginning of the video."""
+    end_time: float
+    """Scene end, in seconds from the beginning of the video."""
+    label: str
+    """Natural-language description of the scene returned by Pegasus."""
+
+
+def label_scenes(
+    scene_list: SceneList,
+    video_id: str | None = None,
+    *,
+    video_url: str | None = None,
+    model_name: str = DEFAULT_MODEL,
+    prompt: str = DEFAULT_PROMPT,
+    max_tokens: int = 512,
+    api_key: str | None = None,
+    client: ty.Any | None = None,
+) -> list[SceneLabel]:
+    """Generate a semantic label for each scene using the TwelveLabs Pegasus model.
+
+    Each scene's start/end timecode is forwarded to Pegasus so the description covers only that
+    portion of the video. This is opt-in and does not affect detection in any way.
+
+    Arguments:
+        scene_list: Scenes to label, as returned by
+            :meth:`SceneManager.get_scene_list()
+            <scenedetect.scene_manager.SceneManager.get_scene_list>`
+            or :func:`detect() <scenedetect.detect>`.
+        video_id: ID of the video as already uploaded/indexed with TwelveLabs. Mutually exclusive
+            with `video_url`; one of the two is required.
+        video_url: Public URL of the video. Mutually exclusive with `video_id`.
+        model_name: TwelveLabs Pegasus model to use (default: ``"pegasus1.5"``).
+        prompt: Instruction sent to Pegasus for each scene.
+        max_tokens: Maximum number of tokens Pegasus may generate per scene. Note that
+            ``pegasus1.5`` requires this to be at least 512.
+        api_key: TwelveLabs API key. Defaults to the ``TWELVELABS_API_KEY`` environment variable.
+            Ignored when `client` is provided.
+        client: Pre-configured ``twelvelabs.TwelveLabs`` client. If omitted, one is created from
+            `api_key`.
+
+    Returns:
+        A list of :class:`SceneLabel`, one per input scene, in the same order as `scene_list`.
+
+    Raises:
+        ImportError: If the optional ``twelvelabs`` package is not installed.
+        ValueError: If neither or both of `video_id`/`video_url` are given, or if no API key is
+            available when constructing a client.
+    """
+    if (video_id is None) == (video_url is None):
+        raise ValueError("Exactly one of 'video_id' or 'video_url' must be provided.")
+
+    if client is None:
+        client = _create_client(api_key)
+
+    video_context = None
+    if video_url is not None:
+        # A caller-supplied `client` skips _create_client(), so guard this import as well.
+        try:
+            from twelvelabs.types.video_context import VideoContext_Url
+        except ImportError as ex:
+            raise ImportError(_MISSING_DEPENDENCY_MESSAGE) from ex
+
+        video_context = VideoContext_Url(url=video_url)
+
+    labels: list[SceneLabel] = []
+    for index, (start, end) in enumerate(scene_list):
+        start_seconds = start.seconds
+        end_seconds = end.seconds
+        response = client.analyze(
+            model_name=model_name,
+            video_id=video_id,
+            video=video_context,
+            prompt=prompt,
+            max_tokens=max_tokens,
+            start_time=start_seconds,
+            end_time=end_seconds,
+        )
+        labels.append(
+            SceneLabel(
+                index=index,
+                start_time=start_seconds,
+                end_time=end_seconds,
+                label=response.data.strip() if response.data else "",
+            )
+        )
+        logger.debug("Labelled scene %d [%.3fs-%.3fs]", index, start_seconds, end_seconds)
+    return labels
+
+
+def _create_client(api_key: str | None) -> ty.Any:
+    """Create a ``twelvelabs.TwelveLabs`` client, surfacing a friendly error if the optional
+    dependency is missing or no API key is available."""
+    try:
+        from twelvelabs import TwelveLabs
+    except ImportError as ex:
+        raise ImportError(_MISSING_DEPENDENCY_MESSAGE) from ex
+
+    key = api_key if api_key is not None else os.environ.get("TWELVELABS_API_KEY")
+    if not key:
+        raise ValueError(
+            "No TwelveLabs API key provided. Pass 'api_key=' or set the TWELVELABS_API_KEY "
+            "environment variable. Get a free key at https://twelvelabs.io."
+        )
+    return TwelveLabs(api_key=key)
diff --git a/tests/test_labels.py b/tests/test_labels.py
new file mode 100644
index 00000000..23d0abd7
--- /dev/null
+++ b/tests/test_labels.py
@@ -0,0 +1,77 @@
+#
+#            PySceneDetect: Python-Based Video Scene Detector
+#   -------------------------------------------------------------------
+#     [  Site:    https://scenedetect.com                           ]
+#     [  Docs:    https://scenedetect.com/docs/                     ]
+#     [  Github:  https://github.com/Breakthrough/PySceneDetect/    ]
+#
+# Copyright (C) 2025 Brandon Castellano <http://www.bcastell.com>.
+# PySceneDetect is licensed under the BSD 3-Clause License; see the
+# included LICENSE file, or visit one of the above pages for details.
+#
+"""Tests for scenedetect.output.labels (optional TwelveLabs scene labelling)."""
+
+import os
+from fractions import Fraction
+
+import pytest
+
+from scenedetect import FrameTimecode
+from scenedetect.output import SceneLabel, label_scenes
+
+FPS = Fraction(30)
+SCENE_LIST = [
+    (FrameTimecode(0, FPS), FrameTimecode(30, FPS)),
+    (FrameTimecode(30, FPS), FrameTimecode(90, FPS)),
+]
+
+
+class _FakeAnalyzeResponse:
+    def __init__(self, data):
+        self.data = data
+
+
+class _FakeClient:
+    """Records each analyze() call so wiring can be asserted without hitting the network."""
+
+    def __init__(self):
+        self.calls = []
+
+    def analyze(self, **kwargs):
+        self.calls.append(kwargs)
+        return _FakeAnalyzeResponse(f"  scene at {kwargs['start_time']}s  ")
+
+
+def test_label_scenes_wires_per_scene_timecodes():
+    client = _FakeClient()
+    labels = label_scenes(SCENE_LIST, video_id="vid123", client=client)
+
+    assert [type(label) for label in labels] == [SceneLabel, SceneLabel]
+    # One Pegasus call per scene, with that scene's start/end in seconds.
+    assert [(c["start_time"], c["end_time"]) for c in client.calls] == [(0.0, 1.0), (1.0, 3.0)]
+    assert all(c["video_id"] == "vid123" for c in client.calls)
+    # Response text is stripped and indices run parallel to the input scene list.
+    assert labels[0].index == 0 and labels[0].label == "scene at 0.0s"
+    assert labels[1].start_time == 1.0 and labels[1].end_time == 3.0
+
+
+def test_label_scenes_requires_exactly_one_source():
+    client = _FakeClient()
+    with pytest.raises(ValueError):
+        label_scenes(SCENE_LIST, client=client)
+    with pytest.raises(ValueError):
+        label_scenes(SCENE_LIST, video_id="a", video_url="http://x", client=client)
+
+
+@pytest.mark.skipif(
+    not os.environ.get("TWELVELABS_API_KEY"),
+    reason="requires TWELVELABS_API_KEY and a reachable video",
+)
+def test_label_scenes_integration():
+    # Opt-in: needs a real key and a public video URL via TWELVELABS_TEST_VIDEO_URL.
+    video_url = os.environ.get("TWELVELABS_TEST_VIDEO_URL")
+    if not video_url:
+        pytest.skip("set TWELVELABS_TEST_VIDEO_URL to a public video to run this test")
+    labels = label_scenes(SCENE_LIST[:1], video_url=video_url)
+    assert len(labels) == 1
+    assert isinstance(labels[0].label, str) and labels[0].label