From 0f2b97ae5f10a590adc21df03e608e1d1fa53b81 Mon Sep 17 00:00:00 2001
From: Mohit Varikuti <mohit.varikuti@twelvelabs.io>
Date: Thu, 25 Jun 2026 13:38:57 -0700
Subject: [PATCH] [output] Add optional TwelveLabs scene labelling

Adds scenedetect.output.label_scenes, an opt-in helper that attaches a
short natural-language description to each detected scene using the
TwelveLabs Pegasus video-understanding model. Pixel-based detectors
locate the cuts; the helper forwards each scene's start/end timecode to
Pegasus so the description covers only that portion of the video.

The integration is gated behind the optional 'twelvelabs' extra and is
never invoked during normal detection.
---
 README.md                      |  15 +++
 pyproject.toml                 |   2 +
 scenedetect/__init__.py        |   2 +
 scenedetect/output/__init__.py |   6 ++
 scenedetect/output/labels.py   | 166 +++++++++++++++++++++++++++++++++
 tests/test_labels.py           |  77 +++++++++++++++
 6 files changed, 268 insertions(+)
 create mode 100644 scenedetect/output/labels.py
 create mode 100644 tests/test_labels.py

diff --git a/README.md b/README.md
index 85a20c22..386e1d84 100644
--- a/README.md
+++ b/README.md
@@ -100,6 +100,21 @@ def split_video_into_scenes(video_path, threshold=27.0):
 
 See [the documentation](https://www.scenedetect.com/docs/latest/api.html) for more examples.
 
+**Optional: Semantic Scene Labels**:
+
+PySceneDetect finds *where* the cuts are; if you also want a short description of *what* is in each scene, the optional `label_scenes` helper runs the detected scenes through the [TwelveLabs](https://twelvelabs.io) Pegasus video-understanding model. Install it with `pip install scenedetect[twelvelabs]` and set the `TWELVELABS_API_KEY` environment variable. `video_url` must be a public URL of the same video you ran detection on, so that the scene timecodes line up:
+
+```python
+from scenedetect import detect, ContentDetector
+from scenedetect.output import label_scenes
+
+scenes = detect("my_video.mp4", ContentDetector())
+for label in label_scenes(scenes, video_url="https://example.com/my_video.mp4"):
+    print(label.index, label.label)
+```
+
+This is entirely opt-in and never runs during normal detection. An API key is available at [twelvelabs.io](https://twelvelabs.io), which offers a generous free tier.
+
 **Benchmark**:
 
 We evaluate the performance of different detectors in terms of accuracy and processing speed. See the [benchmark report](benchmark/README.md) for details.
diff --git a/pyproject.toml b/pyproject.toml
index 79ba94b2..9f050864 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,6 +52,8 @@ dependencies = [
 [project.optional-dependencies]
 pyav = ["av>=9.2"]
 moviepy = ["moviepy"]
+# Optional semantic scene labelling via TwelveLabs Pegasus (see scenedetect.output.label_scenes).
+twelvelabs = ["twelvelabs>=1.2.8"]
 dev = ["av>=9.2", "moviepy", "pytest>=7.0", "pytest-rerunfailures"]
 docs = ["Sphinx==7.0.1", "sphinx-copybutton==0.5.2"]
 website = ["mkdocs==1.5.2", "jinja2>=3.1.6"]
diff --git a/scenedetect/__init__.py b/scenedetect/__init__.py
index 395741e1..07390e65 100644
--- a/scenedetect/__init__.py
+++ b/scenedetect/__init__.py
@@ -51,9 +51,11 @@
     is_mkvmerge_available,
     write_scene_list,
     write_scene_list_html,
+    label_scenes,
     PathFormatter,
     VideoMetadata,
     SceneMetadata,
+    SceneLabel,
 )
 from scenedetect.detector import SceneDetector
 from scenedetect.detectors import (
diff --git a/scenedetect/output/__init__.py b/scenedetect/output/__init__.py
index eef360af..453ddc72 100644
--- a/scenedetect/output/__init__.py
+++ b/scenedetect/output/__init__.py
@@ -40,6 +40,12 @@
 
 # Commonly used classes/functions exported under the `scenedetect.output` namespace for brevity.
 from scenedetect.output.image import save_images
+from scenedetect.output.labels import (
+    DEFAULT_MODEL,
+    DEFAULT_PROMPT,
+    SceneLabel,
+    label_scenes,
+)
 from scenedetect.output.video import (
     PathFormatter,
     SceneMetadata,
diff --git a/scenedetect/output/labels.py b/scenedetect/output/labels.py
new file mode 100644
index 00000000..acf0ca6a
--- /dev/null
+++ b/scenedetect/output/labels.py
@@ -0,0 +1,166 @@
+#
+#            PySceneDetect: Python-Based Video Scene Detector
+#   -------------------------------------------------------------------
+#     [  Site:    https://scenedetect.com                           ]
+#     [  Docs:    https://scenedetect.com/docs/                     ]
+#     [  Github:  https://github.com/Breakthrough/PySceneDetect/    ]
+#
+# Copyright (C) 2025 Brandon Castellano <http://www.bcastell.com>.
+# PySceneDetect is licensed under the BSD 3-Clause License; see the
+# included LICENSE file, or visit one of the above pages for details.
+#
+
+"""The ``scenedetect.output.labels`` module adds *optional* semantic labels to the scenes found by
+PySceneDetect. Pixel-based detectors locate the cuts; this helper sends each detected time range to
+the `TwelveLabs <https://twelvelabs.io>`_ Pegasus video-understanding model and attaches a short
+natural-language description to each scene.
+
+This integration is entirely opt-in: it is never invoked by detection and requires the optional
+``twelvelabs`` dependency (``pip install scenedetect[twelvelabs]``) plus a TwelveLabs API key
+(available at https://twelvelabs.io, which offers a generous free tier).
+
+Per-scene labelling relies on Pegasus' ``start_time``/``end_time`` support, which currently requires
+the ``pegasus1.5`` model with a ``video_url`` (or asset) source rather than a ``video_id``. Use the
+same video you ran detection on so the timecodes line up::
+
+    from scenedetect import detect, ContentDetector
+    from scenedetect.output import label_scenes
+
+    scenes = detect("my_video.mp4", ContentDetector())
+    labels = label_scenes(scenes, video_url="https://example.com/my_video.mp4")
+    for label in labels:
+        print(label.index, label.label)
+
+A ``video_id`` (already indexed) source is also accepted for models that support it.
+"""
+
+import logging
+import os
+import typing as ty
+from dataclasses import dataclass
+
+from scenedetect.common import SceneList
+
+logger = logging.getLogger("pyscenedetect")
+
+DEFAULT_MODEL: str = "pegasus1.5"
+"""Default TwelveLabs Pegasus model used for scene labelling."""
+
+DEFAULT_PROMPT: str = (
+    "Describe what happens in this part of the video in a single concise sentence."
+)
+"""Default prompt sent to Pegasus for each scene."""
+
+
+@dataclass
+class SceneLabel:
+    """A semantic label generated for a single detected scene. The list returned by
+    :func:`label_scenes` runs parallel to (and in the same order as) the input scene list."""
+
+    index: int
+    """0-based index of the scene this label describes."""
+    start_time: float
+    """Scene start, in seconds from the beginning of the video."""
+    end_time: float
+    """Scene end, in seconds from the beginning of the video."""
+    label: str
+    """Natural-language description of the scene returned by Pegasus."""
+
+
+def label_scenes(
+    scene_list: SceneList,
+    video_id: str | None = None,
+    *,
+    video_url: str | None = None,
+    model_name: str = DEFAULT_MODEL,
+    prompt: str = DEFAULT_PROMPT,
+    max_tokens: int = 512,
+    api_key: str | None = None,
+    client: ty.Any | None = None,
+) -> list[SceneLabel]:
+    """Generate a semantic label for each scene using the TwelveLabs Pegasus model.
+
+    Each scene's start/end timecode is forwarded to Pegasus so the description covers only that
+    portion of the video. This is opt-in and does not affect detection in any way.
+
+    Arguments:
+        scene_list: Scenes to label, as returned by
+            :meth:`SceneManager.get_scene_list()
+            <scenedetect.scene_manager.SceneManager.get_scene_list>`
+            or :func:`detect() <scenedetect.detect>`.
+        video_id: ID of the video as already uploaded/indexed with TwelveLabs. Mutually exclusive
+            with `video_url`; one of the two is required.
+        video_url: Public URL of the video. Mutually exclusive with `video_id`.
+        model_name: TwelveLabs Pegasus model to use (default: ``"pegasus1.5"``).
+        prompt: Instruction sent to Pegasus for each scene.
+        max_tokens: Maximum number of tokens Pegasus may generate per scene. Note that
+            ``pegasus1.5`` requires this to be at least 512.
+        api_key: TwelveLabs API key. Defaults to the ``TWELVELABS_API_KEY`` environment variable.
+            Ignored when `client` is provided.
+        client: Pre-configured ``twelvelabs.TwelveLabs`` client. If omitted, one is created from
+            `api_key`.
+
+    Returns:
+        A list of :class:`SceneLabel`, one per input scene, in the same order as `scene_list`.
+
+    Raises:
+        ImportError: If the optional ``twelvelabs`` package is not installed.
+        ValueError: If neither or both of `video_id`/`video_url` are given, or if no API key is
+            available when constructing a client.
+    """
+    if (video_id is None) == (video_url is None):
+        raise ValueError("Exactly one of 'video_id' or 'video_url' must be provided.")
+
+    if client is None:
+        client = _create_client(api_key)
+
+    video_context = None
+    if video_url is not None:
+        from twelvelabs.types.video_context import VideoContext_Url
+
+        video_context = VideoContext_Url(url=video_url)
+
+    labels: list[SceneLabel] = []
+    for index, (start, end) in enumerate(scene_list):
+        start_seconds = start.seconds
+        end_seconds = end.seconds
+        response = client.analyze(
+            model_name=model_name,
+            video_id=video_id,
+            video=video_context,
+            prompt=prompt,
+            max_tokens=max_tokens,
+            start_time=start_seconds,
+            end_time=end_seconds,
+        )
+        labels.append(
+            SceneLabel(
+                index=index,
+                start_time=start_seconds,
+                end_time=end_seconds,
+                label=response.data.strip() if response.data else "",
+            )
+        )
+        logger.debug("Labelled scene %d [%.3fs-%.3fs]", index, start_seconds, end_seconds)
+    return labels
+
+
+def _create_client(api_key: str | None) -> ty.Any:
+    """Create a ``twelvelabs.TwelveLabs`` client, surfacing a friendly error if the optional
+    dependency is missing or no API key is available."""
+    try:
+        from twelvelabs import TwelveLabs
+    except ImportError as ex:
+        raise ImportError(
+            "The 'twelvelabs' package is required for scene labelling. Install it with:\n\n"
+            "    pip install scenedetect[twelvelabs]\n\n"
+            "Get a free API key at https://twelvelabs.io."
+        ) from ex
+
+    key = api_key if api_key is not None else os.environ.get("TWELVELABS_API_KEY")
+    if not key:
+        raise ValueError(
+            "No TwelveLabs API key provided. Pass 'api_key=' or set the TWELVELABS_API_KEY "
+            "environment variable. Get a free key at https://twelvelabs.io."
+        )
+    return TwelveLabs(api_key=key)
diff --git a/tests/test_labels.py b/tests/test_labels.py
new file mode 100644
index 00000000..23d0abd7
--- /dev/null
+++ b/tests/test_labels.py
@@ -0,0 +1,77 @@
+#
+#            PySceneDetect: Python-Based Video Scene Detector
+#   -------------------------------------------------------------------
+#     [  Site:    https://scenedetect.com                           ]
+#     [  Docs:    https://scenedetect.com/docs/                     ]
+#     [  Github:  https://github.com/Breakthrough/PySceneDetect/    ]
+#
+# Copyright (C) 2025 Brandon Castellano <http://www.bcastell.com>.
+# PySceneDetect is licensed under the BSD 3-Clause License; see the
+# included LICENSE file, or visit one of the above pages for details.
+#
+"""Tests for scenedetect.output.labels (optional TwelveLabs scene labelling)."""
+
+import os
+from fractions import Fraction
+
+import pytest
+
+from scenedetect import FrameTimecode
+from scenedetect.output import SceneLabel, label_scenes
+
+FPS = Fraction(30)
+SCENE_LIST = [
+    (FrameTimecode(0, FPS), FrameTimecode(30, FPS)),
+    (FrameTimecode(30, FPS), FrameTimecode(90, FPS)),
+]
+
+
+class _FakeAnalyzeResponse:
+    def __init__(self, data):
+        self.data = data
+
+
+class _FakeClient:
+    """Records each analyze() call so wiring can be asserted without hitting the network."""
+
+    def __init__(self):
+        self.calls = []
+
+    def analyze(self, **kwargs):
+        self.calls.append(kwargs)
+        return _FakeAnalyzeResponse(f"  scene at {kwargs['start_time']}s  ")
+
+
+def test_label_scenes_wires_per_scene_timecodes():
+    client = _FakeClient()
+    labels = label_scenes(SCENE_LIST, video_id="vid123", client=client)
+
+    assert [type(label) for label in labels] == [SceneLabel, SceneLabel]
+    # One Pegasus call per scene, with that scene's start/end in seconds.
+    assert [(c["start_time"], c["end_time"]) for c in client.calls] == [(0.0, 1.0), (1.0, 3.0)]
+    assert all(c["video_id"] == "vid123" for c in client.calls)
+    # Response text is stripped and indices run parallel to the input scene list.
+    assert labels[0].index == 0 and labels[0].label == "scene at 0.0s"
+    assert labels[1].start_time == 1.0 and labels[1].end_time == 3.0
+
+
+def test_label_scenes_requires_exactly_one_source():
+    client = _FakeClient()
+    with pytest.raises(ValueError):
+        label_scenes(SCENE_LIST, client=client)
+    with pytest.raises(ValueError):
+        label_scenes(SCENE_LIST, video_id="a", video_url="http://x", client=client)
+
+
+@pytest.mark.skipif(
+    not os.environ.get("TWELVELABS_API_KEY"),
+    reason="requires TWELVELABS_API_KEY and a reachable video",
+)
+def test_label_scenes_integration():
+    # Opt-in: needs a real key and a public video URL via TWELVELABS_TEST_VIDEO_URL.
+    video_url = os.environ.get("TWELVELABS_TEST_VIDEO_URL")
+    if not video_url:
+        pytest.skip("set TWELVELABS_TEST_VIDEO_URL to a public video to run this test")
+    labels = label_scenes(SCENE_LIST[:1], video_url=video_url)
+    assert len(labels) == 1
+    assert isinstance(labels[0].label, str) and labels[0].label