Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,21 @@ def split_video_into_scenes(video_path, threshold=27.0):

See [the documentation](https://www.scenedetect.com/docs/latest/api.html) for more examples.

**Optional: Semantic Scene Labels**:

PySceneDetect finds *where* the cuts are; if you also want a short description of *what* is in each scene, the optional `label_scenes` helper runs the detected scenes through the [TwelveLabs](https://twelvelabs.io) Pegasus video-understanding model. Install with `pip install scenedetect[twelvelabs]` and set `TWELVELABS_API_KEY`:

```python
from scenedetect import detect, ContentDetector
from scenedetect.output import label_scenes

# Find the cuts locally first; labelling is a separate, optional step.
scenes = detect("my_video.mp4", ContentDetector())
# `video_url` should point at the same video that was analysed so the timecodes line up.
for label in label_scenes(scenes, video_url="https://example.com/my_video.mp4"):
    print(label.index, label.label)
```

This is entirely opt-in and never runs during normal detection. An API key with a generous free tier is available at [twelvelabs.io](https://twelvelabs.io).

**Benchmark**:

We evaluate the performance of different detectors in terms of accuracy and processing speed. See the [benchmark report](benchmark/README.md) for details.
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ dependencies = [
[project.optional-dependencies]
pyav = ["av>=9.2"]
moviepy = ["moviepy"]
# Optional semantic scene labelling via TwelveLabs Pegasus (see scenedetect.output.label_scenes).
twelvelabs = ["twelvelabs>=1.2.8"]
dev = ["av>=9.2", "moviepy", "pytest>=7.0", "pytest-rerunfailures"]
docs = ["Sphinx==7.0.1", "sphinx-copybutton==0.5.2"]
website = ["mkdocs==1.5.2", "jinja2>=3.1.6"]
Expand Down
2 changes: 2 additions & 0 deletions scenedetect/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,11 @@
is_mkvmerge_available,
write_scene_list,
write_scene_list_html,
label_scenes,
PathFormatter,
VideoMetadata,
SceneMetadata,
SceneLabel,
)
from scenedetect.detector import SceneDetector
from scenedetect.detectors import (
Expand Down
6 changes: 6 additions & 0 deletions scenedetect/output/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@

# Commonly used classes/functions exported under the `scenedetect.output` namespace for brevity.
from scenedetect.output.image import save_images
from scenedetect.output.labels import (
DEFAULT_MODEL,
DEFAULT_PROMPT,
SceneLabel,
label_scenes,
)
from scenedetect.output.video import (
PathFormatter,
SceneMetadata,
Expand Down
166 changes: 166 additions & 0 deletions scenedetect/output/labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#
# PySceneDetect: Python-Based Video Scene Detector
# -------------------------------------------------------------------
# [ Site: https://scenedetect.com ]
# [ Docs: https://scenedetect.com/docs/ ]
# [ Github: https://github.com/Breakthrough/PySceneDetect/ ]
#
# Copyright (C) 2025 Brandon Castellano <http://www.bcastell.com>.
# PySceneDetect is licensed under the BSD 3-Clause License; see the
# included LICENSE file, or visit one of the above pages for details.
#

"""The ``scenedetect.output.labels`` module adds *optional* semantic labels to the scenes found by
PySceneDetect. Pixel-based detectors locate the cuts; this helper sends each detected time range to
the `TwelveLabs <https://twelvelabs.io>`_ Pegasus video-understanding model and attaches a short
natural-language description to each scene.

This integration is entirely opt-in: it is never invoked by detection and requires the optional
``twelvelabs`` dependency (``pip install scenedetect[twelvelabs]``) plus a TwelveLabs API key. An
API key with a generous free tier is available at https://twelvelabs.io.

Per-scene labelling relies on Pegasus' ``start_time``/``end_time`` support, which currently requires
the ``pegasus1.5`` model with a ``video_url`` (or asset) source rather than a ``video_id``. Use the
same video you ran detection on so the timecodes line up::

    from scenedetect import detect, ContentDetector
    from scenedetect.output import label_scenes

    scenes = detect("my_video.mp4", ContentDetector())
    labels = label_scenes(scenes, video_url="https://example.com/my_video.mp4")
    for label in labels:
        print(label.index, label.label)

A ``video_id`` (already indexed) source is also accepted for models that support it.
"""

import logging
import os
import typing as ty
from dataclasses import dataclass

from scenedetect.common import SceneList

# Shared PySceneDetect logger; per-scene progress is emitted at DEBUG level.
logger = logging.getLogger("pyscenedetect")

DEFAULT_MODEL: str = "pegasus1.5"
"""Name of the TwelveLabs Pegasus model used when the caller does not pick one."""

DEFAULT_PROMPT: str = (
    "Describe what happens in this part of the video "
    "in a single concise sentence."
)
"""Instruction sent to Pegasus for every scene when the caller does not supply a prompt."""


@dataclass
class SceneLabel:
    """Semantic description attached to one detected scene.

    :func:`label_scenes` returns one instance per input scene, in the same order as the scene list
    it was given."""

    index: int
    """Position (starting at 0) of the described scene within the input scene list."""
    start_time: float
    """Time at which the scene begins, in seconds from the start of the video."""
    end_time: float
    """Time at which the scene ends, in seconds from the start of the video."""
    label: str
    """Natural-language description of the scene, as returned by Pegasus."""


def label_scenes(
    scene_list: SceneList,
    video_id: str | None = None,
    *,
    video_url: str | None = None,
    model_name: str = DEFAULT_MODEL,
    prompt: str = DEFAULT_PROMPT,
    max_tokens: int = 512,
    api_key: str | None = None,
    client: ty.Any | None = None,
) -> list[SceneLabel]:
    """Generate a semantic label for each scene using the TwelveLabs Pegasus model.

    Each scene's start/end timecode is forwarded to Pegasus so the description covers only that
    portion of the video. One ``client.analyze`` request is issued per scene, in order. This is
    opt-in and does not affect detection in any way.

    Arguments:
        scene_list: Scenes to label, as returned by
            :meth:`SceneManager.get_scene_list()
            <scenedetect.scene_manager.SceneManager.get_scene_list>`
            or :func:`detect() <scenedetect.detect>`.
        video_id: ID of the video as already uploaded/indexed with TwelveLabs. Mutually exclusive
            with `video_url`; one of the two is required.
        video_url: Public URL of the video. Mutually exclusive with `video_id`.
        model_name: TwelveLabs Pegasus model to use (default: ``"pegasus1.5"``).
        prompt: Instruction sent to Pegasus for each scene.
        max_tokens: Maximum number of tokens Pegasus may generate per scene. Note that
            ``pegasus1.5`` requires this to be at least 512.
        api_key: TwelveLabs API key. Defaults to the ``TWELVELABS_API_KEY`` environment variable.
            Ignored when `client` is provided.
        client: Pre-configured ``twelvelabs.TwelveLabs`` client. If omitted, one is created from
            `api_key`.

    Returns:
        A list of :class:`SceneLabel`, one per input scene, in the same order as `scene_list`.
        A scene for which Pegasus returns no text gets an empty label.

    Raises:
        ImportError: If the optional ``twelvelabs`` package is not installed. This also applies
            when `client` is supplied together with `video_url`, because the URL has to be
            wrapped in an SDK type.
        ValueError: If neither or both of `video_id`/`video_url` are given, or if no API key is
            available when constructing a client.
    """
    # `==` on the two "is None" flags is true when both or neither source was supplied.
    if (video_id is None) == (video_url is None):
        raise ValueError("Exactly one of 'video_id' or 'video_url' must be provided.")

    if client is None:
        client = _create_client(api_key)

    # A URL source must be wrapped in the SDK's context type; a `video_id` is passed through as-is.
    video_context = None if video_url is None else _create_url_context(video_url)

    labels: list[SceneLabel] = []
    for index, (start, end) in enumerate(scene_list):
        start_seconds = start.seconds
        end_seconds = end.seconds
        response = client.analyze(
            model_name=model_name,
            video_id=video_id,
            video=video_context,
            prompt=prompt,
            max_tokens=max_tokens,
            start_time=start_seconds,
            end_time=end_seconds,
        )
        labels.append(
            SceneLabel(
                index=index,
                start_time=start_seconds,
                end_time=end_seconds,
                # Normalise a missing/empty response to "" so `label` is always a string.
                label=response.data.strip() if response.data else "",
            )
        )
        logger.debug("Labelled scene %d [%.3fs-%.3fs]", index, start_seconds, end_seconds)
    return labels


def _create_url_context(video_url: str) -> ty.Any:
    """Wrap `video_url` in the SDK's URL video context, raising an :class:`ImportError` with
    install instructions if the optional dependency cannot be imported."""
    try:
        # Imported lazily so the optional dependency is only needed when labelling is used.
        from twelvelabs.types.video_context import VideoContext_Url
    except ImportError as ex:
        raise ImportError(
            "The 'twelvelabs' package (>=1.2.8) is required to label scenes from a 'video_url'. "
            "Install it with:\n\n"
            "    pip install scenedetect[twelvelabs]\n"
        ) from ex
    return VideoContext_Url(url=video_url)


def _create_client(api_key: str | None) -> ty.Any:
"""Create a ``twelvelabs.TwelveLabs`` client, surfacing a friendly error if the optional
dependency is missing or no API key is available."""
try:
from twelvelabs import TwelveLabs
except ImportError as ex:
raise ImportError(
"The 'twelvelabs' package is required for scene labelling. Install it with:\n\n"
" pip install scenedetect[twelvelabs]\n\n"
"Get a free API key at https://twelvelabs.io."
) from ex

key = api_key if api_key is not None else os.environ.get("TWELVELABS_API_KEY")
if not key:
raise ValueError(
"No TwelveLabs API key provided. Pass 'api_key=' or set the TWELVELABS_API_KEY "
"environment variable. Get a free key at https://twelvelabs.io."
)
return TwelveLabs(api_key=key)
77 changes: 77 additions & 0 deletions tests/test_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#
# PySceneDetect: Python-Based Video Scene Detector
# -------------------------------------------------------------------
# [ Site: https://scenedetect.com ]
# [ Docs: https://scenedetect.com/docs/ ]
# [ Github: https://github.com/Breakthrough/PySceneDetect/ ]
#
# Copyright (C) 2025 Brandon Castellano <http://www.bcastell.com>.
# PySceneDetect is licensed under the BSD 3-Clause License; see the
# included LICENSE file, or visit one of the above pages for details.
#
"""Tests for scenedetect.output.labels (optional TwelveLabs scene labelling)."""

import os
from fractions import Fraction

import pytest

from scenedetect import FrameTimecode
from scenedetect.output import SceneLabel, label_scenes

# Every fixture timecode shares this frame rate.
FPS = Fraction(30)
# Two back-to-back scenes given as (start frame, end frame) pairs; at 30 fps these are the
# 0-1 s and 1-3 s ranges the tests below expect.
SCENE_LIST = [
    (FrameTimecode(first_frame, FPS), FrameTimecode(last_frame, FPS))
    for first_frame, last_frame in ((0, 30), (30, 90))
]


class _FakeAnalyzeResponse:
    """Minimal stand-in for the object returned by ``client.analyze``; it exposes only ``data``."""

    def __init__(self, data):
        # ``label_scenes`` reads nothing but ``response.data``.
        self.data = data


class _FakeClient:
    """Test double for the TwelveLabs client that logs every ``analyze()`` invocation, so the
    request wiring can be checked offline."""

    def __init__(self):
        # Keyword arguments of each analyze() call, in call order.
        self.calls = []

    def analyze(self, **kwargs):
        """Record ``kwargs`` and reply with padded text that embeds the requested start time."""
        self.calls.append(kwargs)
        return _FakeAnalyzeResponse(f" scene at {kwargs['start_time']}s ")


def test_label_scenes_wires_per_scene_timecodes():
    """Each scene yields one analyze() call carrying that scene's start/end in seconds."""
    fake = _FakeClient()
    result = label_scenes(SCENE_LIST, video_id="vid123", client=fake)

    # Exactly one SceneLabel per input scene.
    assert len(result) == 2
    assert all(type(item) is SceneLabel for item in result)

    # One Pegasus request per scene, covering that scene's time range and the given video ID.
    sent_ranges = [(call["start_time"], call["end_time"]) for call in fake.calls]
    assert sent_ranges == [(0.0, 1.0), (1.0, 3.0)]
    for call in fake.calls:
        assert call["video_id"] == "vid123"

    # The padded fake response is stripped, and indices/times follow the input order.
    first, second = result
    assert first.index == 0
    assert first.label == "scene at 0.0s"
    assert second.start_time == 1.0
    assert second.end_time == 3.0


def test_label_scenes_requires_exactly_one_source():
    """Supplying neither source, or both sources at once, is rejected with ValueError."""
    fake = _FakeClient()
    invalid_sources = (
        {},  # neither video_id nor video_url
        {"video_id": "a", "video_url": "http://x"},  # both at once
    )
    for source in invalid_sources:
        with pytest.raises(ValueError):
            label_scenes(SCENE_LIST, client=fake, **source)


@pytest.mark.skipif(
    not os.environ.get("TWELVELABS_API_KEY"),
    reason="requires TWELVELABS_API_KEY and a reachable video",
)
def test_label_scenes_integration():
    """Opt-in end-to-end check that needs a real API key and a public video URL."""
    # The URL comes from TWELVELABS_TEST_VIDEO_URL; without it there is nothing to analyse.
    url = os.environ.get("TWELVELABS_TEST_VIDEO_URL")
    if not url:
        pytest.skip("set TWELVELABS_TEST_VIDEO_URL to a public video to run this test")

    # Label only the first fixture scene, so a single request is made.
    first_scene_only = SCENE_LIST[:1]
    result = label_scenes(first_scene_only, video_url=url)

    assert len(result) == 1
    text = result[0].label
    assert isinstance(text, str)
    assert text