240 lines
7.1 KiB
Python
240 lines
7.1 KiB
Python
# pip install scenedetect opencv-python -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||
|
||
from scenedetect.video_manager import VideoManager
|
||
from scenedetect.scene_manager import SceneManager
|
||
from scenedetect.stats_manager import StatsManager
|
||
from scenedetect.detectors.content_detector import ContentDetector
|
||
import os
|
||
import sys
|
||
import subprocess
|
||
from huggingface_hub import hf_hub_download
|
||
from faster_whisper import WhisperModel
|
||
import public_tools
|
||
from pathlib import Path
|
||
|
||
|
||
# 获取智能画面分割的时间或者秒数
|
||
def find_scenes(video_path, sensitivity):
    """Detect scene cuts in a video and return their timecode ranges.

    Runs PySceneDetect's ContentDetector over *video_path*; *sensitivity*
    is used as the detector threshold (higher = bigger content change
    needed to register a cut).

    Returns a list of ``[start_timecode, end_timecode]`` string pairs,
    one per detected scene (empty if detection found nothing).
    """
    print(
        "正在计算分镜数据" + "sensitivity:" + str(sensitivity) + "path : " + video_path
    )
    sys.stdout.flush()
    manager = VideoManager([video_path])
    detector_manager = SceneManager(StatsManager())

    # Content-based detector; threshold controls cut sensitivity.
    detector_manager.add_detector(ContentDetector(threshold=float(sensitivity)))

    timecode_pairs = []

    try:
        # Downscale frames for faster processing, then run detection.
        manager.set_downscale_factor()
        manager.start()
        detector_manager.detect_scenes(frame_source=manager)
        print("分镜数据列表:")
        sys.stdout.flush()
        for idx, (start, end) in enumerate(detector_manager.get_scene_list(), 1):
            timecode_pairs.append([start.get_timecode(), end.get_timecode()])
            print(
                "Scene %2d: Start %s / Frame %d, End %s / Frame %d"
                % (
                    idx,
                    start.get_timecode(),
                    start.get_frames(),
                    end.get_timecode(),
                    end.get_frames(),
                )
            )
            sys.stdout.flush()
    finally:
        # Always release the capture, even when detection raises.
        manager.release()

    return timecode_pairs
|
||
|
||
|
||
# 如果不存在就创建
|
||
def createDir(file_dir):
    """Create directory *file_dir* if it does not already exist.

    Uses os.makedirs instead of the original isdir-check + os.mkdir:
    makedirs also creates missing intermediate directories (os.mkdir
    raises FileNotFoundError for those), and exist_ok=True removes the
    race between the existence check and the creation.
    """
    os.makedirs(file_dir, exist_ok=True)
|
||
|
||
|
||
# 切分一个视频
|
||
def ClipVideo(video_path, out_folder, image_out_folder, sensitivity, gpu_type):
    """Split *video_path* into one clip per detected scene, then export a
    mid-point frame from every clip.

    Parameters
    ----------
    video_path : source video file.
    out_folder : directory receiving the numbered ``00001.mp4`` clips.
    image_out_folder : directory receiving the per-clip ``.png`` frames.
    sensitivity : ContentDetector threshold forwarded to find_scenes().
    gpu_type : "NVIDIA" / "AMD" picks a hardware H.264 encoder, anything
        else falls back to software libx264.

    Returns the list of per-clip dicts with keys ``start_time_str``,
    ``end_time_str``, ``out_video_file`` and ``video_name``.

    Bug fixes vs. the original: ``-loglevel error`` was placed AFTER the
    output file in both ffmpeg commands, where ffmpeg discards it as a
    trailing option (so it never took effect) — it now precedes the
    input; and the frame-extraction run now uses ``check=True`` like the
    clip cut, so a failed extraction raises instead of passing silently.
    """
    shijian_list = find_scenes(video_path, sensitivity)  # scene time ranges
    shijian_list_len = len(shijian_list)

    print("总共有%s个场景" % str(shijian_list_len))
    sys.stdout.flush()
    video_list = []
    for i in range(0, shijian_list_len):
        start_time_str = shijian_list[i][0]
        end_time_str = shijian_list[i][1]

        print("开始输出第" + str(i + 1) + "个分镜")
        video_name = "{:05d}".format(i + 1)
        out_video_file = os.path.join(out_folder, video_name + ".mp4")
        sys.stdout.flush()
        video_list.append(
            {
                "start_time_str": start_time_str,
                "end_time_str": end_time_str,
                "out_video_file": out_video_file,
                "video_name": video_name,
            }
        )

        # Cut the clip with ffmpeg. Global options (-loglevel) come first,
        # -ss/-to select the scene range, audio is stream-copied.
        command = [
            "ffmpeg",
            "-loglevel", "error",
            "-i", video_path,
            "-ss", start_time_str,
            "-to", end_time_str,
            "-c:v", _select_encoder(gpu_type),
            "-preset", "fast",
            "-c:a", "copy",
            out_video_file,
        ]
        subprocess.run(
            command,
            check=True,
            stderr=subprocess.PIPE,
        )

    print("分镜输出完成。开始抽帧")
    sys.stdout.flush()
    for vi in video_list:
        start_seconds = _timecode_to_seconds(vi["start_time_str"])
        end_seconds = _timecode_to_seconds(vi["end_time_str"])
        print("正在抽帧:" + vi["video_name"])
        sys.stdout.flush()
        # Seek to the clip's midpoint (clip timestamps start at 0, so the
        # midpoint offset is half the clip duration) and grab one frame.
        subprocess.run(
            [
                "ffmpeg",
                "-loglevel", "error",
                "-ss", str((end_seconds - start_seconds) / 2),
                "-i", vi["out_video_file"],
                "-frames:v", "1",
                os.path.join(image_out_folder, vi["video_name"] + ".png"),
            ],
            check=True,
        )

    print("抽帧完成,开始识别文案")
    sys.stdout.flush()
    return video_list


def _select_encoder(gpu_type):
    """Map a GPU vendor string to an ffmpeg H.264 encoder name."""
    if gpu_type == "NVIDIA":
        return "h264_nvenc"
    if gpu_type == "AMD":
        # NOTE(review): h264_vaapi usually also needs -vaapi_device and a
        # hwupload filter chain — confirm this encoder works as invoked.
        return "h264_vaapi"
    return "libx264"


def _timecode_to_seconds(timecode):
    """Convert an ``HH:MM:SS[.fff]`` timecode string to float seconds."""
    h, m, s = timecode.split(":")
    return int(h) * 3600 + int(m) * 60 + float(s)
|
||
|
||
|
||
def SplitAudio(video_out_folder, video_list):
    """Extract a 128 kbps MP3 audio track from every clip in *video_list*.

    Equivalent shell command: ``ffmpeg -i in.mp4 -vn -ab 128k out.mp3``.

    Parameters
    ----------
    video_out_folder : directory the ``.mp3`` files are written to.
    video_list : dicts as produced by ClipVideo (needs ``out_video_file``
        and ``video_name`` keys).

    Returns the list of generated mp3 paths, in video_list order.

    Bug fix vs. the original: ``-loglevel error`` was placed after the
    output file, where ffmpeg discards it as a trailing option; it now
    precedes the input so it actually takes effect.
    """
    print("正在分离音频!!")
    mp3_list = []
    sys.stdout.flush()
    for v in video_list:
        mp3_path = os.path.join(video_out_folder, v["video_name"] + ".mp3")
        mp3_list.append(mp3_path)
        subprocess.run(
            [
                "ffmpeg",
                "-loglevel", "error",
                "-i", v["out_video_file"],
                "-vn",           # drop the video stream
                "-ab", "128k",   # audio bitrate
                mp3_path,
            ],
            check=True,
        )
    return mp3_list
|
||
|
||
|
||
def GetText(out_folder, mp3_list):
    """Transcribe every mp3 in *mp3_list* with faster-whisper and write
    the combined text to ``<out_folder>/文案.txt``.

    Downloads (or reuses the local cache of) the
    Systran/faster-whisper-large-v3 model files first, then transcribes
    each file as Chinese with VAD filtering enabled.
    """
    recognized = []

    # Ensure all model files are present locally; hf_hub_download returns
    # the cached path without re-downloading when the file already exists.
    print("正在下载或加载模型")
    sys.stdout.flush()
    repo = "Systran/faster-whisper-large-v3"
    model_bin = Path(hf_hub_download(repo_id=repo, filename="model.bin"))
    for aux_file in (
        "config.json",
        "preprocessor_config.json",
        "tokenizer.json",
        "vocabulary.json",
    ):
        hf_hub_download(repo_id=repo, filename=aux_file)

    # The model directory is wherever model.bin landed in the cache.
    model = WhisperModel(
        model_size_or_path=os.path.dirname(model_bin),
        device="auto",
        local_files_only=True,
    )
    print("模型加载成功,开始识别")
    sys.stdout.flush()
    for mp3_file in mp3_list:
        segments, info = model.transcribe(
            mp3_file,
            beam_size=5,
            language="zh",
            vad_filter=True,  # skip long silences
            vad_parameters=dict(min_silence_duration_ms=1000),
        )
        # Join segment texts, terminating each with a full-width period.
        clip_text = "".join(segment.text + "。" for segment in segments)
        print(mp3_file + "识别完成")
        sys.stdout.flush()
        recognized.append(clip_text)

    # Persist all recognized text in one file.
    print("文本全部识别成功,正在写出")
    sys.stdout.flush()
    public_tools.PublicTools().write_to_file(
        recognized, os.path.join(out_folder, "文案.txt")
    )
    print("写出完成")
    sys.stdout.flush()
|
||
|
||
|
||
def init(video_path, video_out_folder, image_out_folder, sensitivity, gpu_type):
    """Run the full pipeline: scene-split the video, extract per-clip
    audio, then transcribe it to a text file.

    The transcript is written one directory above *video_out_folder*.
    """
    clips = ClipVideo(
        video_path, video_out_folder, image_out_folder, sensitivity, gpu_type
    )

    # Rip an mp3 from every clip.
    audio_files = SplitAudio(video_out_folder, clips)
    # Recognize speech and write 文案.txt next to the clip folder.
    GetText(os.path.dirname(video_out_folder), audio_files)
|