FunClip / app.py
Zhifu Gao
feat: initial FunClip demo - AI video clipping with FunASR
a9f639a
import os
import json
import tempfile
import subprocess
import gradio as gr
import numpy as np
import torch
from funasr import AutoModel
# Speech-recognition model, constructed once at import time and reused by
# transcribe_video() for every request. Inference is pinned to the CPU.
# NOTE(review): both `hub` and `model_hub` are passed as "hf" — presumably to
# fetch the weights from the Hugging Face hub; confirm which keyword the
# installed FunASR version actually reads.
model = AutoModel(
    model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    hub="hf",
    model_hub="hf",
    device="cpu",
)
def extract_audio(video_path):
    """Extract a video's audio track into a temporary mono 16 kHz WAV file.

    Args:
        video_path: Path of the input video, passed to ``ffmpeg -i``.

    Returns:
        Path of the temporary ``.wav`` file. If extraction fails (FFmpeg is
        not installed, exits with a non-zero status, or writes no data) the
        file is removed before returning, so callers can detect failure with
        ``os.path.exists``. On success the caller owns the file and is
        responsible for deleting it.
    """
    # mkstemp creates the file atomically; the deprecated mktemp only returns
    # a name and leaves a window in which another process can claim it.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    cmd = [
        "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
        "-ar", "16000", "-ac", "1", "-y", audio_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
        # The placeholder file already exists, so also require real content.
        succeeded = result.returncode == 0 and os.path.getsize(audio_path) > 0
    except OSError:
        # FFmpeg binary missing / not executable, or the output vanished.
        succeeded = False

    if not succeeded and os.path.exists(audio_path):
        os.unlink(audio_path)
    return audio_path
def transcribe_video(video_path, progress=gr.Progress()):
    """Run speech recognition on a video and return timestamped sentences.

    Args:
        video_path: Path of the uploaded video, or None if nothing was uploaded.
        progress: Gradio progress callback used to report the current stage.

    Returns:
        A 3-tuple ``(display_text, sentences, sentences_json)``:
        ``display_text`` is either an error/status message, the plain
        transcript (when no sentence timing is available), or one
        ``[start - end] text`` line per sentence; ``sentences`` is a list of
        ``{"start", "end", "text"}`` dicts with times in seconds (empty when
        unavailable); ``sentences_json`` is that list serialized as JSON, or
        None when unavailable.
    """
    if video_path is None:
        return "Please upload a video file.", [], None

    progress(0.1, desc="Extracting audio...")
    wav_path = extract_audio(video_path)
    if not os.path.exists(wav_path):
        return "Failed to extract audio from video. Make sure it contains an audio track.", [], None

    progress(0.3, desc="Transcribing speech...")
    try:
        results = model.generate(input=wav_path, batch_size_s=300)
    except Exception as exc:
        return f"Transcription error: {str(exc)}", [], None
    finally:
        # The extracted audio is only needed for this call; always remove it.
        if os.path.exists(wav_path):
            os.unlink(wav_path)

    # No result at all: nothing to show.
    if not results:
        return "", [], None
    first = results[0]
    # A result without sentence timing: fall back to the raw transcript.
    if not first.get("sentence_info"):
        return first.get("text", ""), [], None

    progress(0.8, desc="Processing timestamps...")
    # Timestamps arrive in milliseconds; expose them in seconds.
    sentences = [
        {
            "start": seg["start"] / 1000.0,
            "end": seg["end"] / 1000.0,
            "text": seg["text"],
        }
        for seg in first["sentence_info"]
    ]

    lines = []
    for item in sentences:
        lines.append(f"[{item['start']:.1f}s - {item['end']:.1f}s] {item['text']}")
    full_text = "\n".join(lines)

    progress(1.0, desc="Done!")
    return full_text, sentences, json.dumps(sentences, ensure_ascii=False)
def _parse_segment_index(choice):
    """Return the sentence index encoded in a selector value, or None if there is none."""
    try:
        return int(choice)
    except (TypeError, ValueError):
        pass
    # Selector labels look like "3: [1.0s-2.5s] text": the index precedes the first colon.
    try:
        return int(str(choice).split(":", 1)[0])
    except ValueError:
        return None


def _merge_adjacent_clips(clips, max_gap=0.5):
    """Merge consecutive (start, end) spans separated by less than ``max_gap`` seconds."""
    merged = []
    for start, end in clips:
        if merged and start - merged[-1][1] < max_gap:
            prev_start, prev_end = merged[-1]
            # max() keeps the span from shrinking when the next clip ends earlier.
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))
    return merged


def _build_concat_filter(spans):
    """Build the FFmpeg filter graph that trims each span and concatenates them."""
    parts = []
    for i, (start, end) in enumerate(spans):
        parts.append(
            f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}];"
            f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}];"
        )
    concat_v = "".join(f"[v{i}]" for i in range(len(spans)))
    concat_a = "".join(f"[a{i}]" for i in range(len(spans)))
    parts.append(f"{concat_v}{concat_a}concat=n={len(spans)}:v=1:a=1[outv][outa]")
    return "".join(parts)


def clip_video(video_path, sentences_json, selected_indices):
    """Cut the selected sentences out of a video and join them into one clip.

    Args:
        video_path: Path of the source video.
        sentences_json: JSON list of ``{"start", "end", "text"}`` dicts with
            times in seconds, as produced by ``transcribe_video``.
        selected_indices: Selected segments. Each entry may be an integer, a
            numeric string, or a selector label of the form
            ``"<index>: [..] text"`` as built by ``build_selector``.

    Returns:
        ``(output_path, message)``. ``output_path`` is the path of a temporary
        ``.mp4`` on success, or None on failure, in which case ``message``
        explains why.
    """
    if not video_path or not sentences_json or not selected_indices:
        return None, "Please transcribe a video first, then select segments to clip."

    sentences = json.loads(sentences_json)

    # Accept both raw indices and selector labels; drop anything unparsable or
    # out of range. A set removes duplicate selections.
    indices = {
        idx
        for idx in map(_parse_segment_index, selected_indices)
        if idx is not None and 0 <= idx < len(sentences)
    }
    if not indices:
        return None, "Invalid selection."

    clips = [(sentences[i]["start"], sentences[i]["end"]) for i in sorted(indices)]
    merged = _merge_adjacent_clips(clips)
    filter_complex = _build_concat_filter(merged)

    # mkstemp creates the file atomically (mktemp is deprecated and race-prone);
    # FFmpeg overwrites the placeholder because of "-y".
    fd, output_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)

    cmd = [
        "ffmpeg", "-i", video_path, "-filter_complex", filter_complex,
        "-map", "[outv]", "-map", "[outa]", "-y", output_path,
    ]
    try:
        # errors="replace": FFmpeg's stderr is not guaranteed to be valid UTF-8.
        result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
        error = result.stderr[-500:] if result.returncode != 0 else None
    except OSError as exc:
        # FFmpeg binary missing or not executable.
        error = str(exc)

    if error is not None:
        # Do not leave an empty/partial output file behind on failure.
        if os.path.exists(output_path):
            os.unlink(output_path)
        return None, f"FFmpeg error: {error}"

    total_duration = sum(end - start for start, end in merged)
    return output_path, f"Clipped {len(merged)} segment(s), total {total_duration:.1f}s"
# HTML banner shown at the top of the page: title, tagline, a one-line summary
# of the workflow, and links to the related GitHub repositories.
description_html = """
<div style="text-align: center; max-width: 850px; margin: 0 auto;">
<h1 style="font-size: 2.2em; margin-bottom: 0.1em;">✂️ FunClip</h1>
<p style="font-size: 1.3em; color: #444;">AI Video Clipping — Speak to Clip</p>
<p style="font-size: 1em; color: #666;">
Upload a video → Auto-transcribe with timestamps → Select text segments → Export precise clips
</p>
<p style="font-size: 0.9em; margin-top: 0.8em;">
<a href="https://github.com/modelscope/FunClip" target="_blank">⭐ GitHub (5.6k+ stars)</a> ·
<a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR</a> ·
<a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">🚀 Fun-ASR</a>
</p>
</div>
"""
# Markdown help text rendered at the bottom of the page; it describes the
# upload -> transcribe -> select -> clip workflow.
how_it_works = """
### How It Works
1. **Upload** a video (any format with audio)
2. **Transcribe** — FunASR extracts speech with precise timestamps
3. **Select** the sentences you want to keep (by index)
4. **Clip** — FFmpeg cuts and concatenates the selected segments
For the full experience with LLM-assisted smart clipping, install [FunClip](https://github.com/modelscope/FunClip) locally.
"""
def build_selector(sentences_json):
    """Build the checkbox choices for the segment selector.

    Args:
        sentences_json: JSON list of ``{"start", "end", "text"}`` dicts, or a
            falsy value when no transcript is available.

    Returns:
        A ``gr.update`` that replaces the selector's choices with one
        ``"<index>: [<start>s-<end>s] <text>"`` label per sentence (none when
        the input is falsy) and clears the current selection.
    """
    labels = []
    if sentences_json:
        for idx, seg in enumerate(json.loads(sentences_json)):
            labels.append(f"{idx}: [{seg['start']:.1f}s-{seg['end']:.1f}s] {seg['text']}")
    return gr.update(choices=labels, value=[])
def launch():
    """Build the two-tab Gradio interface and start the web server."""
    # NOTE(review): the nesting of the layout containers below was reconstructed
    # from a paste that had lost its indentation — confirm which components
    # belong inside each gr.Row() against the original file.
    with gr.Blocks(theme=gr.themes.Soft(), title="FunClip - AI Video Clipping") as demo:
        gr.HTML(description_html)
        # Holds the transcript sentences as a JSON string between the two tabs.
        sentences_state = gr.State("")
        with gr.Tab("1. Transcribe"):
            with gr.Row():
                video_input = gr.Video(label="Upload Video")
            transcribe_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg")
            transcript_output = gr.Textbox(label="Transcription with Timestamps", lines=12, show_copy_button=True)
        with gr.Tab("2. Clip"):
            segment_selector = gr.CheckboxGroup(
                label="Select segments to clip",
                choices=[],
            )
            clip_btn = gr.Button("✂️ Generate Clip", variant="primary", size="lg")
            with gr.Row():
                clip_output = gr.Video(label="Output Clip")
                clip_info = gr.Textbox(label="Info", lines=2)
        # transcribe_video returns three values; the second one (the sentence
        # list) is routed to a throwaway gr.State() because only the JSON form
        # stored in sentences_state is used afterwards. Once transcription
        # finishes, the selector choices are rebuilt from that JSON.
        transcribe_btn.click(
            transcribe_video,
            inputs=[video_input],
            outputs=[transcript_output, gr.State(), sentences_state],
        ).then(
            build_selector,
            inputs=[sentences_state],
            outputs=[segment_selector],
        )
        # Clipping reads the uploaded video, the stored sentences and the
        # values currently ticked in the selector.
        clip_btn.click(
            clip_video,
            inputs=[video_input, sentences_state, segment_selector],
            outputs=[clip_output, clip_info],
        )
        gr.Markdown(how_it_works)
    demo.launch()
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    launch()