Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import tempfile | |
| import subprocess | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from funasr import AutoModel | |
# Hub identifier of the Paraformer checkpoint handed to FunASR.
ASR_MODEL_ID = (
    "iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
)

# Module-level recogniser used by transcribe_video. It is constructed once, at
# import time, and pinned to the CPU.
model = AutoModel(model=ASR_MODEL_ID, hub="hf", model_hub="hf", device="cpu")
def extract_audio(video_path):
    """Extract the audio track of a video into a temporary WAV file.

    FFmpeg is asked to drop the video stream (``-vn``) and to write 16-bit
    PCM, 16 kHz, mono audio.

    Args:
        video_path: Path of the input video file.

    Returns:
        Path of the WAV file. When FFmpeg exits with an error the file is
        removed before returning, so callers can detect a failed extraction
        with ``os.path.exists`` on the returned path.

    Raises:
        OSError: If the ``ffmpeg`` executable cannot be started.
    """
    # mkstemp creates the file atomically; the deprecated mktemp only returns
    # a name that another process could claim before FFmpeg writes to it.
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    cmd = [
        "ffmpeg", "-i", video_path, "-vn", "-acodec", "pcm_s16le",
        "-ar", "16000", "-ac", "1", "-y", audio_path,
    ]

    succeeded = False
    try:
        succeeded = subprocess.run(cmd, capture_output=True).returncode == 0
    finally:
        # Never leave an empty or partial WAV behind: the caller treats an
        # existing file as a successful extraction.
        if not succeeded and os.path.exists(audio_path):
            os.unlink(audio_path)
    return audio_path
def transcribe_video(video_path, progress=gr.Progress()):
    """Transcribe the speech in a video into timestamped sentences.

    Args:
        video_path: Path of the uploaded video, or ``None`` when nothing has
            been uploaded.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        A ``(display_text, sentences, sentences_json)`` tuple.
        ``display_text`` is either the timestamped transcript (one sentence
        per line), the plain recognised text when no sentence timing is
        available, or an error message. ``sentences`` is a list of
        ``{"start", "end", "text"}`` dicts with times in seconds (empty on
        failure or when no timing is available). ``sentences_json`` is that
        list serialised as JSON, or ``None`` when the list is empty.
    """
    if video_path is None:
        return "Please upload a video file.", [], None

    progress(0.1, desc="Extracting audio...")
    audio_path = extract_audio(video_path)
    if not os.path.exists(audio_path):
        return "Failed to extract audio from video. Make sure it contains an audio track.", [], None

    progress(0.3, desc="Transcribing speech...")
    try:
        # Explicitly request sentence-level timestamps.
        # NOTE(review): FunASR appears to fill ``sentence_info`` only when
        # this flag is set (and VAD/punctuation models are attached) --
        # confirm against the installed FunASR version.
        res = model.generate(
            input=audio_path,
            batch_size_s=300,
            sentence_timestamp=True,
        )
    except Exception as e:
        return f"Transcription error: {str(e)}", [], None
    finally:
        # The extracted WAV is only needed for the recognition call.
        if os.path.exists(audio_path):
            os.unlink(audio_path)

    if not res or not res[0].get("sentence_info"):
        # No per-sentence timing: fall back to the plain text, if any.
        text = res[0].get("text", "") if res else ""
        return text, [], None

    progress(0.8, desc="Processing timestamps...")
    # The recogniser reports milliseconds; the rest of the app uses seconds.
    sentences = [
        {
            "start": sent["start"] / 1000.0,
            "end": sent["end"] / 1000.0,
            "text": sent["text"],
        }
        for sent in res[0]["sentence_info"]
    ]
    full_text = "\n".join(
        f"[{s['start']:.1f}s - {s['end']:.1f}s] {s['text']}" for s in sentences
    )
    progress(1.0, desc="Done!")
    return full_text, sentences, json.dumps(sentences, ensure_ascii=False)
def _parse_segment_index(choice):
    """Return the sentence index encoded in a selector value, or ``None``.

    Accepts plain integers and numeric strings as well as the
    ``"<index>: [<start>s-<end>s] <text>"`` labels produced by
    ``build_selector``.
    """
    if isinstance(choice, int):
        return choice
    try:
        return int(str(choice).split(":", 1)[0].strip())
    except ValueError:
        return None


def _merge_ranges(ranges, max_gap=0.5):
    """Fuse consecutive ``(start, end)`` ranges closer than ``max_gap`` seconds."""
    merged = [ranges[0]]
    for start, end in ranges[1:]:
        prev_start, prev_end = merged[-1]
        if start - prev_end < max_gap:
            # max() keeps the later end if the next range is contained in
            # (or overlaps) the previous one.
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))
    return merged


def _build_concat_filter(ranges):
    """Build the FFmpeg filter graph that trims every range and joins them."""
    parts = []
    for i, (start, end) in enumerate(ranges):
        parts.append(
            f"[0:v]trim=start={start:.3f}:end={end:.3f},setpts=PTS-STARTPTS[v{i}];"
            f"[0:a]atrim=start={start:.3f}:end={end:.3f},asetpts=PTS-STARTPTS[a{i}];"
        )
    # The concat filter expects its inputs grouped per segment
    # ([v0][a0][v1][a1]...), not all video pads followed by all audio pads.
    pads = "".join(f"[v{i}][a{i}]" for i in range(len(ranges)))
    parts.append(f"{pads}concat=n={len(ranges)}:v=1:a=1[outv][outa]")
    return "".join(parts)


def clip_video(video_path, sentences_json, selected_indices):
    """Cut the selected sentences out of a video and join them into one clip.

    Args:
        video_path: Path of the source video.
        sentences_json: JSON list of ``{"start", "end", "text"}`` dicts with
            times in seconds, as produced by ``transcribe_video``.
        selected_indices: Selected segments, either as sentence indices or as
            the ``"<index>: ..."`` labels shown in the segment selector.

    Returns:
        ``(output_path, message)``. ``output_path`` is the path of the
        generated MP4, or ``None`` when nothing could be produced, in which
        case ``message`` explains why.
    """
    if not video_path or not sentences_json or not selected_indices:
        return None, "Please transcribe a video first, then select segments to clip."

    sentences = json.loads(sentences_json)

    # The selector hands back its labels, so extract the leading index from
    # each value instead of calling int() on the whole label.
    parsed = (_parse_segment_index(choice) for choice in selected_indices)
    indices = sorted({idx for idx in parsed if idx is not None})
    clips = [
        (sentences[idx]["start"], sentences[idx]["end"])
        for idx in indices
        if 0 <= idx < len(sentences)
    ]
    if not clips:
        return None, "Invalid selection."

    merged = _merge_ranges(clips)

    # mkstemp reserves the output name atomically (mktemp is deprecated and
    # race-prone); FFmpeg overwrites the placeholder because of "-y".
    fd, output_path = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)
    cmd = [
        "ffmpeg", "-i", video_path, "-filter_complex", _build_concat_filter(merged),
        "-map", "[outv]", "-map", "[outa]", "-y", output_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # FFmpeg could not be started at all (e.g. it is not installed).
        if os.path.exists(output_path):
            os.unlink(output_path)
        return None, f"FFmpeg error: {exc}"
    if result.returncode != 0:
        if os.path.exists(output_path):
            os.unlink(output_path)
        return None, f"FFmpeg error: {result.stderr[-500:]}"

    total_duration = sum(end - start for start, end in merged)
    return output_path, f"Clipped {len(merged)} segment(s), total {total_duration:.1f}s"
# HTML banner passed to gr.HTML in launch(): app title, tagline, a one-line
# workflow summary and links to the related GitHub projects.
description_html = """
<div style="text-align: center; max-width: 850px; margin: 0 auto;">
<h1 style="font-size: 2.2em; margin-bottom: 0.1em;">✂️ FunClip</h1>
<p style="font-size: 1.3em; color: #444;">AI Video Clipping — Speak to Clip</p>
<p style="font-size: 1em; color: #666;">
Upload a video → Auto-transcribe with timestamps → Select text segments → Export precise clips
</p>
<p style="font-size: 0.9em; margin-top: 0.8em;">
<a href="https://github.com/modelscope/FunClip" target="_blank">⭐ GitHub (5.6k+ stars)</a> ·
<a href="https://github.com/modelscope/FunASR" target="_blank">🛠️ FunASR</a> ·
<a href="https://github.com/FunAudioLLM/Fun-ASR" target="_blank">🚀 Fun-ASR</a>
</p>
</div>
"""
# Markdown help text passed to gr.Markdown in launch(); it lists the four
# steps of the upload -> transcribe -> select -> clip workflow.
how_it_works = """
### How It Works
1. **Upload** a video (any format with audio)
2. **Transcribe** — FunASR extracts speech with precise timestamps
3. **Select** the sentences you want to keep (by index)
4. **Clip** — FFmpeg cuts and concatenates the selected segments
For the full experience with LLM-assisted smart clipping, install [FunClip](https://github.com/modelscope/FunClip) locally.
"""
def build_selector(sentences_json):
    """Build the update that repopulates the segment selector.

    Args:
        sentences_json: JSON list of ``{"start", "end", "text"}`` dicts, or a
            falsy value when no transcript is available.

    Returns:
        A ``gr.update`` carrying one ``"<index>: [<start>s-<end>s] <text>"``
        choice per sentence (none when there is no transcript) and an empty
        selection.
    """
    labels = []
    if sentences_json:
        for position, sentence in enumerate(json.loads(sentences_json)):
            span = f"[{sentence['start']:.1f}s-{sentence['end']:.1f}s]"
            labels.append(f"{position}: {span} {sentence['text']}")
    return gr.update(choices=labels, value=[])
def launch():
    """Assemble the two-tab Gradio interface and start serving it."""
    # NOTE(review): the nesting of components inside the Row/Tab containers
    # was reconstructed from a listing without indentation -- confirm the
    # intended layout.
    with gr.Blocks(theme=gr.themes.Soft(), title="FunClip - AI Video Clipping") as demo:
        gr.HTML(description_html)

        # Holds the JSON-serialised sentence list between the two tabs.
        sentences_state = gr.State("")

        with gr.Tab("1. Transcribe"):
            with gr.Row():
                video_input = gr.Video(label="Upload Video")
            transcribe_btn = gr.Button(
                "🎙️ Transcribe Speech",
                variant="primary",
                size="lg",
            )
            transcript_output = gr.Textbox(
                label="Transcription with Timestamps",
                lines=12,
                show_copy_button=True,
            )

        with gr.Tab("2. Clip"):
            segment_selector = gr.CheckboxGroup(
                label="Select segments to clip",
                choices=[],
            )
            clip_btn = gr.Button("✂️ Generate Clip", variant="primary", size="lg")
            with gr.Row():
                clip_output = gr.Video(label="Output Clip")
                clip_info = gr.Textbox(label="Info", lines=2)

        # transcribe_video returns three values; the raw sentence list is
        # routed to a throwaway State because only the JSON copy is reused.
        transcription_done = transcribe_btn.click(
            fn=transcribe_video,
            inputs=[video_input],
            outputs=[transcript_output, gr.State(), sentences_state],
        )
        # Once the transcript is stored, refresh the selectable segments.
        transcription_done.then(
            fn=build_selector,
            inputs=[sentences_state],
            outputs=[segment_selector],
        )

        clip_btn.click(
            fn=clip_video,
            inputs=[video_input, sentences_state, segment_selector],
            outputs=[clip_output, clip_info],
        )

        gr.Markdown(how_it_works)

    demo.launch()


if __name__ == "__main__":
    launch()