Spaces:

karesaeedff
/

singing-segment-detector

Running

App Files Community

karesaeedff committed on Oct 23

Commit

57c5b85

verified ·

1 Parent(s): fc9c607

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -39

app.py CHANGED Viewed

@@ -2,60 +2,56 @@ import gradio as gr
 import librosa
 import numpy as np
 import torch
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, pipeline
 import tempfile
 import soundfile as sf
 import json
 SAMPLE_RATE = 16000
-WINDOW = 10
-STEP = 5
-MUSIC_THRESHOLD = 0.4
 VOICE_THRESHOLD = 0.3
-MIN_SING_DURATION = 8
-# === 模型加载 ===
 music_model_id = "AI-Music-Detection/ai_music_detection_large_60s"
-music_feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
 music_model = AutoModelForAudioClassification.from_pretrained(music_model_id)
-voice_pipe = pipeline("audio-classification", model="superb/hubert-large-superb-sid")
-def predict_music_score(snippet):
-    """
-    直接手动跑 feature_extractor + model
-    避免 pipeline 自动切片问题
-    """
-    inputs = music_feature_extractor(snippet, sampling_rate=SAMPLE_RATE, return_tensors="pt", truncation=True, padding="max_length")
     with torch.no_grad():
         outputs = music_model(**inputs)
-        scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
-        labels = music_model.config.id2label
-        label_scores = {labels[i].lower(): float(scores[i]) for i in range(len(scores))}
-    # 找 music 或 singing 相关标签
-    music_score = max([v for k, v in label_scores.items() if "music" in k or "sing" in k] or [0])
     return music_score
 def detect_singing(audio_path):
     wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
     duration = len(wav) / SAMPLE_RATE
     results = []
-    for start in np.arange(0, duration - WINDOW, STEP):
-        end = start + WINDOW
         snippet = wav[int(start * SAMPLE_RATE):int(end * SAMPLE_RATE)]
-        # 模型输入安全长度
-        max_len = SAMPLE_RATE * 60
-        if len(snippet) < SAMPLE_RATE * 3:  # 过短片段跳过
-            continue
-        if len(snippet) > max_len:
-            snippet = snippet[:max_len]
         music_score = predict_music_score(snippet)
-        voice_pred = voice_pipe(snippet, sampling_rate=SAMPLE_RATE)
-        voice_score = max([p['score'] for p in voice_pred if 'speech' in p['label'].lower()] or [0])
         if music_score > MUSIC_THRESHOLD and voice_score > VOICE_THRESHOLD:
             results.append((float(start), float(end)))
@@ -67,7 +63,7 @@ def detect_singing(audio_path):
             merged.append(list(seg))
         else:
             merged[-1][1] = seg[1]
-    merged = [(s, e) for s, e in merged if e - s >= MIN_SING_DURATION]
     return merged
@@ -75,14 +71,13 @@ def analyze_audio(file):
     if file is None:
         return "请上传音频文件", None
-    audio_path = file
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-        data, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
         sf.write(tmp.name, data, sr)
         segments = detect_singing(tmp.name)
     if not segments:
-        return "未检测到明显唱歌片段", json.dumps([], indent=2)
     json_output = json.dumps(
         [{"start": s, "end": e, "duration": round(e - s, 2)} for s, e in segments],
@@ -91,9 +86,13 @@ def analyze_audio(file):
     return f"检测到 {len(segments)} 段唱歌片段", json_output
-with gr.Blocks(title="🎵 Singing Segment Detector") as demo:
-    gr.Markdown("# 🎤 自动识别唱歌片段\n上传音频文件（从视频提取后），返回检测到的唱歌时间段 JSON。")
-    audio_in = gr.Audio(type="filepath", label="上传音频文件（WAV）")
     btn = gr.Button("开始分析")
     status = gr.Textbox(label="分析状态", interactive=False)
     json_out = gr.Code(label="唱歌片段时间戳（JSON）", language="json")

 import librosa
 import numpy as np
 import torch
+from transformers import AutoProcessor, AutoModelForAudioClassification
 import tempfile
 import soundfile as sf
 import json
 SAMPLE_RATE = 16000
+CHUNK_SIZE = 60  # 模型要求60秒输入
+STEP = 10        # 滑动步长（秒）
+MUSIC_THRESHOLD = 0.5
 VOICE_THRESHOLD = 0.3
+MIN_SEG_DURATION = 8  # 最小合并段时长
+# === 加载固定模型（适用于 60s 音频输入） ===
 music_model_id = "AI-Music-Detection/ai_music_detection_large_60s"
+music_processor = AutoProcessor.from_pretrained(music_model_id)
 music_model = AutoModelForAudioClassification.from_pretrained(music_model_id)
+voice_model_id = "superb/hubert-large-superb-sid"
+voice_processor = AutoProcessor.from_pretrained(voice_model_id)
+voice_model = AutoModelForAudioClassification.from_pretrained(voice_model_id)
+def predict_music_score(wav):
+    wav = librosa.util.fix_length(wav, size=SAMPLE_RATE * CHUNK_SIZE)
+    inputs = music_processor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
     with torch.no_grad():
         outputs = music_model(**inputs)
+        scores = torch.softmax(outputs.logits, dim=-1).squeeze()
+        music_score = float(scores[1]) if scores.numel() > 1 else float(scores[0])
     return music_score
+def predict_voice_score(wav):
+    wav = librosa.util.fix_length(wav, size=SAMPLE_RATE * CHUNK_SIZE)
+    inputs = voice_processor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        outputs = voice_model(**inputs)
+        scores = torch.softmax(outputs.logits, dim=-1).squeeze()
+        voice_score = float(scores.mean())  # 简单平均
+    return voice_score
 def detect_singing(audio_path):
     wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
     duration = len(wav) / SAMPLE_RATE
     results = []
+    for start in np.arange(0, max(0, duration - CHUNK_SIZE), STEP):
+        end = start + CHUNK_SIZE
         snippet = wav[int(start * SAMPLE_RATE):int(end * SAMPLE_RATE)]
         music_score = predict_music_score(snippet)
+        voice_score = predict_voice_score(snippet)
         if music_score > MUSIC_THRESHOLD and voice_score > VOICE_THRESHOLD:
             results.append((float(start), float(end)))
             merged.append(list(seg))
         else:
             merged[-1][1] = seg[1]
+    merged = [(s, e) for s, e in merged if e - s >= MIN_SEG_DURATION]
     return merged
     if file is None:
         return "请上传音频文件", None
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+        data, sr = librosa.load(file, sr=SAMPLE_RATE)
         sf.write(tmp.name, data, sr)
         segments = detect_singing(tmp.name)
     if not segments:
+        return "未检测到唱歌片段", json.dumps([], indent=2)
     json_output = json.dumps(
         [{"start": s, "end": e, "duration": round(e - s, 2)} for s, e in segments],
     return f"检测到 {len(segments)} 段唱歌片段", json_output
+with gr.Blocks(title="🎵 Singing Segment Detector (Plan A)") as demo:
+    gr.Markdown(
+        "# 🎤 高精度唱歌片段检测\n"
+        "使用 `AI-Music-Detection/ai_music_detection_large_60s` 模型。\n"
+        "将视频音频分块分析（60s输入），输出唱歌时间戳 JSON。"
+    )
+    audio_in = gr.Audio(type="filepath", label="上传音频文件（从视频抽取）")
     btn = gr.Button("开始分析")
     status = gr.Textbox(label="分析状态", interactive=False)
     json_out = gr.Code(label="唱歌片段时间戳（JSON）", language="json")