karesaeedff committed (verified)
Commit 57c5b85 · 1 Parent(s): fc9c607

Update app.py

Files changed (1): app.py (+38, -39)

app.py CHANGED
@@ -2,60 +2,56 @@ import gradio as gr
 import librosa
 import numpy as np
 import torch
-from transformers import AutoModelForAudioClassification, AutoFeatureExtractor, pipeline
+from transformers import AutoProcessor, AutoModelForAudioClassification
 import tempfile
 import soundfile as sf
 import json
 
 SAMPLE_RATE = 16000
-WINDOW = 10
-STEP = 5
-MUSIC_THRESHOLD = 0.4
+CHUNK_SIZE = 60  # the model expects 60-second inputs
+STEP = 10  # sliding-window step (seconds)
+MUSIC_THRESHOLD = 0.5
 VOICE_THRESHOLD = 0.3
-MIN_SING_DURATION = 8
+MIN_SEG_DURATION = 8  # minimum merged-segment duration
 
-# === Model loading ===
+# === Load the fixed models (built for 60 s audio inputs) ===
 music_model_id = "AI-Music-Detection/ai_music_detection_large_60s"
-music_feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
+music_processor = AutoProcessor.from_pretrained(music_model_id)
 music_model = AutoModelForAudioClassification.from_pretrained(music_model_id)
-voice_pipe = pipeline("audio-classification", model="superb/hubert-large-superb-sid")
-
-def predict_music_score(snippet):
-    """
-    Run the feature extractor and model by hand
-    to avoid the pipeline's automatic chunking.
-    """
-    inputs = music_feature_extractor(snippet, sampling_rate=SAMPLE_RATE, return_tensors="pt", truncation=True, padding="max_length")
+
+voice_model_id = "superb/hubert-large-superb-sid"
+voice_processor = AutoProcessor.from_pretrained(voice_model_id)
+voice_model = AutoModelForAudioClassification.from_pretrained(voice_model_id)
+
+def predict_music_score(wav):
+    wav = librosa.util.fix_length(wav, size=SAMPLE_RATE * CHUNK_SIZE)
+    inputs = music_processor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
     with torch.no_grad():
         outputs = music_model(**inputs)
-    scores = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
-    labels = music_model.config.id2label
-    label_scores = {labels[i].lower(): float(scores[i]) for i in range(len(scores))}
-    # look for music- or singing-related labels
-    music_score = max([v for k, v in label_scores.items() if "music" in k or "sing" in k] or [0])
+    scores = torch.softmax(outputs.logits, dim=-1).squeeze()
+    music_score = float(scores[1]) if scores.numel() > 1 else float(scores[0])
     return music_score
 
+def predict_voice_score(wav):
+    wav = librosa.util.fix_length(wav, size=SAMPLE_RATE * CHUNK_SIZE)
+    inputs = voice_processor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        outputs = voice_model(**inputs)
+    scores = torch.softmax(outputs.logits, dim=-1).squeeze()
+    voice_score = float(scores.mean())  # naive average over classes
+    return voice_score
+
 def detect_singing(audio_path):
     wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
     duration = len(wav) / SAMPLE_RATE
     results = []
 
-    for start in np.arange(0, duration - WINDOW, STEP):
-        end = start + WINDOW
+    for start in np.arange(0, max(0, duration - CHUNK_SIZE), STEP):
+        end = start + CHUNK_SIZE
         snippet = wav[int(start * SAMPLE_RATE):int(end * SAMPLE_RATE)]
 
-        # keep the model input to a safe length
-        max_len = SAMPLE_RATE * 60
-        if len(snippet) < SAMPLE_RATE * 3:  # skip over-short snippets
-            continue
-        if len(snippet) > max_len:
-            snippet = snippet[:max_len]
-
         music_score = predict_music_score(snippet)
-
-        voice_pred = voice_pipe(snippet, sampling_rate=SAMPLE_RATE)
-        voice_score = max([p['score'] for p in voice_pred if 'speech' in p['label'].lower()] or [0])
+        voice_score = predict_voice_score(snippet)
 
         if music_score > MUSIC_THRESHOLD and voice_score > VOICE_THRESHOLD:
            results.append((float(start), float(end)))
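The new helpers pad or trim every snippet to the model's fixed 60 s input via librosa.util.fix_length. A standalone sanity check of that behavior (the 5 s buffer is made up for illustration):

    import numpy as np
    import librosa

    SAMPLE_RATE = 16000
    CHUNK_SIZE = 60  # seconds, matching the constants in app.py

    snippet = np.zeros(SAMPLE_RATE * 5, dtype=np.float32)  # hypothetical 5 s snippet
    fixed = librosa.util.fix_length(snippet, size=SAMPLE_RATE * CHUNK_SIZE)
    assert fixed.shape == (SAMPLE_RATE * CHUNK_SIZE,)  # short input is zero-padded to 60 s
    # an over-long snippet would be truncated to the same length

Two caveats worth flagging in the added code: float(scores[1]) assumes class index 1 is the AI-music label, which only music_model.config.id2label can confirm; and since a softmax sums to 1, scores.mean() in predict_voice_score always returns 1/num_classes (a tiny constant for a speaker-ID head with hundreds of classes), so as written it can never exceed VOICE_THRESHOLD = 0.3; something like float(scores.max()) would at least vary with the input.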
@@ -67,7 +63,7 @@ def detect_singing(audio_path):
             merged.append(list(seg))
         else:
             merged[-1][1] = seg[1]
-    merged = [(s, e) for s, e in merged if e - s >= MIN_SING_DURATION]
+    merged = [(s, e) for s, e in merged if e - s >= MIN_SEG_DURATION]
     return merged
 
 
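This hunk shows only the tail of the overlap merge inside detect_singing; a minimal sketch of the full routine it implies (the loop header and the merged initialization are assumptions, since they sit above the hunk):

    def merge_segments(results, min_duration=8):
        # assumed shape of the merge whose tail appears in the hunk above
        merged = []
        for seg in results:
            if not merged or seg[0] > merged[-1][1]:
                merged.append(list(seg))  # no overlap: start a new segment
            else:
                merged[-1][1] = seg[1]    # overlap: extend the previous one
        # drop merged segments shorter than the minimum duration
        return [(s, e) for s, e in merged if e - s >= min_duration]

With CHUNK_SIZE = 60 and STEP = 10, consecutive positive windows overlap by 50 s, so any run of hits collapses into a single long span before the duration filter runs.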
@@ -75,14 +71,13 @@ def analyze_audio(file):
     if file is None:
         return "Please upload an audio file", None
 
-    audio_path = file
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-        data, sr = librosa.load(audio_path, sr=SAMPLE_RATE)
+        data, sr = librosa.load(file, sr=SAMPLE_RATE)
         sf.write(tmp.name, data, sr)
         segments = detect_singing(tmp.name)
 
     if not segments:
-        return "No obvious singing segments detected", json.dumps([], indent=2)
+        return "No singing segments detected", json.dumps([], indent=2)
 
     json_output = json.dumps(
         [{"start": s, "end": e, "duration": round(e - s, 2)} for s, e in segments],
@@ -91,9 +86,13 @@
     return f"Detected {len(segments)} singing segment(s)", json_output
 
 
-with gr.Blocks(title="🎵 Singing Segment Detector") as demo:
-    gr.Markdown("# 🎤 Automatic singing-segment detection\nUpload an audio file (extracted from a video) to get the detected singing time ranges as JSON.")
-    audio_in = gr.Audio(type="filepath", label="Upload audio file (WAV)")
+with gr.Blocks(title="🎵 Singing Segment Detector (Plan A)") as demo:
+    gr.Markdown(
+        "# 🎤 High-accuracy singing-segment detection\n"
+        "Uses the `AI-Music-Detection/ai_music_detection_large_60s` model.\n"
+        "Analyzes the video's audio in chunks (60 s inputs) and outputs singing timestamps as JSON."
+    )
+    audio_in = gr.Audio(type="filepath", label="Upload audio file (extracted from video)")
     btn = gr.Button("Start analysis")
     status = gr.Textbox(label="Analysis status", interactive=False)
     json_out = gr.Code(label="Singing segment timestamps (JSON)", language="json")
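Both versions of the file cut off just before the event wiring; a minimal sketch of the glue that presumably follows (hypothetical, since those lines sit outside the diff):

    # hypothetical wiring, inside the gr.Blocks context shown above
    btn.click(analyze_audio, inputs=audio_in, outputs=[status, json_out])

    # after the context exits
    demo.launch()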
 
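Since the UI expects audio already extracted from a video, a sketch of that upstream step (the file paths are placeholders; the flags request 16 kHz mono to match SAMPLE_RATE):

    import subprocess

    # -vn drops the video stream, -ac 1 downmixes to mono, -ar 16000 resamples to 16 kHz
    subprocess.run(
        ["ffmpeg", "-i", "input.mp4", "-vn", "-ac", "1", "-ar", "16000", "audio.wav"],
        check=True,
    )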