import os
import warnings

warnings.filterwarnings('ignore')

print("🌍 MMS Language Identification Test (Final Verified Version)")
print("=" * 60)

# Mount Google Drive so the audio dataset and results folder are reachable.
from google.colab import drive
drive.mount('/content/drive')

print("📦 Installing and updating packages...")
# NOTE: the original cell did not list the packages being installed; the set
# below is an assumption based on the imports used later in this notebook.
# !pip install -q --upgrade transformers librosa pandas scikit-learn xlsxwriter

print("✅ Setup complete! Please restart the runtime now to apply updates.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import librosa
import pandas as pd
import numpy as np
from datetime import datetime
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
from sklearn.metrics import accuracy_score, classification_report

# Maps the dataset's folder names (mostly ISO 639-1 codes) to the ISO 639-3
# codes used as ground-truth labels.
CUSTOM_FOLDER_MAPPING = {
    'as': 'asm', 'bn': 'ben', 'br': 'brx', 'doi': 'dgo', 'en': 'eng',
    'gu': 'guj', 'hi': 'hin', 'kn': 'kan', 'kok': 'kok', 'ks': 'kas',
    'mai': 'mai', 'ml': 'mal', 'mni': 'mni', 'mr': 'mar', 'ne': 'nep',
    'or': 'ory', 'pa': 'pan', 'sa': 'san', 'sat': 'sat', 'sd': 'snd',
    'ta': 'tam', 'te': 'tel', 'ur': 'urd'
}

# Human-readable language names for reports.
ISO_TO_FULL_NAME = {
    'asm': 'Assamese', 'ben': 'Bengali', 'brx': 'Bodo', 'dgo': 'Dogri', 'eng': 'English',
    'guj': 'Gujarati', 'hin': 'Hindi', 'kan': 'Kannada', 'kok': 'Konkani', 'kas': 'Kashmiri',
    'mai': 'Maithili', 'mal': 'Malayalam', 'mni': 'Manipuri', 'mar': 'Marathi', 'nep': 'Nepali',
    'ory': 'Odia', 'pan': 'Punjabi', 'san': 'Sanskrit', 'sat': 'Santali', 'snd': 'Sindhi',
    'tam': 'Tamil', 'tel': 'Telugu', 'urd': 'Urdu'
}

AUDIO_FOLDER = "/content/drive/MyDrive/Audio_files"
RESULTS_FOLDER = "/content/drive/MyDrive/mms_lid_results"
os.makedirs(RESULTS_FOLDER, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Device: {device}")

MODEL_NAME = "facebook/mms-lid-256"

# Load the feature extractor and classification model once, move the model to
# the GPU if one is available, and switch to inference mode.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
model.eval()

print(f"✅ MMS LID model and feature extractor loaded successfully: {MODEL_NAME}")
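# Optional sanity check (not part of the original notebook): confirm that the
# ground-truth ISO 639-3 codes actually appear in the model's label set. The
# normalization cell at the end of this notebook suggests the model emits
# 'npi' rather than 'nep' for Nepali, so 'nep' may legitimately show up here.
model_labels = set(model.config.id2label.values())
missing_codes = sorted(code for code in ISO_TO_FULL_NAME if code not in model_labels)
print(f"Ground-truth codes missing from the model's labels: {missing_codes or 'none'}")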
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_audio_raw(file_path):
    """Load an audio file as a 16 kHz mono waveform; return (audio, duration_s)."""
    try:
        audio, sr = librosa.load(file_path, sr=16000, mono=True)
        duration = len(audio) / 16000
        return audio, duration
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, 0


def predict_language_mms(audio_array):
    """Run MMS LID on a waveform; return (predicted ISO code, softmax confidence)."""
    try:
        inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        pred_idx = torch.argmax(logits, dim=-1).item()
        pred_lang_code = model.config.id2label[pred_idx]

        probabilities = torch.softmax(logits, dim=-1)[0]
        confidence = probabilities[pred_idx].item()

        return pred_lang_code, confidence

    except Exception:
        return "error", 0.0


def find_audio_files(base_path):
    """Walk the dataset tree; each file's parent folder name is its ground-truth label."""
    audio_files = []
    for root, _, files in os.walk(base_path):
        folder_code = os.path.basename(root).lower()
        if folder_code in CUSTOM_FOLDER_MAPPING:
            ground_truth_iso = CUSTOM_FOLDER_MAPPING[folder_code]
            for file in files:
                if file.lower().endswith(('.wav', '.mp3', '.m4a', '.flac', '.ogg')):
                    audio_files.append({
                        "file_path": os.path.join(root, file),
                        "filename": file,
                        "ground_truth": ground_truth_iso
                    })
    return audio_files


print("✅ Functions are ready!")
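# Minimal smoke test (optional, not in the original notebook): run one file
# through the pipeline before committing to the full dataset. The path below
# is a placeholder; point it at any real clip under AUDIO_FOLDER.
sample_path = "/content/drive/MyDrive/Audio_files/hi/example.wav"  # hypothetical path
if os.path.exists(sample_path):
    sample_audio, sample_duration = load_audio_raw(sample_path)
    if sample_audio is not None:
        code, conf = predict_language_mms(sample_audio)
        print(f"{sample_path}: {ISO_TO_FULL_NAME.get(code, code)} "
              f"(confidence {conf:.3f}, {sample_duration:.1f}s)")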
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def run_full_analysis():
    print("🚀 Processing FULL dataset with MMS LID Model...")

    audio_files = find_audio_files(AUDIO_FOLDER)
    if not audio_files:
        print("❌ No audio files found. Please check your AUDIO_FOLDER path.")
        return None

    total_files = len(audio_files)
    results = []

    print(f"📊 Processing {total_files} files...")
    print("-" * 50)

    for i, file_info in enumerate(audio_files):
        if (i + 1) % 50 == 0:
            print(f"Progress: {i+1}/{total_files} ({(i+1)/total_files*100:.1f}%)")

        audio, duration = load_audio_raw(str(file_info['file_path']))
        if audio is None:
            result = {**file_info, "predicted_language": "load_error", "confidence": 0.0,
                      "duration": 0.0, "is_short_file": False}
        else:
            pred_lang_code, confidence = predict_language_mms(audio)
            is_short = duration < 3.0
            result = {**file_info, "predicted_language": pred_lang_code, "confidence": confidence,
                      "duration": duration, "is_short_file": is_short}

            if is_short and pred_lang_code != "error":
                print(f"⚠️ SHORT ({duration:.1f}s): {file_info['filename']} -> "
                      f"{ISO_TO_FULL_NAME.get(pred_lang_code, pred_lang_code)} ({confidence:.3f})")

        results.append(result)

    results_df = pd.DataFrame(results)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = f"{RESULTS_FOLDER}/mms_lid_results_{timestamp}.csv"
    results_df.to_csv(csv_path, index=False)
    print(f"\n✅ Processing complete! Results saved to: {csv_path}")

    print("\n" + "=" * 60)
    print("📊 MMS LID MODEL - DETAILED ANALYSIS")
    print("=" * 60)

    # Note: MMS predicts 'npi' for Nepali while the ground truth uses 'nep';
    # the final cell of this notebook normalizes codes before the corrected report.
    valid_data = results_df[(results_df['predicted_language'] != 'error') &
                            (results_df['predicted_language'] != 'load_error')]

    if len(valid_data) > 0:
        overall_accuracy = accuracy_score(valid_data['ground_truth'], valid_data['predicted_language'])
        print(f"\n🎯 OVERALL MODEL ACCURACY: {overall_accuracy:.2%}")

        print("\n📈 LANGUAGE-WISE ACCURACY:")
        report_true = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['ground_truth']]
        report_pred = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['predicted_language']]
        print(classification_report(report_true, report_pred, zero_division=0))

    short_files = results_df[results_df['is_short_file'] == True]
    valid_short = short_files[(short_files['predicted_language'] != 'error') &
                              (short_files['predicted_language'] != 'load_error')]

    print("\n⚠️ SHORT FILES ANALYSIS (<3 seconds):")
    print(f"Total short files: {len(short_files)}")
    if len(valid_short) > 0:
        avg_conf = valid_short['confidence'].mean()
        print(f"Average confidence for short files: {avg_conf:.3f}")

    print("\n" + "=" * 60)
    print("🏁 ANALYSIS COMPLETE")

    # Return the results so later cells (which expect `full_results_df`) can reuse them.
    return results_df


full_results_df = run_full_analysis()
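# Quick optional look at the run (assumes the cell above completed): how the
# predictions are distributed across codes, including any 'npi' Nepali hits.
if full_results_df is not None:
    print(full_results_df['predicted_language'].value_counts().head(10))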
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
from sklearn.metrics import accuracy_score


def generate_filtered_excel_report(df, folder_path):
    """
    Generates an Excel report with overall and per-language accuracy,
    excluding files shorter than 3 seconds from the accuracy calculation.
    """
    if df is None or df.empty:
        print("❌ No results DataFrame found. Please run the analysis in Cell 4 first.")
        return

    print("📊 Generating filtered accuracy report...")

    # Keep only files that are at least 3 s long and were predicted successfully.
    accuracy_df = df[
        (df['duration'] >= 3) &
        (df['predicted_language'] != 'error') &
        (df['predicted_language'] != 'load_error')
    ].copy()

    print(f"Total files in accuracy calculation (>= 3s): {len(accuracy_df)} out of {len(df)}")

    if not accuracy_df.empty:
        overall_accuracy = accuracy_score(accuracy_df['ground_truth'], accuracy_df['predicted_language'])
        summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': f"{overall_accuracy:.2%}"}])
    else:
        summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': "N/A"}])

    per_language_stats = []
    if not accuracy_df.empty:
        accuracy_df['ground_truth_name'] = accuracy_df['ground_truth'].map(ISO_TO_FULL_NAME)
        accuracy_df['predicted_language_name'] = accuracy_df['predicted_language'].map(ISO_TO_FULL_NAME)

        for lang_code, lang_name in sorted(ISO_TO_FULL_NAME.items()):
            lang_df = accuracy_df[accuracy_df['ground_truth'] == lang_code]
            if not lang_df.empty:
                lang_accuracy = accuracy_score(lang_df['ground_truth'], lang_df['predicted_language'])
                per_language_stats.append({
                    'Language': lang_name,
                    'Accuracy': f"{lang_accuracy:.2%}",
                    'File Count (>= 3s)': len(lang_df)
                })

    per_language_df = pd.DataFrame(per_language_stats)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = os.path.join(folder_path, f"filtered_accuracy_report_{timestamp}.xlsx")

    # Track which DataFrame went to which sheet so columns can be sized below.
    # (The original version re-read the workbook with pd.read_excel while the
    # writer was still open, but the file does not exist on disk until the
    # writer is closed, so widths are computed from the in-memory frames.)
    sheets = {
        'Summary': summary_df,
        'Per_Language_Accuracy': per_language_df,
        'All_Results': df,
        'Filtered_Results (for accuracy)': accuracy_df,
    }

    with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
        for sheet_name, sheet_df in sheets.items():
            sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)

            # Auto-size each column from the longest cell value or the header.
            worksheet = writer.sheets[sheet_name]
            for idx, col in enumerate(sheet_df.columns):
                max_len = max(
                    sheet_df[col].astype(str).map(len).max() if not sheet_df.empty else 0,
                    len(str(col))
                ) + 2
                worksheet.set_column(idx, idx, max_len)

    print(f"\n✅ Filtered Excel report saved successfully to: {report_path}")


if 'full_results_df' in locals() and full_results_df is not None:
    generate_filtered_excel_report(full_results_df, RESULTS_FOLDER)
else:
    print("❌ 'full_results_df' not found. Please run the previous cell to process the dataset first.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import numpy as np
import librosa
import os

# Reload a saved results CSV so the analysis can resume without re-running inference.
csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"

try:
    full_results_df = pd.read_csv(csv_path)
    print(f"✅ Successfully loaded {len(full_results_df)} records from {csv_path}")
except FileNotFoundError:
    print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
    raise

print("\n🔍 Starting in-depth feature extraction...")

def extract_audio_features(row):
    """Calculates an SNR proxy and the silence ratio for a given audio file."""
    try:
        audio, sr = librosa.load(row['file_path'], sr=16000, mono=True)

        # Frame-level energy (RMS) is the basis for both features.
        rms = librosa.feature.rms(y=audio, frame_length=2048, hop_length=512)[0]

        # Frames quieter than 20% of the peak RMS are treated as silence.
        silence_threshold = 0.2 * np.max(rms) if rms.size > 0 else 0
        silence_ratio = np.mean(rms < silence_threshold) if rms.size > 0 else 1.0

        # SNR proxy: ratio of mean "loud" to mean "quiet" frame energy, in dB.
        loud_rms = np.mean(rms[rms >= silence_threshold]) if np.any(rms >= silence_threshold) else 0
        quiet_rms = np.mean(rms[rms < silence_threshold]) if np.any(rms < silence_threshold) else 0
        snr_proxy = 20 * np.log10(loud_rms / (quiet_rms + 1e-7) + 1e-7) if quiet_rms > 0 else 50.0

        return pd.Series([snr_proxy, silence_ratio])

    except Exception:
        return pd.Series([np.nan, np.nan])


print("Calculating SNR and silence ratios for all files... (This may take a few minutes)")
features_df = full_results_df.apply(extract_audio_features, axis=1)
features_df.columns = ['snr_proxy', 'silence_ratio']

analysis_df = pd.concat([full_results_df, features_df], axis=1)

print("✅ Feature extraction complete!")
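# Optional check (not in the original notebook): eyeball the feature ranges to
# confirm the extraction behaved sensibly before slicing accuracy by them.
print(analysis_df[['snr_proxy', 'silence_ratio']].describe())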
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix


def generate_comprehensive_report(df, folder_path):
    """
    Generates a comprehensive Excel report with multiple analysis sheets:
    calibration, accuracy vs. audio quality, confusion matrices, and notable cases.
    """
    if df is None or df.empty:
        print("❌ The 'analysis_df' DataFrame is empty. Please check the previous cell.")
        return

    print("📊 Generating comprehensive analysis report...")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = os.path.join(folder_path, f"comprehensive_analysis_report_{timestamp}.xlsx")

    with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
        # Sheet 1: raw predictions enriched with the audio features.
        df.to_excel(writer, sheet_name='Results_with_Features', index=False)

        valid_df = df[
            (df['predicted_language'] != 'error') &
            (df['predicted_language'] != 'load_error')
        ].copy()

        # Calibration: bucket predictions into 10 confidence bins and compare
        # each bin's average confidence against its empirical accuracy.
        n_bins = 10
        bins = np.linspace(0, 1, n_bins + 1)
        valid_df['confidence_bin'] = pd.cut(valid_df['confidence'], bins=bins, include_lowest=True, right=True)
        valid_df['confidence_bin'] = valid_df['confidence_bin'].astype(str)

        calib_data = valid_df.groupby('confidence_bin', observed=False).apply(lambda x: pd.Series({
            'bin_accuracy': accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0,
            'avg_confidence': x['confidence'].mean() if not x.empty else 0,
            'sample_count': len(x)
        })).reset_index()

        # Expected Calibration Error: sample-weighted mean |accuracy - confidence| over the bins.
        overall_ece = np.sum(np.abs(calib_data['bin_accuracy'] - calib_data['avg_confidence']) * (calib_data['sample_count'] / len(valid_df)))

        calibration_overview_df = pd.DataFrame([{'Expected Calibration Error (ECE)': f"{overall_ece:.4f}"}])
        calibration_overview_df.to_excel(writer, sheet_name='Calibration_Overview', index=False)
        calib_data.to_excel(writer, sheet_name='Calibration_Bins', index=False)

        # Accuracy sliced by duration, SNR proxy, and silence ratio.
        def get_accuracy_slice(dataframe, column, bins):
            dataframe[f'{column}_bin'] = pd.cut(dataframe[column], bins=bins, include_lowest=True)
            return dataframe.groupby(f'{column}_bin', observed=False).apply(lambda x: accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0).reset_index(name='accuracy')

        acc_vs_duration = get_accuracy_slice(valid_df.copy(), 'duration', bins=[0, 1, 2, 3, 5, 10, np.inf])
        acc_vs_snr = get_accuracy_slice(valid_df.copy(), 'snr_proxy', bins=[-np.inf, 0, 10, 20, 30, 40, np.inf])
        acc_vs_silence = get_accuracy_slice(valid_df.copy(), 'silence_ratio', bins=[-0.01, 0.1, 0.3, 0.5, 0.7, 1.0])

        acc_vs_duration.to_excel(writer, sheet_name='Acc_vs_Duration', index=False)
        acc_vs_snr.to_excel(writer, sheet_name='Acc_vs_SNR', index=False)
        acc_vs_silence.to_excel(writer, sheet_name='Acc_vs_Silence', index=False)

        # Confusion matrix, plus its asymmetry (cm - cm.T) to expose one-way confusions.
        labels = sorted(set(valid_df['ground_truth'].unique()) | set(valid_df['predicted_language'].unique()))
        cm = confusion_matrix(valid_df['ground_truth'], valid_df['predicted_language'], labels=labels)
        cm_df = pd.DataFrame(cm, index=[ISO_TO_FULL_NAME.get(l, l) for l in labels], columns=[ISO_TO_FULL_NAME.get(l, l) for l in labels])

        confusion_asymmetry_df = cm_df.subtract(cm_df.T)

        cm_df.to_excel(writer, sheet_name='Confusion_Matrix')
        confusion_asymmetry_df.to_excel(writer, sheet_name='Confusion_Asymmetry')

        # Notable cases: confident mistakes and hesitant correct answers.
        hard_misclassifications = valid_df[
            (valid_df['ground_truth'] != valid_df['predicted_language']) &
            (valid_df['confidence'] > 0.8)
        ].sort_values('confidence', ascending=False)

        ambiguous_correct = valid_df[
            (valid_df['ground_truth'] == valid_df['predicted_language']) &
            (valid_df['confidence'] < 0.5)
        ].sort_values('confidence', ascending=True)

        hard_misclassifications.to_excel(writer, sheet_name='Hard_Misclassifications', index=False)
        ambiguous_correct.to_excel(writer, sheet_name='Ambiguous_Correct', index=False)

    print(f"\n✅ Comprehensive analysis report saved successfully to: {report_path}")


if 'analysis_df' in locals():
    generate_comprehensive_report(analysis_df, RESULTS_FOLDER)
else:
    print("❌ 'analysis_df' not found. Please re-run the previous cell to load and process the data.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
import os

# Reload the saved results for the final, corrected accuracy analysis.
csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"

try:
    results_df = pd.read_csv(csv_path)
    print(f"✅ Successfully loaded {len(results_df)} records from {csv_path}")
except FileNotFoundError:
    print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
    raise

# Normalize the model's predictions into the same ISO 639-3 code set used for
# the ground truth. For most languages this is the identity mapping; the key
# correction is Nepali, where the model predicts 'npi' but the ground truth
# uses 'nep'. Predictions outside this set are collapsed to 'unknown'.
# (The original cell mapped predictions to the two-letter folder codes, which
# could never match the three-letter ground-truth codes produced earlier in
# this notebook; this version keeps both sides in ISO 639-3.)
NORMALIZATION_MAPPING = {code: code for code in ISO_TO_FULL_NAME}
NORMALIZATION_MAPPING['npi'] = 'nep'

print("\nApplying comprehensive normalization to language codes...")
results_df['normalized_prediction'] = results_df['predicted_language'].map(NORMALIZATION_MAPPING)

# Any prediction outside the 23 target languages becomes 'unknown'.
results_df['normalized_prediction'] = results_df['normalized_prediction'].fillna('unknown')


def generate_accuracy_report(df, report_title):
    """Calculates and returns overall and per-language accuracy DataFrames."""
    print(f"\n--- Generating Report: {report_title} ---")

    # Only predictions inside the target language set count toward accuracy.
    valid_df = df[df['normalized_prediction'] != 'unknown'].copy()
    print(f"Calculating accuracy on {len(valid_df)} valid predictions.")

    if valid_df.empty:
        print("No valid data to report on.")
        return pd.DataFrame([{'Overall Accuracy': 'N/A'}]), pd.DataFrame()

    overall_accuracy = accuracy_score(valid_df['ground_truth'], valid_df['normalized_prediction'])
    summary_df = pd.DataFrame([{'Overall Accuracy': f"{overall_accuracy:.2%}"}])
    print(f"Overall Accuracy: {overall_accuracy:.2%}")

    # Per-language precision/recall/F1 from the classification report.
    report_dict = classification_report(valid_df['ground_truth'], valid_df['normalized_prediction'], output_dict=True, zero_division=0)
    per_language_df = pd.DataFrame(report_dict).transpose().reset_index().rename(columns={'index': 'Language'})

    # Keep only rows for languages that actually occur in the ground truth,
    # dropping aggregate rows such as 'accuracy' and 'macro avg'.
    per_language_df = per_language_df[per_language_df['Language'].isin(valid_df['ground_truth'].unique())]

    return summary_df, per_language_df


# Report 1: every file, regardless of length.
all_files_summary_df, all_files_per_lang_df = generate_accuracy_report(results_df, "All Audio Files")

# Report 2: only files of at least 3 seconds.
df_filtered = results_df[results_df['duration'] >= 3].copy()
filtered_summary_df, filtered_per_lang_df = generate_accuracy_report(df_filtered, "Audio Files >= 3 Seconds")

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_path = os.path.join(os.path.dirname(csv_path), f"final_corrected_analysis_{timestamp}.xlsx")

print(f"\n💾 Saving final corrected analysis to: {report_path}")

with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
    all_files_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_ALL_Files', index=False)
    all_files_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_ALL_Files', index=False)
    filtered_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_>=3_Sec', index=False)
    filtered_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_>=3_Sec', index=False)
    results_df.to_excel(writer, sheet_name='Raw_Normalized_Results', index=False)

print("✅ Analysis complete. All reports saved.")
|
|
|