# XLS-R1B

# ============================================================================
# CELL 1: SETUP AND INSTALLATION
# ============================================================================
import os
import warnings

warnings.filterwarnings('ignore')

print("🚀 MMS Language Identification Test (Final Verified Version)")
print("=" * 60)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Install and update necessary packages
# (the exact package list is an assumption -- these are the libraries the later cells import)
print("📦 Installing and updating packages...")
!pip install -q -U transformers librosa xlsxwriter

print("✅ Setup complete! Please restart the runtime now to apply updates.")

# ============================================================================
# CELL 2: MODEL LOADING (Final Verified Version)
# ============================================================================
import torch
import librosa
import pandas as pd
import numpy as np
from datetime import datetime
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
from sklearn.metrics import accuracy_score, classification_report

# --- Your Folder and Language Mappings ---
CUSTOM_FOLDER_MAPPING = {
    'as': 'asm', 'bn': 'ben', 'br': 'brx', 'doi': 'dgo', 'en': 'eng',
    'gu': 'guj', 'hi': 'hin', 'kn': 'kan', 'kok': 'kok', 'ks': 'kas',
    'mai': 'mai', 'ml': 'mal', 'mni': 'mni', 'mr': 'mar', 'ne': 'nep',
    'or': 'ory', 'pa': 'pan', 'sa': 'san', 'sat': 'sat', 'sd': 'snd',
    'ta': 'tam', 'te': 'tel', 'ur': 'urd'
}

ISO_TO_FULL_NAME = {
    'asm': 'Assamese', 'ben': 'Bengali', 'brx': 'Bodo', 'dgo': 'Dogri',
    'eng': 'English', 'guj': 'Gujarati', 'hin': 'Hindi', 'kan': 'Kannada',
    'kok': 'Konkani', 'kas': 'Kashmiri', 'mai': 'Maithili', 'mal': 'Malayalam',
    'mni': 'Manipuri', 'mar': 'Marathi', 'nep': 'Nepali', 'ory': 'Odia',
    'pan': 'Punjabi', 'san': 'Sanskrit', 'sat': 'Santali', 'snd': 'Sindhi',
    'tam': 'Tamil', 'tel': 'Telugu', 'urd': 'Urdu'
}

# --- Update Your Paths ---
AUDIO_FOLDER = "/content/drive/MyDrive/Audio_files"  # <-- Update this
RESULTS_FOLDER = "/content/drive/MyDrive/mms_lid_results"
os.makedirs(RESULTS_FOLDER, exist_ok=True)

# --- Load Components Separately (The Fix) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Device: {device}")

MODEL_NAME = "facebook/mms-lid-256"

# 1. Load the feature extractor ONLY
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)

# 2. Load the model for classification
model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(device)
model.eval()

print(f"✅ MMS LID model and feature extractor loaded successfully: {MODEL_NAME}")
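# ----------------------------------------------------------------------------
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# list which of the target ISO codes actually appear in the mms-lid-256 label
# set. This is useful because some languages use a different code in the model
# (e.g. Nepali is predicted as 'npi', which the normalization cell at the end
# handles explicitly).
# ----------------------------------------------------------------------------
supported_labels = set(model.config.id2label.values())
for iso_code, full_name in sorted(ISO_TO_FULL_NAME.items()):
    status = "supported" if iso_code in supported_labels else "not in label set (check for an alternate code)"
    print(f"{full_name:<12} ({iso_code}): {status}")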
# ============================================================================
# CELL 3: AUDIO PROCESSING AND PREDICTION
# ============================================================================
def load_audio_raw(file_path):
    try:
        audio, sr = librosa.load(file_path, sr=16000, mono=True)
        duration = len(audio) / 16000
        return audio, duration
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, 0


def predict_language_mms(audio_array):
    try:
        # Use the feature_extractor directly
        inputs = feature_extractor(audio_array, sampling_rate=16000, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits

        pred_idx = torch.argmax(logits, dim=-1).item()
        pred_lang_code = model.config.id2label[pred_idx]
        probabilities = torch.softmax(logits, dim=-1)[0]
        confidence = probabilities[pred_idx].item()

        return pred_lang_code, confidence
    except Exception:
        return "error", 0.0


def find_audio_files(base_path):
    audio_files = []
    for root, _, files in os.walk(base_path):
        folder_code = os.path.basename(root).lower()
        if folder_code in CUSTOM_FOLDER_MAPPING:
            ground_truth_iso = CUSTOM_FOLDER_MAPPING[folder_code]
            for file in files:
                if file.lower().endswith(('.wav', '.mp3', '.m4a', '.flac', '.ogg')):
                    audio_files.append({
                        "file_path": os.path.join(root, file),
                        "filename": file,
                        "ground_truth": ground_truth_iso
                    })
    return audio_files


print("✅ Functions are ready!")
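# ----------------------------------------------------------------------------
# Optional single-file smoke test (a minimal sketch). The path below is a
# hypothetical placeholder -- point it at any one clip under AUDIO_FOLDER and
# uncomment to verify the pipeline before running the full batch in Cell 4.
# ----------------------------------------------------------------------------
# sample_path = os.path.join(AUDIO_FOLDER, "hi", "example.wav")  # hypothetical file
# sample_audio, sample_duration = load_audio_raw(sample_path)
# if sample_audio is not None:
#     sample_code, sample_conf = predict_language_mms(sample_audio)
#     print(f"{os.path.basename(sample_path)} ({sample_duration:.1f}s) -> "
#           f"{ISO_TO_FULL_NAME.get(sample_code, sample_code)} ({sample_conf:.3f})")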
# ============================================================================
# CELL 4: PROCESS ALL FILES AND GENERATE REPORT
# ============================================================================
def run_full_analysis():
    print("🚀 Processing FULL dataset with MMS LID Model...")
    audio_files = find_audio_files(AUDIO_FOLDER)
    if not audio_files:
        print("❌ No audio files found. Please check your AUDIO_FOLDER path.")
        return None

    total_files = len(audio_files)
    results = []
    print(f"🔄 Processing {total_files} files...")
    print("-" * 50)

    for i, file_info in enumerate(audio_files):
        if (i + 1) % 50 == 0:
            print(f"Progress: {i+1}/{total_files} ({(i+1)/total_files*100:.1f}%)")

        audio, duration = load_audio_raw(str(file_info['file_path']))
        if audio is None:
            result = {**file_info, "predicted_language": "load_error", "confidence": 0.0,
                      "duration": 0.0, "is_short_file": False}
        else:
            pred_lang_code, confidence = predict_language_mms(audio)
            is_short = duration < 3.0
            result = {**file_info, "predicted_language": pred_lang_code, "confidence": confidence,
                      "duration": duration, "is_short_file": is_short}
            if is_short and pred_lang_code != "error":
                print(f"⚠️ SHORT ({duration:.1f}s): {file_info['filename']} -> "
                      f"{ISO_TO_FULL_NAME.get(pred_lang_code, pred_lang_code)} ({confidence:.3f})")
        results.append(result)

    results_df = pd.DataFrame(results)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_path = f"{RESULTS_FOLDER}/mms_lid_results_{timestamp}.csv"
    results_df.to_csv(csv_path, index=False)
    print(f"\n✅ Processing complete! Results saved to: {csv_path}")

    # --- Detailed Analysis ---
    print("\n" + "=" * 60)
    print("📊 MMS LID MODEL - DETAILED ANALYSIS")
    print("=" * 60)

    valid_data = results_df[(results_df['predicted_language'] != 'error') &
                            (results_df['predicted_language'] != 'load_error')]
    if len(valid_data) > 0:
        overall_accuracy = accuracy_score(valid_data['ground_truth'], valid_data['predicted_language'])
        print(f"\n🎯 OVERALL MODEL ACCURACY: {overall_accuracy:.2%}")

        print("\n📋 LANGUAGE-WISE ACCURACY:")
        report_true = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['ground_truth']]
        report_pred = [ISO_TO_FULL_NAME.get(code, code) for code in valid_data['predicted_language']]
        print(classification_report(report_true, report_pred, zero_division=0))

    short_files = results_df[results_df['is_short_file'] == True]
    valid_short = short_files[(short_files['predicted_language'] != 'error') &
                              (short_files['predicted_language'] != 'load_error')]
    print("\n⚠️ SHORT FILES ANALYSIS (<3 seconds):")
    print(f"Total short files: {len(short_files)}")
    if len(valid_short) > 0:
        avg_conf = valid_short['confidence'].mean()
        print(f"Average confidence for short files: {avg_conf:.3f}")

    print("\n" + "=" * 60)
    print("🏁 ANALYSIS COMPLETE")

    # Return the results so the reporting cells below can use them
    return results_df


# Run the full analysis and keep the results for the next cells
full_results_df = run_full_analysis()
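# ----------------------------------------------------------------------------
# Optional follow-up (a minimal sketch): list the files that failed to load so
# they can be inspected or re-run separately. Assumes the cell above has run.
# ----------------------------------------------------------------------------
if full_results_df is not None:
    failed_files = full_results_df[full_results_df['predicted_language'] == 'load_error']
    print(f"Files that failed to load: {len(failed_files)}")
    if not failed_files.empty:
        print(failed_files[['file_path', 'filename']].to_string(index=False))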
# ============================================================================
# CELL 5: GENERATE FILTERED EXCEL REPORT
# ============================================================================
import pandas as pd
from sklearn.metrics import accuracy_score

# Install the package needed to write Excel files (xlsxwriter, installed in Cell 1)

def generate_filtered_excel_report(df, folder_path):
    """
    Generates an Excel report with overall and per-language accuracy,
    excluding files shorter than 3 seconds from the accuracy calculation.
    """
    if df is None or df.empty:
        print("❌ No results DataFrame found. Please run the analysis in Cell 4 first.")
        return

    print("📊 Generating filtered accuracy report...")

    # --- 1. Filter the DataFrame ---
    # Exclude errors and files shorter than 3 seconds
    accuracy_df = df[
        (df['duration'] >= 3) &
        (df['predicted_language'] != 'error') &
        (df['predicted_language'] != 'load_error')
    ].copy()
    print(f"Total files in accuracy calculation (>= 3s): {len(accuracy_df)} out of {len(df)}")

    # --- 2. Calculate Overall Accuracy ---
    if not accuracy_df.empty:
        overall_accuracy = accuracy_score(accuracy_df['ground_truth'], accuracy_df['predicted_language'])
        summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': f"{overall_accuracy:.2%}"}])
    else:
        summary_df = pd.DataFrame([{'Overall Accuracy (>= 3s)': "N/A"}])

    # --- 3. Calculate Per-Language Accuracy ---
    per_language_stats = []
    if not accuracy_df.empty:
        # Use full names for the report
        accuracy_df['ground_truth_name'] = accuracy_df['ground_truth'].map(ISO_TO_FULL_NAME)
        accuracy_df['predicted_language_name'] = accuracy_df['predicted_language'].map(ISO_TO_FULL_NAME)
        for lang_code, lang_name in sorted(ISO_TO_FULL_NAME.items()):
            lang_df = accuracy_df[accuracy_df['ground_truth'] == lang_code]
            if not lang_df.empty:
                lang_accuracy = accuracy_score(lang_df['ground_truth'], lang_df['predicted_language'])
                per_language_stats.append({
                    'Language': lang_name,
                    'Accuracy': f"{lang_accuracy:.2%}",
                    'File Count (>= 3s)': len(lang_df)
                })
    per_language_df = pd.DataFrame(per_language_stats)

    # --- 4. Save to Excel ---
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = os.path.join(folder_path, f"filtered_accuracy_report_{timestamp}.xlsx")
    with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
        per_language_df.to_excel(writer, sheet_name='Per_Language_Accuracy', index=False)
        df.to_excel(writer, sheet_name='All_Results', index=False)
        accuracy_df.to_excel(writer, sheet_name='Filtered_Results (for accuracy)', index=False)

        # Auto-adjust column widths for readability.
        # Measure each sheet's own DataFrame; the workbook cannot be re-read
        # from disk while it is still being written.
        sheet_frames = {
            'Summary': summary_df,
            'Per_Language_Accuracy': per_language_df,
            'All_Results': df,
            'Filtered_Results (for accuracy)': accuracy_df,
        }
        for sheet_name, frame in sheet_frames.items():
            worksheet = writer.sheets[sheet_name]
            for idx, col in enumerate(frame.columns):
                max_len = max(
                    frame[col].astype(str).map(len).max() if not frame.empty else 0,
                    len(str(col))
                ) + 2
                worksheet.set_column(idx, idx, max_len)

    print(f"\n✅ Filtered Excel report saved successfully to: {report_path}")


# Run the function to generate the report
# This assumes 'full_results_df' was created in the previous cell
if 'full_results_df' in locals():
    generate_filtered_excel_report(full_results_df, RESULTS_FOLDER)
else:
    print("❌ 'full_results_df' not found. Please run the previous cell to process the dataset first.")
# ============================================================================
# CELL 5: LOAD EXISTING RESULTS AND EXTRACT FEATURES
# ============================================================================
import pandas as pd
import numpy as np
import librosa
import os

# --- 1. Load Your Existing CSV File ---
# ⚠️ PASTE THE FULL PATH to your CSV file here
csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"

try:
    full_results_df = pd.read_csv(csv_path)
    print(f"✅ Successfully loaded {len(full_results_df)} records from {csv_path}")
except FileNotFoundError:
    print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
    # Stop execution if the file is not found
    raise

# --- 2. In-Depth Feature Extraction ---
print("\n🚀 Starting in-depth feature extraction...")

def extract_audio_features(row):
    """Calculates SNR proxy and silence ratio for a given audio file."""
    try:
        audio, sr = librosa.load(row['file_path'], sr=16000, mono=True)

        # Calculate RMS energy for silence detection
        rms = librosa.feature.rms(y=audio, frame_length=2048, hop_length=512)[0]

        # Silence Ratio: Percentage of frames below 20% of max energy
        silence_threshold = 0.2 * np.max(rms) if rms.size > 0 else 0
        silence_ratio = np.mean(rms < silence_threshold) if rms.size > 0 else 1.0

        # SNR Proxy: Ratio of energy in loud parts vs. quiet parts
        loud_rms = np.mean(rms[rms >= silence_threshold]) if np.any(rms >= silence_threshold) else 0
        quiet_rms = np.mean(rms[rms < silence_threshold]) if np.any(rms < silence_threshold) else 0
        snr_proxy = 20 * np.log10(loud_rms / (quiet_rms + 1e-7) + 1e-7) if quiet_rms > 0 else 50.0

        return pd.Series([snr_proxy, silence_ratio])
    except Exception:
        return pd.Series([np.nan, np.nan])


# Apply the feature extraction to each row
print("Calculating SNR and silence ratios for all files... (This may take a few minutes)")
features_df = full_results_df.apply(extract_audio_features, axis=1)
features_df.columns = ['snr_proxy', 'silence_ratio']

# Combine the new features with your existing results
analysis_df = pd.concat([full_results_df, features_df], axis=1)
print("✅ Feature extraction complete!")
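# ----------------------------------------------------------------------------
# Optional quick look at the new feature columns (a minimal sketch): summary
# statistics, plus a count of clips that are mostly silence. The 0.5 cut-off is
# an arbitrary illustration, not a value used elsewhere in this notebook.
# ----------------------------------------------------------------------------
print(analysis_df[['snr_proxy', 'silence_ratio']].describe())
mostly_silent = analysis_df['silence_ratio'] > 0.5  # arbitrary illustrative threshold
print(f"Clips that are mostly silence: {int(mostly_silent.sum())} of {len(analysis_df)}")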
# ============================================================================
# CELL 6: COMPREHENSIVE ANALYSIS AND EXCEL REPORT
# ============================================================================
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

# Install xlsxwriter if not already installed (handled in Cell 1)

def generate_comprehensive_report(df, folder_path):
    """
    Generates a comprehensive Excel report with multiple analysis sheets.
    """
    if df is None or df.empty:
        print("❌ 'analysis_df' with features not found. Please run the feature extraction cell first.")
        return

    print("📊 Generating comprehensive analysis report...")

    # --- Create a new Excel writer ---
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = os.path.join(folder_path, f"comprehensive_analysis_report_{timestamp}.xlsx")
    writer = pd.ExcelWriter(report_path, engine='xlsxwriter')

    # --- Sheet 1: All Results with Features ---
    df.to_excel(writer, sheet_name='Results_with_Features', index=False)

    # Filter for valid predictions for all subsequent analyses
    valid_df = df[
        (df['predicted_language'] != 'error') &
        (df['predicted_language'] != 'load_error')
    ].copy()

    # --- Sheet 2 & 3: Calibration Analysis ---
    n_bins = 10
    bins = np.linspace(0, 1, n_bins + 1)
    valid_df['confidence_bin'] = pd.cut(valid_df['confidence'], bins=bins, include_lowest=True, right=True)
    # Ensure all bins are present for groupby
    valid_df['confidence_bin'] = valid_df['confidence_bin'].astype(str)

    calib_data = valid_df.groupby('confidence_bin').apply(lambda x: pd.Series({
        'bin_accuracy': accuracy_score(x['ground_truth'], x['predicted_language']),
        'avg_confidence': x['confidence'].mean(),
        'sample_count': len(x)
    })).reset_index()

    # Expected Calibration Error: weighted average of |accuracy - confidence| per bin
    overall_ece = np.sum(np.abs(calib_data['bin_accuracy'] - calib_data['avg_confidence']) *
                         (calib_data['sample_count'] / len(valid_df)))
    calibration_overview_df = pd.DataFrame([{'Expected Calibration Error (ECE)': f"{overall_ece:.4f}"}])

    calibration_overview_df.to_excel(writer, sheet_name='Calibration_Overview', index=False)
    calib_data.to_excel(writer, sheet_name='Calibration_Bins', index=False)

    # --- Sheets 4, 5, 6: Accuracy vs. Features ---
    def get_accuracy_slice(dataframe, column, bins):
        dataframe[f'{column}_bin'] = pd.cut(dataframe[column], bins=bins, include_lowest=True)
        return dataframe.groupby(f'{column}_bin', observed=False).apply(
            lambda x: accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0
        ).reset_index(name='accuracy')

    acc_vs_duration = get_accuracy_slice(valid_df.copy(), 'duration', bins=[0, 1, 2, 3, 5, 10, np.inf])
    acc_vs_snr = get_accuracy_slice(valid_df.copy(), 'snr_proxy', bins=[-np.inf, 0, 10, 20, 30, 40, np.inf])
    acc_vs_silence = get_accuracy_slice(valid_df.copy(), 'silence_ratio', bins=[-0.01, 0.1, 0.3, 0.5, 0.7, 1.0])

    acc_vs_duration.to_excel(writer, sheet_name='Acc_vs_Duration', index=False)
    acc_vs_snr.to_excel(writer, sheet_name='Acc_vs_SNR', index=False)
    acc_vs_silence.to_excel(writer, sheet_name='Acc_vs_Silence', index=False)

    # --- Sheet 7 & 8: Confusion Matrix and Asymmetry ---
    labels = sorted(list(set(valid_df['ground_truth'].unique()) | set(valid_df['predicted_language'].unique())))
    cm = confusion_matrix(valid_df['ground_truth'], valid_df['predicted_language'], labels=labels)
    cm_df = pd.DataFrame(cm,
                         index=[ISO_TO_FULL_NAME.get(l, l) for l in labels],
                         columns=[ISO_TO_FULL_NAME.get(l, l) for l in labels])
    confusion_asymmetry_df = cm_df.subtract(cm_df.T)

    cm_df.to_excel(writer, sheet_name='Confusion_Matrix')
    confusion_asymmetry_df.to_excel(writer, sheet_name='Confusion_Asymmetry')

    # --- Sheet 9 & 10: Hard Cases Analysis ---
    hard_misclassifications = valid_df[
        (valid_df['ground_truth'] != valid_df['predicted_language']) &
        (valid_df['confidence'] > 0.8)
    ].sort_values('confidence', ascending=False)
    ambiguous_correct = valid_df[
        (valid_df['ground_truth'] == valid_df['predicted_language']) &
        (valid_df['confidence'] < 0.5)
    ].sort_values('confidence', ascending=True)

    hard_misclassifications.to_excel(writer, sheet_name='Hard_Misclassifications', index=False)
    ambiguous_correct.to_excel(writer, sheet_name='Ambiguous_Correct', index=False)

    # --- Save the Excel file ---
    writer.close()
    print(f"\n✅ Comprehensive analysis report saved successfully to: {report_path}")


# Run the function to generate the final report
if 'analysis_df' in locals():
    generate_comprehensive_report(analysis_df, RESULTS_FOLDER)
else:
    print("❌ 'analysis_df' not found. Please run the feature extraction in the previous cell first.")
# ============================================================================
# CELL 6: COMPREHENSIVE ANALYSIS AND EXCEL REPORT (UNIFIED)
# ============================================================================
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix

# Install xlsxwriter if not already installed (handled in Cell 1)

def generate_comprehensive_report(df, folder_path):
    """
    Generates a comprehensive Excel report with multiple analysis sheets.
    """
    if df is None or df.empty:
        print("❌ The 'analysis_df' DataFrame is empty. Please check the previous cell.")
        return

    print("📊 Generating comprehensive analysis report...")

    # --- Create a new Excel writer ---
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = os.path.join(folder_path, f"comprehensive_analysis_report_{timestamp}.xlsx")

    with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
        # --- Sheet 1: All Results with Features ---
        df.to_excel(writer, sheet_name='Results_with_Features', index=False)

        # Filter for valid predictions for all subsequent analyses
        valid_df = df[
            (df['predicted_language'] != 'error') &
            (df['predicted_language'] != 'load_error')
        ].copy()

        # --- Sheet 2 & 3: Calibration Analysis ---
        n_bins = 10
        bins = np.linspace(0, 1, n_bins + 1)
        valid_df['confidence_bin'] = pd.cut(valid_df['confidence'], bins=bins, include_lowest=True, right=True)
        valid_df['confidence_bin'] = valid_df['confidence_bin'].astype(str)

        calib_data = valid_df.groupby('confidence_bin', observed=False).apply(lambda x: pd.Series({
            'bin_accuracy': accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0,
            'avg_confidence': x['confidence'].mean() if not x.empty else 0,
            'sample_count': len(x)
        })).reset_index()

        overall_ece = np.sum(np.abs(calib_data['bin_accuracy'] - calib_data['avg_confidence']) *
                             (calib_data['sample_count'] / len(valid_df)))
        calibration_overview_df = pd.DataFrame([{'Expected Calibration Error (ECE)': f"{overall_ece:.4f}"}])

        calibration_overview_df.to_excel(writer, sheet_name='Calibration_Overview', index=False)
        calib_data.to_excel(writer, sheet_name='Calibration_Bins', index=False)

        # --- Sheets 4, 5, 6: Accuracy vs. Features ---
        def get_accuracy_slice(dataframe, column, bins):
            dataframe[f'{column}_bin'] = pd.cut(dataframe[column], bins=bins, include_lowest=True)
            return dataframe.groupby(f'{column}_bin', observed=False).apply(
                lambda x: accuracy_score(x['ground_truth'], x['predicted_language']) if not x.empty else 0
            ).reset_index(name='accuracy')

        acc_vs_duration = get_accuracy_slice(valid_df.copy(), 'duration', bins=[0, 1, 2, 3, 5, 10, np.inf])
        acc_vs_snr = get_accuracy_slice(valid_df.copy(), 'snr_proxy', bins=[-np.inf, 0, 10, 20, 30, 40, np.inf])
        acc_vs_silence = get_accuracy_slice(valid_df.copy(), 'silence_ratio', bins=[-0.01, 0.1, 0.3, 0.5, 0.7, 1.0])

        acc_vs_duration.to_excel(writer, sheet_name='Acc_vs_Duration', index=False)
        acc_vs_snr.to_excel(writer, sheet_name='Acc_vs_SNR', index=False)
        acc_vs_silence.to_excel(writer, sheet_name='Acc_vs_Silence', index=False)

        # --- Sheet 7 & 8: Confusion Matrix and Asymmetry ---
        labels = sorted(list(set(valid_df['ground_truth'].unique()) | set(valid_df['predicted_language'].unique())))
        cm = confusion_matrix(valid_df['ground_truth'], valid_df['predicted_language'], labels=labels)
        cm_df = pd.DataFrame(cm,
                             index=[ISO_TO_FULL_NAME.get(l, l) for l in labels],
                             columns=[ISO_TO_FULL_NAME.get(l, l) for l in labels])
        confusion_asymmetry_df = cm_df.subtract(cm_df.T)

        cm_df.to_excel(writer, sheet_name='Confusion_Matrix')
        confusion_asymmetry_df.to_excel(writer, sheet_name='Confusion_Asymmetry')

        # --- Sheet 9 & 10: Hard Cases Analysis ---
        hard_misclassifications = valid_df[
            (valid_df['ground_truth'] != valid_df['predicted_language']) &
            (valid_df['confidence'] > 0.8)
        ].sort_values('confidence', ascending=False)
        ambiguous_correct = valid_df[
            (valid_df['ground_truth'] == valid_df['predicted_language']) &
            (valid_df['confidence'] < 0.5)
        ].sort_values('confidence', ascending=True)

        hard_misclassifications.to_excel(writer, sheet_name='Hard_Misclassifications', index=False)
        ambiguous_correct.to_excel(writer, sheet_name='Ambiguous_Correct', index=False)

    print(f"\n✅ Comprehensive analysis report saved successfully to: {report_path}")


# Run the function to generate the final report
# This will now work because 'analysis_df' was created in the cell right above
if 'analysis_df' in locals():
    generate_comprehensive_report(analysis_df, RESULTS_FOLDER)
else:
    print("❌ 'analysis_df' not found. Please re-run the previous cell to load and process the data.")
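# ----------------------------------------------------------------------------
# Optional verification (a minimal sketch): locate the most recent comprehensive
# report on Drive and confirm which sheets it contains.
# ----------------------------------------------------------------------------
import glob
report_files = sorted(glob.glob(os.path.join(RESULTS_FOLDER, "comprehensive_analysis_report_*.xlsx")))
if report_files:
    print("Latest report:", report_files[-1])
    print("Sheets:", pd.ExcelFile(report_files[-1]).sheet_names)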
# ============================================================================
# FINAL ANALYSIS CELL: NORMALIZATION AND DUAL ACCURACY REPORTS
# ============================================================================
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime
import os

# Install xlsxwriter for Excel reporting (handled in Cell 1)

# --- 1. Load Your Existing CSV File ---
# ⚠️ PASTE THE FULL PATH to your most recent CSV file here
csv_path = "/content/drive/MyDrive/mms_lid_results/mms_lid_results_20250925_072344.csv"

try:
    results_df = pd.read_csv(csv_path)
    print(f"✅ Successfully loaded {len(results_df)} records from {csv_path}")
except FileNotFoundError:
    print(f"❌ ERROR: File not found at '{csv_path}'. Please check the path and try again.")
    raise

# --- 2. Define the Comprehensive Normalization Mapping ---
# This dictionary will standardize all known language code variations.
NORMALIZATION_MAPPING = {
    # MMS model's 3-letter codes (prediction) to your 2-letter folder names (ground truth)
    'asm': 'as', 'ben': 'bn', 'brx': 'br', 'dgo': 'doi', 'eng': 'en',
    'guj': 'gu', 'hin': 'hi', 'kan': 'kn', 'kok': 'kok', 'kas': 'ks',
    'mai': 'mai', 'mal': 'ml', 'mni': 'mni', 'mar': 'mr', 'nep': 'ne',
    'ory': 'or', 'pan': 'pa', 'san': 'sa', 'sat': 'sat', 'snd': 'sd',
    'tam': 'ta', 'tel': 'te', 'urd': 'ur',
    # Crucial fix for Nepali
    'npi': 'ne'
}

# --- 3. Apply Normalization ---
print("\nApplying comprehensive normalization to language codes...")
results_df['normalized_prediction'] = results_df['predicted_language'].map(NORMALIZATION_MAPPING)
# Fill any unmapped predictions with a placeholder to mark them as incorrect
results_df['normalized_prediction'] = results_df['normalized_prediction'].fillna('unknown')

# Normalize the ground-truth column the same way, so both sides of the comparison
# use the folder-style codes whether the CSV stores folder codes or 3-letter ISO codes.
results_df['normalized_ground_truth'] = (
    results_df['ground_truth'].map(NORMALIZATION_MAPPING).fillna(results_df['ground_truth'])
)

# --- 4. Define the Analysis Function ---
def generate_accuracy_report(df, report_title):
    """Calculates and returns overall and per-language accuracy DataFrames."""
    print(f"\n--- Generating Report: {report_title} ---")

    # Filter for valid predictions (where normalization resulted in a known language)
    valid_df = df[df['normalized_prediction'] != 'unknown'].copy()
    print(f"Calculating accuracy on {len(valid_df)} valid predictions.")

    if valid_df.empty:
        print("No valid data to report on.")
        return pd.DataFrame([{'Overall Accuracy': 'N/A'}]), pd.DataFrame()

    # Calculate Overall Accuracy
    overall_accuracy = accuracy_score(valid_df['normalized_ground_truth'], valid_df['normalized_prediction'])
    summary_df = pd.DataFrame([{'Overall Accuracy': f"{overall_accuracy:.2%}"}])
    print(f"Overall Accuracy: {overall_accuracy:.2%}")

    # Calculate Per-Language Accuracy
    report_dict = classification_report(valid_df['normalized_ground_truth'], valid_df['normalized_prediction'],
                                        output_dict=True, zero_division=0)
    per_language_df = pd.DataFrame(report_dict).transpose().reset_index().rename(columns={'index': 'Language'})
    # Keep only the rows for actual languages, not the summary rows
    per_language_df = per_language_df[per_language_df['Language'].isin(valid_df['normalized_ground_truth'].unique())]

    return summary_df, per_language_df
# --- 5. Generate Both Reports ---
# Report 1: Including ALL files
all_files_summary_df, all_files_per_lang_df = generate_accuracy_report(results_df, "All Audio Files")

# Report 2: Excluding files < 3 seconds
df_filtered = results_df[results_df['duration'] >= 3].copy()
filtered_summary_df, filtered_per_lang_df = generate_accuracy_report(df_filtered, "Audio Files >= 3 Seconds")

# --- 6. Save Everything to a Single Excel File ---
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_path = os.path.join(os.path.dirname(csv_path), f"final_corrected_analysis_{timestamp}.xlsx")
print(f"\n💾 Saving final corrected analysis to: {report_path}")

with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
    all_files_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_ALL_Files', index=False)
    all_files_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_ALL_Files', index=False)
    filtered_summary_df.to_excel(writer, sheet_name='Overall_Accuracy_>=3_Sec', index=False)
    filtered_per_lang_df.to_excel(writer, sheet_name='Per_Lang_Accuracy_>=3_Sec', index=False)
    results_df.to_excel(writer, sheet_name='Raw_Normalized_Results', index=False)

print("✅ Analysis complete. All reports saved.")
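# ----------------------------------------------------------------------------
# Optional follow-up (a minimal sketch): list any predicted codes that the
# normalization mapping did not recognise, so future gaps like the Nepali
# 'npi' code are easy to spot and add to NORMALIZATION_MAPPING.
# ----------------------------------------------------------------------------
unmapped = results_df.loc[results_df['normalized_prediction'] == 'unknown', 'predicted_language']
if unmapped.empty:
    print("All predicted language codes were recognised by NORMALIZATION_MAPPING.")
else:
    print("Unmapped predicted codes and their counts:")
    print(unmapped.value_counts())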