Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 4,270 Bytes
87a952c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
#!/usr/bin/env python3
"""
Generate ablation study data for D3 line chart embeds.
This script generates CSV files for:
1. From scratch ablation - single learning rate schedule
2. Annealing ablation - comparison between main pretraining and ablation decay
"""
import pandas as pd
import numpy as np
import os
# Parameters shared by every schedule generated in this script.
total_tokens = 100e9  # 100B tokens; progress (0..1) is scaled by this to get the token axis
max_lr = 2e-4  # learning rate used for the plateau phase of each schedule
def generate_from_scratch_schedule(
    *, num_points=1000, peak_lr=None, token_budget=None
):
    """Generate the learning rate schedule for the from-scratch ablation.

    The curve has three phases, expressed as fractions of the token budget:
    a linear warmup from 0 to the peak over the first 5%, a plateau at the
    peak until 85%, then a linear decay that reaches 0 at 100%.

    Args:
        num_points: Number of evenly spaced samples along the curve. The
            default of 1000 reproduces the original resolution. A value of
            1 yields only the sample at 0 tokens; values below 1 yield an
            empty list.
        peak_lr: Learning rate of the plateau. ``None`` (the default) uses
            the module-level ``max_lr``.
        token_budget: Token count of the final sample. ``None`` (the
            default) uses the module-level ``total_tokens``.

    Returns:
        A list of dicts with the keys ``run_name`` (always
        ``'From scratch'``), ``tokens`` and ``learning_rate``, ordered by
        increasing ``tokens``.
    """
    peak = max_lr if peak_lr is None else peak_lr
    budget = total_tokens if token_budget is None else token_budget

    # Based on the HTML: warmup to 5%, plateau until 85%, then decay
    warmup_end = 0.05  # 5% of total tokens
    decay_start = 0.85  # 85% of total tokens

    # Derive the divisor from num_points so the last sample always lands on
    # progress == 1; max() keeps a single-point request from dividing by 0.
    last_index = max(num_points - 1, 1)

    schedule = []
    for i in range(num_points):
        progress = i / last_index  # 0 to 1
        if progress < warmup_end:
            # Linear warmup
            lr = peak * (progress / warmup_end)
        elif progress < decay_start:
            # Plateau at the peak learning rate
            lr = peak
        else:
            # Linear decay to 0
            decay_progress = (progress - decay_start) / (1 - decay_start)
            lr = peak * (1 - decay_progress)
        schedule.append({
            'run_name': 'From scratch',
            'tokens': progress * budget,
            'learning_rate': lr,
        })
    return schedule
def generate_annealing_schedules(
    *, num_points=1000, peak_lr=None, token_budget=None
):
    """Generate the two learning rate curves for the annealing ablation.

    Two runs are sampled on the same progress grid (fractions of the token
    budget) and returned in a single list:

    * ``'Main pretraining'``: linear warmup from 0 to the peak over the
      first 1.2%, plateau until 80%, linear decay to 0 at 95%, then 0.
    * ``'Ablation decay'``: held at the peak until 64%, linear decay to 0
      over the following 10%, then 0.

    Args:
        num_points: Number of evenly spaced samples per run. The default of
            1000 reproduces the original resolution. A value of 1 yields
            only the sample at 0 tokens for each run; values below 1 yield
            an empty list.
        peak_lr: Learning rate of the plateau. ``None`` (the default) uses
            the module-level ``max_lr``.
        token_budget: Token count of the final sample. ``None`` (the
            default) uses the module-level ``total_tokens``.

    Returns:
        A list of dicts with the keys ``run_name``, ``tokens`` and
        ``learning_rate``: all main-run samples first, followed by all
        ablation-run samples, each ordered by increasing ``tokens``.
    """
    peak = max_lr if peak_lr is None else peak_lr
    budget = total_tokens if token_budget is None else token_budget

    # Main pretraining run parameters
    main_warmup_end = 0.012  # 1.2% of total tokens
    main_decay_start = 0.80  # 80% of total tokens
    main_end = 0.95  # 95% of total tokens

    # Ablation run parameters
    ablation_start = 0.64  # 64% of total tokens
    ablation_duration = 0.10  # 10% of total tokens
    ablation_end = ablation_start + ablation_duration

    # Both runs share one sampling grid. The divisor is derived from
    # num_points so the last sample lands on progress == 1; max() keeps a
    # single-point request from dividing by 0.
    last_index = max(num_points - 1, 1)
    grid = [i / last_index for i in range(num_points)]

    schedules = []

    # Main pretraining run
    for progress in grid:
        if progress < main_warmup_end:
            lr = peak * (progress / main_warmup_end)
        elif progress < main_decay_start:
            lr = peak
        elif progress < main_end:
            # Linear decay
            decay_progress = (
                (progress - main_decay_start) / (main_end - main_decay_start)
            )
            lr = peak * (1 - decay_progress)
        else:
            lr = 0
        schedules.append({
            'run_name': 'Main pretraining',
            'tokens': progress * budget,
            'learning_rate': lr,
        })

    # Ablation run (starts from plateau and decays)
    for progress in grid:
        if progress < ablation_start:
            lr = peak  # Plateau
        elif progress < ablation_end:
            # Linear decay during ablation period
            decay_progress = (
                (progress - ablation_start) / (ablation_end - ablation_start)
            )
            lr = peak * (1 - decay_progress)
        else:
            lr = 0
        schedules.append({
            'run_name': 'Ablation decay',
            'tokens': progress * budget,
            'learning_rate': lr,
        })

    return schedules
def main():
    """Write both ablation CSV files and print a short progress report.

    The files are written to ``src/content/assets/data`` (a path relative
    to the current working directory), which is created when missing:

    * ``from_scratch_ablation.csv`` from ``generate_from_scratch_schedule``
    * ``annealing_ablation.csv`` from ``generate_annealing_schedules``
    """
    output_dir = "src/content/assets/data"
    # exist_ok=True lets the script be re-run when the directory is present
    os.makedirs(output_dir, exist_ok=True)

    print("Generating ablation study data...")

    # First plot: the single from-scratch schedule
    scratch_frame = pd.DataFrame(generate_from_scratch_schedule())
    scratch_frame.to_csv(f'{output_dir}/from_scratch_ablation.csv', index=False)
    print(f"✓ Saved {output_dir}/from_scratch_ablation.csv with {len(scratch_frame)} rows")

    # Second plot: main pretraining run versus the ablation decay
    annealing_frame = pd.DataFrame(generate_annealing_schedules())
    annealing_frame.to_csv(f'{output_dir}/annealing_ablation.csv', index=False)
    print(f"✓ Saved {output_dir}/annealing_ablation.csv with {len(annealing_frame)} rows")

    # Closing report, one print() per line as before
    closing_lines = (
        "\n✓ Done! CSV files generated successfully.",
        "\nNext steps:",
        "1. Use from_scratch_ablation.csv for the first plot",
        "2. Use annealing_ablation.csv for the second plot",
    )
    for line in closing_lines:
        print(line)
# Run the generator only when this file is executed as a script, so that
# importing the module does not write any files.
if __name__ == "__main__":
    main()
|