Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 5,597 Bytes
87a952c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
#!/usr/bin/env python3
"""
Generate ablation study data for D3 line chart embeds.
This script generates CSV files for:
1. From scratch ablation - single learning rate schedule
2. Annealing ablation - comparison between main pretraining and ablation decay
"""
import pandas as pd
import numpy as np
import os
# Parameters
max_lr = 2e-4
def generate_from_scratch_schedule():
"""Generate from scratch learning rate schedule - goes to 100B tokens"""
total_tokens = 100e9 # 100B tokens
warmup_end = 0.05 # 5% of total tokens
decay_start = 0.85 # 85% of total tokens
schedule = []
for i in range(1000): # 1000 points for smooth curve
progress = i / 999 # 0 to 1
if progress < warmup_end:
# Linear warmup
lr = max_lr * (progress / warmup_end)
elif progress < decay_start:
# Plateau at max LR
lr = max_lr
else:
# Linear decay to 0
decay_progress = (progress - decay_start) / (1 - decay_start)
lr = max_lr * (1 - decay_progress)
tokens = progress * total_tokens
schedule.append({
'run_name': 'From scratch',
'tokens': tokens,
'learning_rate': lr
})
# Filter out points after learning rate reaches 0
filtered_schedule = []
for point in schedule:
filtered_schedule.append(point)
if point['learning_rate'] == 0 and len(filtered_schedule) > 1:
break
return filtered_schedule
def generate_annealing_schedules():
"""Generate annealing ablation schedules - goes to 11T tokens"""
total_tokens = 11e12 # 11T tokens
main_warmup_end = 0.012 # 1.2% of total tokens
main_decay_start = 0.80 # 80% of total tokens
main_end = 0.95 # 95% of total tokens
# Ablation run parameters - start earlier so it reaches 0 at 7.1T
ablation_start = 0.55 # Start earlier
ablation_end = 0.645 # End at 7.1T (64.5% of 11T)
schedules = []
# Main pretraining run
for i in range(1000):
progress = i / 999
if progress < main_warmup_end:
lr = max_lr * (progress / main_warmup_end)
elif progress < main_decay_start:
lr = max_lr
elif progress < main_end:
# Linear decay
decay_progress = (progress - main_decay_start) / (main_end - main_decay_start)
lr = max_lr * (1 - decay_progress)
else:
lr = 0
tokens = progress * total_tokens
schedules.append({
'run_name': 'Main pretraining',
'tokens': tokens,
'learning_rate': lr
})
# Ablation run (identical to main pretraining until decay starts at 7.1T)
for i in range(1000):
progress = i / 999
if progress < main_warmup_end:
# Same warmup as main pretraining
lr = max_lr * (progress / main_warmup_end)
elif progress < ablation_start:
# Same plateau as main pretraining
lr = max_lr
elif progress < ablation_end:
# Linear decay during ablation period (starts at 7.1T)
decay_progress = (progress - ablation_start) / (ablation_end - ablation_start)
lr = max_lr * (1 - decay_progress)
else:
lr = 0
tokens = progress * total_tokens
schedules.append({
'run_name': 'Ablation decay',
'tokens': tokens,
'learning_rate': lr
})
# Filter out points after learning rate reaches 0 for each series
filtered_schedules = []
main_pretraining_data = [s for s in schedules if s['run_name'] == 'Main pretraining']
ablation_decay_data = [s for s in schedules if s['run_name'] == 'Ablation decay']
# Filter main pretraining - keep all points until 11T
for point in main_pretraining_data:
filtered_schedules.append(point)
# Stop when learning rate reaches 0 (should be around 11T)
if point['learning_rate'] == 0 and len([s for s in filtered_schedules if s['run_name'] == 'Main pretraining']) > 1:
break
# Filter ablation decay
for point in ablation_decay_data:
filtered_schedules.append(point)
if point['learning_rate'] == 0 and len([s for s in filtered_schedules if s['run_name'] == 'Ablation decay']) > 1:
break
return filtered_schedules
def main():
# Create output directory if it doesn't exist
output_dir = "src/content/assets/data"
os.makedirs(output_dir, exist_ok=True)
print("Generating ablation study data...")
# Generate from scratch schedule
from_scratch_data = generate_from_scratch_schedule()
df_from_scratch = pd.DataFrame(from_scratch_data)
df_from_scratch.to_csv(f'{output_dir}/from_scratch_ablation.csv', index=False)
print(f"✓ Saved {output_dir}/from_scratch_ablation.csv with {len(df_from_scratch)} rows")
# Generate annealing schedules
annealing_data = generate_annealing_schedules()
df_annealing = pd.DataFrame(annealing_data)
df_annealing.to_csv(f'{output_dir}/annealing_ablation.csv', index=False)
print(f"✓ Saved {output_dir}/annealing_ablation.csv with {len(df_annealing)} rows")
print("\n✓ Done! CSV files generated successfully.")
print("\nNext steps:")
print("1. Use from_scratch_ablation.csv for the first plot")
print("2. Use annealing_ablation.csv for the second plot")
if __name__ == "__main__":
main()
|