aai/clay_manager.py

198 lines
6.8 KiB
Python
Raw Normal View History

2026-01-26 15:19:15 +00:00
#!/usr/bin/env python3
"""
🛠 Clay Checkpoint Manager - CLI do zarządzania checkpointami
"""
import argparse
import json
import shutil
from pathlib import Path
from datetime import datetime
from config import cfg
def list_checkpoints():
    """Print every checkpoint found in ``cfg.checkpoints_dir``, newest first.

    Checkpoints are files matching ``clay_checkpoint_*.pt``. When a
    checkpoint has a JSON file with the same stem next to it, the epoch,
    step, loss, size and timestamp stored under its ``checkpoint_info`` key
    are printed; otherwise only the file name and its on-disk size are shown.
    """
    found = list(Path(cfg.checkpoints_dir).glob("clay_checkpoint_*.pt"))
    if not found:
        print("❌ Brak checkpointów")
        return

    bytes_per_mb = 1024 * 1024
    print(f"\n📁 CLAY CHECKPOINTS ({len(found)}):")
    print("=" * 80)

    # Most recently modified checkpoint goes first.
    found.sort(key=lambda path: path.stat().st_mtime, reverse=True)
    for checkpoint in found:
        sidecar = checkpoint.with_suffix('.json')

        if not sidecar.exists():
            # No metadata available: fall back to the size reported by the OS.
            megabytes = checkpoint.stat().st_size / bytes_per_mb
            print(f"📄 {checkpoint.name} ({megabytes:.1f}MB)")
            continue

        # Read the metadata saved alongside the checkpoint.
        with open(sidecar, 'r') as handle:
            meta = json.load(handle)['checkpoint_info']
        megabytes = meta['file_size'] / bytes_per_mb
        print(f"📄 {checkpoint.name}")
        print(f" • Epoka: {meta['epoch']} | Krok: {meta['step']:,}")
        print(f" • Loss: {meta['loss']:.4f} | Rozmiar: {megabytes:.1f}MB")
        print(f" • Data: {meta['timestamp']}")
        print("-" * 40)
def show_training_stats():
    """Print the summary stored in ``training_stats.json``.

    The file is looked up inside ``cfg.checkpoints_dir``. Keys missing from
    the JSON fall back to ``0`` (or ``'N/A'`` for the completion time). When
    the file does not exist, a notice is printed instead.
    """
    stats_file = Path(cfg.checkpoints_dir) / "training_stats.json"
    if not stats_file.exists():
        print("❌ Brak statystyk treningu")
        return

    with open(stats_file, 'r') as f:
        stats = json.load(f)

    print("\n📊 STATYSTYKI TRENINGU:")
    print("=" * 60)

    # Split the elapsed seconds into whole hours and whole minutes.
    # Formatting the float quotients with ".0f" would round instead of
    # truncate (5400 s -> "2h 30m", 3599 s -> "1h 60m"), so use integer
    # arithmetic.
    hours, leftover_seconds = divmod(int(stats.get('total_time', 0)), 3600)
    minutes = leftover_seconds // 60

    print(f" • Całkowity czas: {hours}h {minutes}m")
    print(f" • Ostatni loss: {stats.get('final_loss', 0):.4f}")
    print(f" • Najlepszy loss: {stats.get('best_loss', 0):.4f}")
    print(f" • Średni loss: {stats.get('avg_loss', 0):.4f}")
    print(f" • Sprawdzone kroki: {stats.get('total_steps', 0):,}")
    print(f" • Zakończono: {stats.get('completion_time', 'N/A')}")
def cleanup_checkpoints(keep=5):
    """Delete all but the ``keep`` most recently modified checkpoints.

    Checkpoints are files matching ``clay_checkpoint_*.pt`` inside
    ``cfg.checkpoints_dir``. For every deleted checkpoint, the JSON file
    with the same stem is removed as well when it exists.

    Args:
        keep: Number of newest checkpoints to preserve. ``0`` removes every
            checkpoint; a negative value is rejected and nothing is deleted.
    """
    if keep < 0:
        # A negative count would make the slice below delete the *oldest*
        # |keep| files and report a nonsensical number of kept checkpoints.
        print("❌ Liczba zachowywanych checkpointów nie może być ujemna")
        return

    checkpoints = list(Path(cfg.checkpoints_dir).glob("clay_checkpoint_*.pt"))
    if len(checkpoints) <= keep:
        print(f"✅ Wszystkie checkpointy zachowane (mniej niż {keep})")
        return

    # Oldest first, so the files to delete form a prefix of the list.
    checkpoints.sort(key=lambda x: x.stat().st_mtime)
    # Slice by an explicit count: ``checkpoints[:-keep]`` is empty for
    # keep == 0 because ``-0 == 0``, which silently deleted nothing.
    to_delete = checkpoints[:len(checkpoints) - keep]

    print(f"\n🗑️ Usuwanie {len(to_delete)} starych checkpointów:")
    total_freed = 0
    for cp in to_delete:
        size_mb = cp.stat().st_size / (1024 * 1024)
        total_freed += size_mb
        print(f"{cp.name} ({size_mb:.1f}MB)")
        cp.unlink()
        # Remove the accompanying JSON metadata too.
        json_file = cp.with_suffix('.json')
        if json_file.exists():
            json_file.unlink()

    print(f"\n✅ Zachowano {keep} najnowszych checkpointów")
    print(f"💰 Zwolniono {total_freed:.1f}MB")
def export_checkpoint(checkpoint_name, export_dir="exports"):
    """Copy a checkpoint and its JSON metadata into ``export_dir``.

    Args:
        checkpoint_name: File name of the checkpoint, resolved relative to
            ``cfg.checkpoints_dir``.
        export_dir: Destination directory; created (including missing
            parents) when it does not exist yet.

    Prints an error and returns without copying when the checkpoint is not
    an existing regular file.
    """
    cp_path = Path(cfg.checkpoints_dir) / checkpoint_name
    # ``is_file`` instead of ``exists``: a directory (e.g. an empty name that
    # resolves to the checkpoints directory itself) cannot be copied by
    # ``shutil.copy2`` and would raise.
    if not cp_path.is_file():
        print(f"❌ Checkpoint {checkpoint_name} nie istnieje")
        return

    # Create the export directory; ``parents=True`` allows nested paths.
    export_path = Path(export_dir)
    export_path.mkdir(parents=True, exist_ok=True)

    # Copy the checkpoint and, when present, its JSON. Both land directly in
    # the export directory under their base names so they stay side by side.
    dest_path = export_path / cp_path.name
    shutil.copy2(cp_path, dest_path)

    json_file = cp_path.with_suffix('.json')
    if json_file.exists():
        shutil.copy2(json_file, export_path / json_file.name)

    print(f"✅ Checkpoint wyeksportowany do: {dest_path}")
def show_checkpoint_info(checkpoint_name):
    """Print the metadata recorded for a single checkpoint.

    The checkpoint is looked up by name inside ``cfg.checkpoints_dir``. Its
    details come from the JSON file with the same stem, which is read for
    the ``checkpoint_info`` and ``training_stats`` sections. An error line is
    printed when either the checkpoint or that JSON file is missing.
    """
    target = Path(cfg.checkpoints_dir) / checkpoint_name
    if not target.exists():
        print(f"❌ Checkpoint {checkpoint_name} nie istnieje")
        return

    sidecar = target.with_suffix('.json')
    if not sidecar.exists():
        print("❌ Brak informacji JSON dla tego checkpointu")
        return

    with sidecar.open('r') as handle:
        payload = json.load(handle)

    print(f"\n📋 INFO O CHECKPOINCIE: {checkpoint_name}")
    print("=" * 60)

    basics = payload['checkpoint_info']
    training = payload['training_stats']
    size_in_mb = basics['file_size'] / (1024 * 1024)

    # Basic facts about the checkpoint file itself.
    print("📁 PODSTAWOWE INFORMACJE:")
    print(f" • Epoka: {basics['epoch']}")
    print(f" • Krok: {basics['step']:,}")
    print(f" • Loss: {basics['loss']:.4f}")
    print(f" • Rozmiar: {size_in_mb:.1f}MB")
    print(f" • Data: {basics['timestamp']}")

    # Training statistics stored in the same JSON file.
    print("\n📊 STATYSTYKI TRENINGU:")
    print(f" • Całkowity czas: {training['total_time']:.0f}s")
    print(f" • Średni loss: {training['avg_loss']:.4f}")
    print(f" • Current LR: {training['current_lr']:.6f}")
    print(f" • Kroki: {training['steps_done']:,}")
def main():
    """Parse the command line and run the first matching action.

    Flags are checked in a fixed order (``--list``, ``--stats``,
    ``--cleanup``, ``--export``, ``--info``, ``--export-all``) and only the
    first one that is set is executed. With no flag given, a short usage
    summary is printed.
    """
    cli = argparse.ArgumentParser(description="Clay Checkpoint Manager")
    option = cli.add_argument
    option("--list", action="store_true", help="Lista checkpointów")
    option("--stats", action="store_true", help="Pokaż statystyki")
    # Bare ``--cleanup`` (no value) keeps the 5 newest checkpoints.
    option("--cleanup", type=int, nargs='?', const=5, help="Wyczyść stare checkpointy (domyślnie: 5)")
    option("--export", type=str, help="Eksportuj checkpoint")
    option("--info", type=str, help="Info o konkretnym checkpoincie")
    option("--export-all", action="store_true", help="Eksportuj wszystkie checkpointy")
    opts = cli.parse_args()

    if opts.list:
        list_checkpoints()
        return
    if opts.stats:
        show_training_stats()
        return
    if opts.cleanup is not None:
        cleanup_checkpoints(opts.cleanup)
        return
    if opts.export:
        export_checkpoint(opts.export)
        return
    if opts.info:
        show_checkpoint_info(opts.info)
        return
    if opts.export_all:
        every_checkpoint = list(Path(cfg.checkpoints_dir).glob("clay_checkpoint_*.pt"))
        for checkpoint in every_checkpoint:
            export_checkpoint(checkpoint.name)
        return

    # Nothing selected: show the usage summary.
    usage = (
        "\n🛠️ Clay Checkpoint Manager",
        "=" * 40,
        "Użyj:",
        " --list # Lista checkpointów",
        " --stats # Statystyki treningu",
        " --cleanup [N] # Zostaw N najnowszych (domyślnie 5)",
        " --export NAME # Eksportuj checkpoint",
        " --info NAME # Info o checkpoincie",
        " --export-all # Eksportuj wszystkie",
        "\nPrzykłady:",
        " python clay_manager.py --list",
        " python clay_manager.py --cleanup 3",
        " python clay_manager.py --info clay_checkpoint_ep2_step5000_20240126_143022.pt",
    )
    for line in usage:
        print(line)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()