"""
🎯 CONFIG - Shared configuration for MiniGPT-60M
"""

import os
import sys
import random
import numpy as np
import torch
from pathlib import Path
from typing import List, Dict, Any, Optional
import logging
import json

# ==================== LOGGING ====================
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('training.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


# ==================== SYSTEM CONFIGURATION ====================
class SystemConfig:
    """System and device configuration"""

    def __init__(self):
        self.device = self._get_device()
        self.set_seeds(42)
        self._print_info()

    def _get_device(self) -> str:
        """Automatically picks the best available device"""
        if torch.cuda.is_available():
            return "cuda"
        elif torch.backends.mps.is_available():
            return "mps"
        else:
            return "cpu"

    def set_seeds(self, seed: int = 42):
        """Sets seeds for reproducibility"""
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    def _print_info(self):
        """Logs basic system information"""
        logger.info("=" * 60)
        logger.info("🎯 MINIGPT-60M SYSTEM")
        logger.info("=" * 60)
        logger.info(f"Python: {sys.version.split()[0]}")
        logger.info(f"PyTorch: {torch.__version__}")
        logger.info(f"Device: {self.device.upper()}")

        if self.device == "cuda":
            gpu_count = torch.cuda.device_count()
            logger.info(f"CUDA available: {torch.cuda.is_available()}")
            logger.info(f"GPU count: {gpu_count}")
            for i in range(gpu_count):
                mem = torch.cuda.get_device_properties(i).total_memory / 1e9
                logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)} ({mem:.1f} GB)")

        logger.info("=" * 60)


# ==================== MODEL CONFIGURATION ====================
class ModelConfig:
    """Configuration for the ~60M-parameter model"""

    def __init__(self):
        # Vocabulary
        self.vocab_chars = list("aąbcćdeęfghijklłmnńoóprsśtuwyzźżAĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ")
        self.vocab_chars += list("0123456789")
        self.vocab_chars += list(" .,?!:;()[]{}+-*/=<>_\"'`~@#$%^&|\\/\n\t")
        self.vocab_chars += [" ", "\n\n", "\t\t", "->", "::", "=>"]

        # De-duplicate while preserving order ("/" occurs twice in the
        # punctuation string and " " is re-added above), so that vocab_size
        # counts distinct tokens only.
        self.vocab = list(dict.fromkeys(self.vocab_chars))
        self.vocab_size = len(self.vocab)

        # Architecture for ~60M parameters
        self.embed_dim = 768
        self.n_layers = 12
        self.n_heads = 12  # head_dim = 768 / 12 = 64
        self.max_len = 512
        self.ff_dim = self.embed_dim * 4
        self.dropout = 0.1
        self.activation = "gelu"
        self.norm_eps = 1e-5

        # Training
        self.epochs = 3
        self.batch_size = 16 if torch.cuda.is_available() else 4
        self.grad_accum_steps = 4  # effective batch = batch_size * 4
        self.learning_rate = 3e-4
        self.weight_decay = 0.1
        self.adam_beta1 = 0.9
        self.adam_beta2 = 0.95
        self.adam_eps = 1e-8
        self.clip_grad = 1.0
        self.warmup_steps = 2000

        # Mixed precision (CUDA only)
        self.use_amp = torch.cuda.is_available()

        # DataLoader parallelism
        self.num_workers = 4 if torch.cuda.is_available() else 0
        self.pin_memory = True

        # Generation
        self.generation_temperature = 0.8
        self.top_k = 50
        self.top_p = 0.95
        self.repetition_penalty = 1.1

        # Paths
        self.model_dir = "models"
        self.data_dir = "data"
        self.prepared_dir = "prepared_data"
        self.log_dir = "logs"
        self.tensorboard_dir = "runs"
        self.cache_dir = ".cache"
        self.checkpoints_dir = "checkpoints"
        self.resume_file = "resume_state.json"  # resume-state file

        # Create the required directories
        self._create_dirs()

    def _create_dirs(self):
        """Creates the required directories"""
        dirs = [self.model_dir, self.data_dir, self.prepared_dir,
                self.log_dir, self.tensorboard_dir, self.cache_dir,
                "backups", "results", self.checkpoints_dir]

        for d in dirs:
            Path(d).mkdir(parents=True, exist_ok=True)

    def print_config(self):
        """Logs the model configuration"""
        logger.info("=" * 60)
        logger.info("⚙️ MODEL CONFIGURATION")
        logger.info("=" * 60)
        logger.info(f"• Vocab size: {self.vocab_size}")
        logger.info(f"• Embed dim: {self.embed_dim}")
        logger.info(f"• Layers: {self.n_layers}")
        logger.info(f"• Heads: {self.n_heads}")
        logger.info(f"• Context: {self.max_len}")
        logger.info(f"• Batch size: {self.batch_size}")
        logger.info(f"• Learning rate: {self.learning_rate}")
        logger.info(f"• Mixed precision: {self.use_amp}")
        logger.info("=" * 60)

    def save_resume_state(self, state: Dict[str, Any]):
        """Saves the resume state"""
        state_path = Path(self.checkpoints_dir) / self.resume_file
        with open(state_path, 'w', encoding='utf-8') as f:
            json.dump(state, f, indent=2, ensure_ascii=False)
        logger.info(f"💾 State saved to {state_path}")

    def load_resume_state(self) -> Optional[Dict[str, Any]]:
        """Loads the resume state, if present"""
        state_path = Path(self.checkpoints_dir) / self.resume_file
        if state_path.exists():
            with open(state_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None

    def get_latest_checkpoint(self) -> Optional[Path]:
        """Finds the most recent checkpoint"""
        checkpoints = list(Path(self.checkpoints_dir).glob("checkpoint_*.pt"))
        if checkpoints:
            # Sort by modification time, newest first
            checkpoints.sort(key=lambda x: x.stat().st_mtime, reverse=True)
            return checkpoints[0]
        return None

    def get_latest_model(self) -> Optional[Path]:
        """Finds the most recent saved model"""
        models = list(Path(self.model_dir).glob("model_*.pt"))
        if models:
            # Prefer model_final.pt, then fall back to the highest model_epoch_X.pt
            final_model = Path(self.model_dir) / "model_final.pt"
            if final_model.exists():
                return final_model

            # Sort by epoch number
            def get_epoch_num(path: Path) -> int:
                try:
                    # model_epoch_10.pt -> 10
                    name = path.stem
                    return int(name.split('_')[-1])
                except ValueError:
                    return 0

            models.sort(key=get_epoch_num, reverse=True)
            return models[0]
        return None
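

# --- Illustrative sketch, not part of the original module -------------------
# The vocabulary above mixes single characters with a few multi-character
# tokens ("->", "::", "=>", "\n\n", "\t\t"), so encoding needs longest-match
# lookahead. The helper below is one plausible way to consume cfg.vocab; the
# name encode_text and the unk_id fallback are assumptions, not the project's
# actual tokenizer.
def encode_text(text: str, vocab: List[str], unk_id: int = 0) -> List[int]:
    """Greedy longest-match encoding over a mixed char/multi-char vocab."""
    token_to_id = {tok: i for i, tok in enumerate(vocab)}
    max_tok_len = max(len(tok) for tok in vocab)
    ids, pos = [], 0
    while pos < len(text):
        # Try the longest candidate first, then progressively shorter ones
        for length in range(min(max_tok_len, len(text) - pos), 0, -1):
            tok_id = token_to_id.get(text[pos:pos + length])
            if tok_id is not None:
                ids.append(tok_id)
                pos += length
                break
        else:
            # Character not in the vocabulary: map to unk_id and move on
            ids.append(unk_id)
            pos += 1
    return ids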


def get_device(prefer_gpu=True):
    """Picks the best available device; standalone counterpart of SystemConfig._get_device"""
    if prefer_gpu and torch.cuda.is_available():
        return 'cuda'
    elif torch.backends.mps.is_available():  # Apple Silicon
        return 'mps'
    else:
        return 'cpu'
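

# --- Illustrative sketch, not part of the original module -------------------
# One plausible way to consume the optimizer fields above: AdamW with the
# configured betas/eps/weight_decay, plus linear warmup over warmup_steps.
# Holding the rate constant after warmup is an assumption; the training
# script may decay it instead.
def build_optimizer_and_scheduler(model: torch.nn.Module, config: ModelConfig):
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
        betas=(config.adam_beta1, config.adam_beta2),
        eps=config.adam_eps,
        weight_decay=config.weight_decay,
    )

    def warmup(step: int) -> float:
        # Scale the LR linearly from ~0 to 1.0 over the first warmup_steps
        return min(1.0, (step + 1) / config.warmup_steps)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warmup)
    return optimizer, scheduler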


# Shared configuration instances, created at import time
sys_config = SystemConfig()
cfg = ModelConfig()
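

# --- Illustrative usage, not part of the original module --------------------
# Running the file directly logs the configuration and round-trips the
# resume state; the state keys below are hypothetical examples.
if __name__ == "__main__":
    cfg.print_config()
    cfg.save_resume_state({"epoch": 0, "global_step": 0})
    logger.info(f"Resume state: {cfg.load_resume_state()}")
    logger.info(f"Latest checkpoint: {cfg.get_latest_checkpoint()}")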