GPU & ML Workflows
GPU & ML Workflows in Colab
Section titled "GPU & ML Workflows in Colab"
Enabling GPU
Section titled "Enabling GPU"
Runtime → Change runtime type → Hardware accelerator → GPU (T4/V100/A100)
# Verify GPU
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using: {device}")

# The device name/properties queries raise on a CPU-only runtime, so only
# run them when CUDA was actually selected above.
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("GPU: not available (enable one under Runtime > Change runtime type)")
# TensorFlow
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs: {gpus}")
PyTorch Training Pattern
Section titled "PyTorch Training Pattern"
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
# Device setup: use the GPU when the runtime exposes one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Simple model
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: Number of features in each input row.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units. No activation is applied to
            them, so the outputs are raw scores.
        dropout: Dropout probability applied after the ReLU. Defaults to
            the previously hard-coded 0.2.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.net(x)
# Instantiate the model on the selected device, then build the optimizer
# over its parameters and the classification loss.
model = MLP(784, 256, 10).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
# Training loop
def train(model, loader, optimizer, criterion, epochs=10):
    """Train ``model`` for ``epochs`` passes over ``loader``.

    Prints the mean batch loss after every epoch.

    Args:
        model: Module to optimise; switched to training mode.
        loader: Iterable yielding ``(X, y)`` batches.
        optimizer: Optimizer built over ``model``'s parameters.
        criterion: Callable ``criterion(predictions, y)`` returning a scalar loss.
        epochs: Number of passes over ``loader``.
    """
    # Move batches to wherever the model lives instead of relying on a
    # module-level ``device`` that may not exist or may not match the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for X, y in loader:
            X, y = X.to(target_device), y.to(target_device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Count batches ourselves: an empty loader must not divide by zero,
        # and iterables without __len__ are supported too.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}: loss={mean_loss:.4f}")
# Save checkpoint to Drive
import os

checkpoint_path = '/content/drive/MyDrive/checkpoints/model.pth'

# torch.save does not create missing parent folders, so make sure the
# checkpoints directory exists on Drive first.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# NOTE: `epoch` must be in scope here (e.g. run this at the end of an epoch
# inside the training loop) so the checkpoint records where training stopped.
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, checkpoint_path)
# Load checkpoint
checkpoint = torch.load('/content/drive/MyDrive/checkpoints/model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
TensorFlow / Keras Pattern
Section titled "TensorFlow / Keras Pattern"
import tensorflow as tf
from tensorflow import keras
# Build model: 784 inputs -> 256 -> 128 -> 10-way softmax.
model = keras.Sequential([
    keras.layers.Dense(256, activation='relu', input_shape=(784,)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Callbacks: keep the best model on Drive, stop early when training stalls,
# and halve the learning rate on a plateau.
callbacks = [
    keras.callbacks.ModelCheckpoint(
        '/content/drive/MyDrive/model_best.h5',
        save_best_only=True
    ),
    keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
    keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3)
]
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks
)
TPU Usage
Section titled "TPU Usage"
# Enable TPU: Runtime → Change runtime type → TPU
import tensorflow as tf

# Locate the TPU attached to this runtime, connect to it and initialise it.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

strategy = tf.distribute.TPUStrategy(resolver)
print(f"Number of TPU cores: {strategy.num_replicas_in_sync}")

# Build and compile the model inside the strategy scope. Keras is reached
# through `tf.keras` because this section only imports `tensorflow as tf`.
with strategy.scope():
    model = tf.keras.Sequential([...])
    model.compile(...)
model.fit(dataset, epochs=10)
Fine-tuning with Hugging Face
Section titled "Fine-tuning with Hugging Face"
!pip install transformers datasets accelerate
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import numpy as np
from google.colab import userdata

# Load dataset and the tokenizer matching the checkpoint that is fine-tuned below.
dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    """Tokenize the ``"text"`` column of a batch to fixed-length inputs.

    Every example is truncated or padded to exactly 512 tokens. Padding to
    the longest item (``padding=True``) would only equalise lengths within
    one ``map`` batch, so examples from different batches could end up with
    different lengths and fail to stack when no padding collator is used.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
tokenized = dataset.map(tokenize, batched=True)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

# Training arguments
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/distilbert-imdb",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword
    # is rejected by current transformers releases.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,  # mixed precision — faster on GPU
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)
trainer.train()
Memory Management
Section titled "Memory Management"
# Check GPU memory
!nvidia-smi
# Clear GPU cache (PyTorch)
import torch
torch.cuda.empty_cache()

# Delete large tensors: drop the reference first, then clear the cache.
del large_tensor
torch.cuda.empty_cache()
# Mixed precision training (saves ~50% memory)
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

# Clear gradients from the previous step; otherwise backward() accumulates
# into them and the update uses stale gradients.
optimizer.zero_grad()

# Forward pass and loss run under autocast.
with autocast():
    output = model(input)
    loss = criterion(output, target)

# Scale the loss before backward, then step and update through the scaler.
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# Gradient checkpointing (trade compute for memory)
model.gradient_checkpointing_enable()
Monitoring Training
Section titled "Monitoring Training"
# TensorBoard in Colab
%load_ext tensorboard
%tensorboard --logdir /content/logs
# With PyTorch
from torch.utils.tensorboard import SummaryWriter

# Write to the same directory that the %tensorboard magic is pointed at.
writer = SummaryWriter('/content/logs')
writer.add_scalar('Loss/train', loss.item(), epoch)
writer.add_scalar('Accuracy/val', val_acc, epoch)
# Weights & Biases!pip install wandbimport wandbfrom google.colab import userdata
wandb.login(key=userdata.get('WANDB_API_KEY'))wandb.init(project="my-project", config={"lr": 0.001, "epochs": 10})wandb.log({"loss": loss.item(), "accuracy": acc})