Skip to content

GPU & ML Workflows

Runtime → Change runtime type → Hardware accelerator → GPU (T4/V100/A100)

# Verify GPU
import torch

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using: {device}")
if device.type == 'cuda':
    # The device queries below raise on a CPU-only runtime, so they are
    # only issued once CUDA is known to be available.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("No GPU detected; enable one via Runtime > Change runtime type.")
# TensorFlow: ask the runtime which physical devices of type 'GPU' it can see.
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
# Prints the list returned above; presumably empty when no GPU is attached — confirm on a CPU runtime.
print(f"GPUs: {gpus}")
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
# Device setup: 'cuda' when torch reports CUDA as available, otherwise 'cpu'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Simple model
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer.
        output_dim: Number of output units.
        dropout: Dropout probability applied after the activation.
            Defaults to 0.2, the value that used to be hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        )

    def forward(self, x):
        """Return the output of the final linear layer for ``x``."""
        return self.net(x)
# Instantiate the network (784 inputs, 256 hidden units, 10 outputs) and move it to `device`.
model = MLP(784, 256, 10).to(device)
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Loss passed to train() as `criterion`.
criterion = nn.CrossEntropyLoss()
# Training loop
def train(model, loader, optimizer, criterion, epochs=10, device=None):
    """Train ``model`` for ``epochs`` passes over ``loader``.

    The mean batch loss is printed once per epoch.

    Args:
        model: Module to optimise; it is switched to training mode.
        loader: Iterable yielding ``(X, y)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(predictions, y)`` returning a scalar loss.
        epochs: Number of passes over ``loader``.
        device: Device the batches are moved to. When ``None`` it is taken
            from the model's first parameter (CPU if the model has none), so
            the function does not rely on a module-level ``device`` variable.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0  # counted by hand so loaders without __len__ also work
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # An empty loader would otherwise divide by zero.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        print(f"Epoch {epoch+1}: loss={mean_loss:.4f}")
# Save checkpoint to Drive
import os

CHECKPOINT_PATH = '/content/drive/MyDrive/checkpoints/model.pth'
# torch.save does not create missing parent directories, so make sure the folder exists.
os.makedirs(os.path.dirname(CHECKPOINT_PATH), exist_ok=True)
torch.save({
    # NOTE(review): expects `epoch` from the surrounding training loop — confirm it is in scope here.
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, CHECKPOINT_PATH)

# Load checkpoint
# map_location remaps tensors saved on a GPU so the file also loads on a CPU-only runtime.
checkpoint = torch.load(CHECKPOINT_PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# Restore the optimizer state and epoch counter as well, so training can resume where it stopped.
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1
import tensorflow as tf
from tensorflow import keras
# Build model: stack the layers one at a time, 784 inputs -> 256 -> dropout -> 128 -> 10.
model = keras.Sequential()
model.add(keras.layers.Dense(256, activation='relu', input_shape=(784,)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(128, activation='relu'))
model.add(keras.layers.Dense(10, activation='softmax'))

# Optimizer, loss and reported metric for training.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)
# Callbacks: each one is named so its role is visible where the list is assembled.
best_model_checkpoint = keras.callbacks.ModelCheckpoint(
    '/content/drive/MyDrive/model_best.h5',
    save_best_only=True,
)
early_stopping = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3)
callbacks = [best_model_checkpoint, early_stopping, lr_on_plateau]

# Fit for up to 50 epochs in batches of 32, validating on (X_val, y_val).
# X_train / y_train / X_val / y_val are expected to be defined by the caller.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=callbacks,
)
# Enable TPU: Runtime → Change runtime type → TPU
import tensorflow as tf

# Resolve the TPU, connect to it and initialise it before building the strategy.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)
print(f"Number of TPU cores: {strategy.num_replicas_in_sync}")

# The model is created and compiled inside the strategy scope.
# `[...]` and `(...)` are placeholders for your own layers and compile arguments.
with strategy.scope():
    model = keras.Sequential([...])
    model.compile(...)

# NOTE(review): the original indentation was lost; fit is assumed to run outside the scope — confirm.
model.fit(dataset, epochs=10)
!pip install transformers datasets accelerate
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
TrainingArguments,
Trainer
)
import numpy as np
from google.colab import userdata
# Load dataset
import torch
from transformers import DataCollatorWithPadding

dataset = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(batch):
    """Tokenize the ``text`` column of a batch, truncating to 512 tokens.

    Padding is deliberately not applied here: padding inside ``map`` pads to
    the longest example of each *map* batch, not of each training batch. The
    data collator passed to the Trainer pads every training batch instead.
    """
    return tokenizer(batch["text"], truncation=True, max_length=512)


tokenized = dataset.map(tokenize, batched=True)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

# Training arguments
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/distilbert-imdb",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # `evaluation_strategy` was renamed to `eval_strategy`; the old name is
    # rejected by current releases, which an unpinned pip install pulls in.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # mixed precision — faster on GPU; only enabled when CUDA is present
    # because fp16 training is rejected on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # Pads each batch to its own longest sequence.
    data_collator=DataCollatorWithPadding(tokenizer),
)
trainer.train()
# Check GPU memory (notebook shell escape that runs the nvidia-smi command)
!nvidia-smi
# Clear GPU cache (PyTorch)
import torch
torch.cuda.empty_cache()
# Delete large tensors: drop the Python reference first, then clear the cache again.
# `large_tensor` is a placeholder for whichever variable you no longer need.
del large_tensor
torch.cuda.empty_cache()
# Mixed precision training (saves ~50% memory)
# torch.amp replaces the deprecated torch.cuda.amp helpers; the device type is passed explicitly.
from torch.amp import GradScaler, autocast

scaler = GradScaler('cuda')

# Only the forward pass and the loss computation run under autocast.
# `model`, `input`, `target`, `criterion` and `optimizer` come from your training loop.
with autocast('cuda'):
    output = model(input)
    loss = criterion(output, target)

# Backward runs on the scaled loss; the optimizer is stepped through the
# scaler, which then updates its scale factor for the next iteration.
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
# Gradient checkpointing (trade compute for memory)
# NOTE(review): the MLP class in this file does not define this method; presumably
# `model` is the Hugging Face model loaded with from_pretrained — confirm.
model.gradient_checkpointing_enable()
# TensorBoard in Colab (IPython magics: load the extension, then open it on the log directory)
%load_ext tensorboard
%tensorboard --logdir /content/logs
# With PyTorch: write scalars to the same directory TensorBoard is pointed at above.
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('/content/logs')
# `loss`, `val_acc` and `epoch` are expected to come from your training loop;
# `epoch` is passed as the step value for both scalars.
writer.add_scalar('Loss/train', loss.item(), epoch)
writer.add_scalar('Accuracy/val', val_acc, epoch)
# Weights & Biases
!pip install wandb
import wandb
from google.colab import userdata
# The API key is read from Colab's userdata store under the name WANDB_API_KEY
# instead of being hard-coded in the notebook.
wandb.login(key=userdata.get('WANDB_API_KEY'))
# Start a run in project "my-project" and record the hyperparameters as its config.
wandb.init(project="my-project", config={"lr": 0.001, "epochs": 10})
# `loss` and `acc` are expected to come from your training loop.
wandb.log({"loss": loss.item(), "accuracy": acc})