Tensorboard 생성

Tensorboard 생성


학습 로그 저장 경로 확인

  • Tensorboard 생성 전, 학습할 Notebook/PyTorchJob 생성 필수
  • 진행한 학습 로그 저장 경로 확인
    • (예시) LOGDIR = "/home/irteam/tb_logs"
  • Tensorboard를 연결하기 위한 코드 주석 참고
    • import os import time import torch import torch.nn as nn import torch.optim as optim from torchvision import datasets from torch.utils.tensorboard import SummaryWriter # TensorBoard 로그를 "쓰기 가능한 PVC경로"로 저장 # /home -> /home1(심볼릭링크)이고, /home/irteam 아래가 NFS(rw)로 마운트되어 있음 LOGDIR = "/home/irteam/tb_logs" assert torch.cuda.is_available(), "CUDA GPU not available" assert torch.cuda.device_count() >= 4, "Need at least 4 GPUs" device = torch.device("cuda") torch.backends.cuda.matmul.allow_tf32 = True print("Visible GPUs:", torch.cuda.device_count()) # run 별로 폴더 생성 run_name = time.strftime("run-%Y%m%d-%H%M%S") log_path = os.path.join(LOGDIR, run_name) os.makedirs(log_path, exist_ok=True) writer = SummaryWriter(log_dir=log_path) print("TensorBoard logdir:", log_path) raw = datasets.FashionMNIST( root="/tmp/data", train=True, download=True, transform=None ) class FMNISTTensor(torch.utils.data.Dataset): def __init__(self, base): self.x = base.data self.y = base.targets def __len__(self): return self.x.size(0) def __getitem__(self, idx): x = self.x[idx].unsqueeze(0).float().div_(255.0) y = self.y[idx] return x, y train_dataset = FMNISTTensor(raw) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=1024, # 총 배치 (GPU당 256) shuffle=True, num_workers=0, pin_memory=True ) model = nn.Sequential( nn.Conv2d(1, 64, 3, padding=1), nn.ReLU(), nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2), nn.Flatten(), nn.Linear(128 * 14 * 14, 512), nn.ReLU(), nn.Linear(512, 10) ) # 4-GPU DataParallel model = nn.DataParallel(model, device_ids=[0, 1, 2, 3]).to(device) optimizer = optim.Adam(model.parameters(), lr=1e-3) criterion = nn.CrossEntropyLoss() scaler = torch.cuda.amp.GradScaler() print("Start training (4-GPU monitoring test, ~10 min)") end = time.time() + 600 step = 0 model.train() while time.time() < end: for x, y in train_loader: x = x.to(device, non_blocking=True) y = y.to(device, non_blocking=True) optimizer.zero_grad(set_to_none=True) with 
torch.cuda.amp.autocast(dtype=torch.float16): out = model(x) loss = criterion(out, y) scaler.scale(loss).backward() scaler.step(optimizer) scaler.update() step += 1 # TensorBoard 로깅 (너 목적: 학습 진행 + GPU 메모리 확인) if step % 10 == 0: writer.add_scalar("train/loss", float(loss.item()), step) writer.add_scalar("train/lr", float(optimizer.param_groups[0]["lr"]), step) # GPU 메모리 메트릭 writer.add_scalar( "gpu/mem_allocated_MiB", torch.cuda.memory_allocated() / 1024 / 1024, step ) writer.add_scalar( "gpu/mem_reserved_MiB", torch.cuda.memory_reserved() / 1024 / 1024, step ) writer.add_scalar( "gpu/max_mem_allocated_MiB", torch.cuda.max_memory_allocated() / 1024 / 1024, step ) if step % 50 == 0: print(f"step={step}, loss={loss.item():.4f}") if time.time() >= end: break torch.cuda.synchronize() # 로그 flush/close writer.flush() writer.close() print("Done") print("TensorBoard logdir:", log_path) print("Max GPU Mem (MiB):", torch.cuda.max_memory_allocated() / 1024 / 1024)

TensorBoard Create 버튼 클릭

notion image
 

New TensorBoard 팝업에서 Mount Path 설정

  • TensorBoard의 Mount Path는 학습 코드의 LOGDIR 경로와 동일하게 설정
notion image
  • Tensorboard의 CRD와 Pod가 생성되며 Connect 버튼이 활성화됨
    • kim@Clush:~$ k get tensorboards
      NAME    AGE
      test    103m
      test2   93m    # 생성했던 Tensorboard
      kim@Clush:~$ k get po
      NAME                                               READY   STATUS      RESTARTS   AGE
      gpu-burn-3m-worker-0                               0/1     Completed   0          2d3h
      ml-pipeline-ui-artifact-7cf99c78c4-2422m           2/2     Running     0          3d4h
      ml-pipeline-visualizationserver-74dd4d6b77-z2h8t   2/2     Running     0          3d4h
      monitoring-test-0                                  2/2     Running     0          153m
      test-1-0                                           2/2     Running     0          2d7h
      test-8794f985-f2x9b                                2/2     Running     0          103m
      test-gpu-0                                         2/2     Running     0          2d4h
      test-with-config-0                                 0/2     Pending     0          6h33m
      test2-766c68c4cf-x642n                             2/2     Running     0          93m
      volume-test-1-0                                    0/2     Pending     0          128m
      kim@Clush:~$
notion image
 

Connect 버튼 클릭 후 Tensorboard 활성화

  • Tensorboard로 학습 모니터링
notion image