Table of contents

Pods

Manage GPU containers using CLI commands.

Create Pod

Basic Creation

podstack pod create \
  --name my-pod \
  --image pytorch/pytorch:latest \
  --gpu-type A100

Full Options

podstack pod create \
  --name training-pod \
  --project my-project \
  --image pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime \
  --gpu-type A100 \
  --gpu-count 2 \
  --cpu 8 \
  --memory 64 \
  --disk 200 \
  --port 8888 \
  --port 6006 \
  --env WANDB_API_KEY=xxx \
  --env HF_TOKEN=xxx \
  --volume vol-123:/data \
  --command "jupyter lab --ip=0.0.0.0" \
  --label team=ml \
  --label experiment=gpt \
  --wait

Create Options

| Flag | Description |
|------|-------------|
| `--name` | Pod name (required) |
| `--image` | Container image (required) |
| `--gpu-type` | GPU type: A100, H100, V100, L40S, T4 |
| `--gpu-count` | Number of GPUs (default: 1) |
| `--cpu` | CPU cores (default: 4) |
| `--memory` | Memory in GB (default: 16) |
| `--disk` | Disk size in GB (default: 50) |
| `--port` | Expose port (can repeat) |
| `--env` | Environment variable KEY=VALUE (can repeat) |
| `--volume` | Mount volume ID:PATH (can repeat) |
| `--command` | Startup command |
| `--label` | Label KEY=VALUE (can repeat) |
| `--project` | Project ID or name |
| `--wait` | Wait for pod to be running |
| `--timeout` | Wait timeout in seconds |

List Pods

# All pods
podstack pod list

# Filter by status
podstack pod list --status running
podstack pod list --status stopped

# Filter by project
podstack pod list --project my-project

# Filter by label
podstack pod list --label team=ml

# Output formats
podstack pod list --output table  # default
podstack pod list --output json
podstack pod list --output yaml
podstack pod list --output wide   # more columns

# IDs only (for scripting)
podstack pod list --quiet

Get Pod Details

# Basic info
podstack pod get my-pod

# JSON output
podstack pod get my-pod --output json

# Watch for changes
podstack pod get my-pod --watch

Pod Lifecycle

Start Pod

podstack pod start my-pod

# Wait for running
podstack pod start my-pod --wait

Stop Pod

# Graceful stop
podstack pod stop my-pod

# Force stop
podstack pod stop my-pod --force

Restart Pod

podstack pod restart my-pod

Delete Pod

# Delete stopped pod
podstack pod delete my-pod

# Force delete running pod
podstack pod delete my-pod --force

# Delete without confirmation
podstack pod delete my-pod --yes

Connect to Pod

SSH

# Interactive SSH
podstack pod ssh my-pod

# SSH as different user
podstack pod ssh my-pod --user ubuntu

# SSH with specific key
podstack pod ssh my-pod --identity ~/.ssh/my_key

Execute Commands

# Single command
podstack pod exec my-pod -- nvidia-smi

# Multiple commands
podstack pod exec my-pod -- bash -c "cd /workspace && python train.py"

# Interactive
podstack pod exec my-pod -it -- /bin/bash

# With timeout
podstack pod exec my-pod --timeout 3600 -- python long_training.py

Web Terminal

Open browser-based terminal:

podstack pod terminal my-pod

File Transfer

Upload

# Single file
podstack pod cp ./train.py my-pod:/workspace/train.py

# Directory
podstack pod cp ./data/ my-pod:/workspace/data/

# With progress
podstack pod cp --progress ./large_file.tar my-pod:/workspace/

Download

# Single file
podstack pod cp my-pod:/workspace/model.pt ./model.pt

# Directory
podstack pod cp my-pod:/workspace/results/ ./results/

Logs

# Recent logs
podstack pod logs my-pod

# Last N lines
podstack pod logs my-pod --tail 100

# Follow logs
podstack pod logs my-pod --follow

# Since timestamp
podstack pod logs my-pod --since 2024-01-15T10:00:00Z

# Since duration
podstack pod logs my-pod --since 1h

Port Forwarding

Forward pod ports to localhost:

# Forward single port
podstack pod port-forward my-pod 8888:8888

# Forward multiple ports
podstack pod port-forward my-pod 8888:8888 6006:6006

# Background
podstack pod port-forward my-pod 8888:8888 &

Pod Events

View pod events:

podstack pod events my-pod

Metrics

podstack pod metrics my-pod

Output:

GPU Utilization: 85%
GPU Memory: 32.5/40.0 GB
CPU Usage: 45%
Memory Usage: 48.2/64.0 GB
Disk Usage: 120.5/200.0 GB

Templates

Save as Template

podstack pod save-template my-pod \
  --name pytorch-a100-template \
  --description "PyTorch training environment"

Create from Template

podstack pod create \
  --template pytorch-a100-template \
  --name new-pod

Batch Operations

Stop All Running Pods

podstack pod list --status running --quiet | xargs -I {} podstack pod stop {}

Delete All Stopped Pods

podstack pod list --status stopped --quiet | xargs -I {} podstack pod delete {} --yes

Examples

Quick Training Job

# Create, train, cleanup
podstack pod create --name train --image pytorch/pytorch:latest --gpu-type A100 --wait
podstack pod cp ./train.py train:/workspace/
podstack pod exec train -- python /workspace/train.py
podstack pod cp train:/workspace/model.pt ./
podstack pod delete train --force

Jupyter Development

podstack pod create \
  --name jupyter-dev \
  --image jupyter/pytorch-notebook \
  --gpu-type A100 \
  --port 8888 \
  --command "jupyter lab --ip=0.0.0.0 --allow-root --NotebookApp.token=''" \
  --wait

# Get Jupyter URL
podstack pod get jupyter-dev --output json | jq -r '.jupyter_url'

Next Steps