Table of contents

Pods

Manage GPU containers (pods) using the Podstack SDK.

Create a Pod

Basic Creation

from podstack import Client

client = Client()

pod = client.pods.create(
    name="my-pod",
    image="pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",
    gpu_type="A100",
    gpu_count=1
)

print(f"Pod ID: {pod.id}")
print(f"Status: {pod.status}")

Full Configuration

pod = client.pods.create(
    name="training-pod",
    project_id="project-id",  # Optional: specific project
    image="pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime",

    # GPU configuration
    gpu_type="A100",
    gpu_count=2,

    # Compute resources
    cpu=8,
    memory=64,  # GB
    disk=200,   # GB

    # Networking
    ports=[8888, 6006],  # Expose Jupyter and TensorBoard

    # Environment
    env={
        "WANDB_API_KEY": "your-key",
        "HF_TOKEN": "your-token"
    },

    # Startup command
    command="jupyter lab --ip=0.0.0.0 --allow-root",

    # Volume mounts
    volumes=[
        {"volume_id": "vol-123", "mount_path": "/data"}
    ],

    # Labels for organization
    labels={
        "team": "ml",
        "experiment": "gpt-fine-tune"
    }
)

List Pods

# All pods
pods = client.pods.list()

# Filter by project
pods = client.pods.list(project_id="project-id")

# Filter by status
pods = client.pods.list(status="running")

# Filter by labels
pods = client.pods.list(labels={"team": "ml"})

# Iterate
for pod in pods:
    print(f"{pod.name}: {pod.status} ({pod.gpu_type} x{pod.gpu_count})")

Get Pod Details

pod = client.pods.get("pod-id")

print(f"Name: {pod.name}")
print(f"Status: {pod.status}")
print(f"GPU: {pod.gpu_type} x {pod.gpu_count}")
print(f"CPU: {pod.cpu} cores")
print(f"Memory: {pod.memory} GB")
print(f"Created: {pod.created_at}")
print(f"SSH Command: {pod.ssh_command}")
print(f"Jupyter URL: {pod.jupyter_url}")

Pod Lifecycle

Start Pod

client.pods.start("pod-id")

# Wait for running
pod = client.pods.wait_until_running("pod-id", timeout=300)

Stop Pod

# Graceful stop
client.pods.stop("pod-id")

# Force stop
client.pods.stop("pod-id", force=True)

Restart Pod

client.pods.restart("pod-id")

Delete Pod

# Delete stopped pod
client.pods.delete("pod-id")

# Force delete running pod
client.pods.delete("pod-id", force=True)

Execute Commands

Run Command

result = client.pods.exec("pod-id", "nvidia-smi")

print(f"Exit code: {result.exit_code}")
print(f"Output: {result.output}")
print(f"Error: {result.error}")

Long-Running Commands

# With timeout (seconds)
result = client.pods.exec(
    "pod-id",
    "python train.py",
    timeout=3600  # 1 hour
)

# Async execution
task = client.pods.exec_async("pod-id", "python train.py")
print(f"Task ID: {task.id}")

# Check status later
status = client.tasks.get(task.id)
print(f"Status: {status.state}")

Interactive Shell

# Get SSH connection details
pod = client.pods.get("pod-id")
print(f"SSH: {pod.ssh_command}")

# Or use built-in SSH
client.pods.ssh("pod-id")  # Opens interactive shell

File Transfer

Upload Files

# Single file
client.pods.upload("pod-id", "./model.py", "/workspace/model.py")

# Directory
client.pods.upload(
    "pod-id",
    "./training_data/",
    "/workspace/data/",
    recursive=True
)

# With progress callback
def progress(transferred, total):
    print(f"Progress: {transferred}/{total} bytes")

client.pods.upload("pod-id", "./large_file.tar", "/workspace/", progress=progress)

Download Files

# Single file
client.pods.download("pod-id", "/workspace/model.pt", "./model.pt")

# Directory
client.pods.download(
    "pod-id",
    "/workspace/results/",
    "./results/",
    recursive=True
)

Logs

# Get recent logs
logs = client.pods.logs("pod-id")
print(logs)

# Tail logs
logs = client.pods.logs("pod-id", tail=100)

# Stream logs
for line in client.pods.logs_stream("pod-id"):
    print(line)

Metrics

metrics = client.pods.metrics("pod-id")

print(f"GPU Utilization: {metrics.gpu_utilization}%")
print(f"GPU Memory: {metrics.gpu_memory_used}/{metrics.gpu_memory_total} GB")
print(f"CPU Usage: {metrics.cpu_usage}%")
print(f"Memory Usage: {metrics.memory_used}/{metrics.memory_total} GB")

Pod Templates

Save as Template

template = client.pods.save_template(
    pod_id="pod-id",
    name="pytorch-a100-template",
    description="PyTorch training environment with A100"
)

Create from Template

pod = client.pods.create_from_template(
    template_id="template-id",
    name="new-pod-from-template"
)

Batch Operations

# Stop all pods in project
pods = client.pods.list(project_id="project-id", status="running")
for pod in pods:
    client.pods.stop(pod.id)

# Delete all stopped pods
pods = client.pods.list(status="stopped")
for pod in pods:
    client.pods.delete(pod.id)

Events

# Get pod events
events = client.pods.events("pod-id")

for event in events:
    print(f"{event.timestamp}: {event.type} - {event.message}")

Next Steps