Curt-Park · November 10, 2025 13:10 · Curt-Park · Nov 8, 2025
diff --git a/client.sh b/client.sh
 # Prerequisites:
 #   1. The SkyPilot server must be prepared ahead.
 #   2. SkyPilot installation: https://docs.skypilot.co/en/latest/getting-started/installation.html

 # Connect to the remote SkyPilot API server.
 # Replace `ip-address` with the address of the server; WEB_USERNAME and
 # WEB_PASSWORD are the basic-auth credentials set in server.sh.
 # The URL is double-quoted so the expanded credentials are passed as a single
 # argument and are not subject to word splitting or globbing.
 sky api login -e "http://${WEB_USERNAME}:${WEB_PASSWORD}@ip-address/"

 # Confirm that SkyPilot has access to the configured infrastructure.
 sky check

 # List the GPUs available on the Kubernetes infrastructure.
 sky show-gpus --infra k8s
 # Example output (from the author's setup):
 ## Kubernetes GPUs
 ## Context: in-cluster
 ## GPU      REQUESTABLE_QTY_PER_NODE  UTILIZATION
 ## RTX4080  1

 # Launch a cluster named `mycluster` from the task definition.
 sky launch --cluster mycluster hello_sky.yaml

 # Open a shell on the cluster; run `nvidia-smi` there to check the allocated GPU.
 ssh mycluster

 # Run the task again on the already existing cluster.
 sky exec mycluster hello_sky.yaml

 # Tear the cluster down.
 sky down mycluster

 # NOTE: If you don't want to lose the data on the attached disk, you can stop
 #       the cluster instead of terminating it: `sky stop mycluster`
diff --git a/hello_sky.yaml b/hello_sky.yaml
 resources:
   # Target infrastructure. Optional; when omitted, the cheapest cloud is picked automatically.
   infra: k8s
   # Request a single NVIDIA RTX4080 GPU.
   accelerators: RTX4080:1

 # Optional project directory holding the codebase.
 # Its contents are synced to ~/sky_workdir/ on the cluster.
 workdir: .

 # Setup commands, invoked under the workdir so they can use its files.
 # Typically something like: pip install -r requirements.txt
 setup: |
   echo "Running setup."

 # Task commands, invoked under the workdir so they can use its files.
 # Typically the actual workload, such as a training run.
 run: |
   echo "Hello, SkyPilot!"
   conda env list
diff --git a/server.sh b/server.sh
 # Official docs: https://docs.skypilot.co/en/latest/docs/index.html
 #
 # SkyPilot: Key Features & Advantages
 #   - Unified ML orchestration for any public cloud or Kubernetes.
 #   - Automatic GPU/CPU selection & cost-saving (spot/preemptible support, multi-cloud failover).
 #   - Seamless multi-node/distributed training with simple YAML—no code changes needed.
 #   - Direct SSH/VSCode access to jobs.
 #   - Autostop for idle clusters—no cost leakage.
 #   - RBAC, SSO, and team dashboard available in managed mode.
 #   - Reproducible, infrastructure-as-code interface.
 #
 # Prerequisites:
 #   1. kubectl installation: https://kubernetes.io/docs/tasks/tools/
 #   2. minikube installation: https://minikube.sigs.k8s.io/docs/start/
 #   3. helm installation: https://helm.sh/docs/intro/install/
 #   4. Cluster Initialization: `minikube start --driver docker --container-runtime docker --gpus all`
 #   5. Check that all GPUs are mounted properly: `minikube ssh nvidia-smi`

 # Register the SkyPilot and NVIDIA chart repositories, then refresh them.
 helm repo add skypilot https://helm.skypilot.co
 helm repo add nvidia https://nvidia.github.io/gpu-operator
 helm repo update

 # SkyPilot needs a GPU label (e.g. nvidia.com/gpu.product) to recognize GPUs,
 # so install the nvidia/gpu-operator chart into its own namespace.
 helm install nvidia/gpu-operator \
   --generate-name \
   --namespace gpu-operator \
   --create-namespace \
   --wait
 # Watch the pods in the gpu-operator namespace.
 kubectl get pods --namespace gpu-operator --watch

 # Variables used throughout the rest of the guide.
 # NAMESPACE: namespace the API server is deployed in.
 NAMESPACE=skypilot
 # RELEASE_NAME: name of the helm release; must be unique within the namespace.
 RELEASE_NAME=skypilot
 # Basic username/password HTTP auth (an OAuth2 proxy can be used instead).
 WEB_USERNAME=skypilot
 # Placeholder value: replace it with a real password before deploying.
 WEB_PASSWORD=yourpassword
 # Build the credentials entry passed to the chart below. The arguments are
 # quoted so a username or password containing spaces or glob characters is
 # passed to htpasswd intact.
 AUTH_STRING=$(htpasswd -nb "$WEB_USERNAME" "$WEB_PASSWORD")
 # Deploy (or upgrade) the API server. Every expansion is quoted so each value
 # reaches helm as exactly one argument, without word splitting or globbing.
 helm upgrade --install "$RELEASE_NAME" skypilot/skypilot-nightly --devel \
   --namespace "$NAMESPACE" \
   --create-namespace \
   --set ingress.authCredentials="$AUTH_STRING"

 # Check that all deployments are running.
 # Use the NAMESPACE chosen for the deployment rather than a hard-coded name, so
 # these checks keep working when NAMESPACE is changed; fall back to the
 # original literal `skypilot` when the variable is not set in this shell.
 helm list -n "${NAMESPACE:-skypilot}"
 kubectl get pods -n "${NAMESPACE:-skypilot}" --watch

 # Start a minikube tunnel for the service, bound to address 0.0.0.0.
 minikube tunnel --bind-address "0.0.0.0"

 # Then open http://ip-address
 # and log in with WEB_USERNAME and WEB_PASSWORD.

 # To terminate the minikube cluster, run `minikube delete --all`.
	# Prerequisites:
	# 1. The SkyPilot server must be prepared ahead.
	# 2. SkyPilot installation: https://docs.skypilot.co/en/latest/getting-started/installation.html

	# Connect to the remote SkyPilot API server.
	# Replace `ip-address` with the address of the server; WEB_USERNAME and
	# WEB_PASSWORD are the basic-auth credentials set in the server setup.
	# The URL is double-quoted so the expanded credentials are passed as a single
	# argument and are not subject to word splitting or globbing.
	sky api login -e "http://${WEB_USERNAME}:${WEB_PASSWORD}@ip-address/"

	# Confirm that SkyPilot has access to the configured infrastructure.
	sky check

	# List the GPUs available on the Kubernetes infrastructure.
	sky show-gpus --infra k8s
	# Example output (from the author's setup):
	## Kubernetes GPUs
	## Context: in-cluster
	## GPU REQUESTABLE_QTY_PER_NODE UTILIZATION
	## RTX4080 1

	# Launch a cluster named `mycluster` from the task definition.
	sky launch --cluster mycluster hello_sky.yaml

	# Open a shell on the cluster; run `nvidia-smi` there to check the allocated GPU.
	ssh mycluster

	# Run the task again on the already existing cluster.
	sky exec mycluster hello_sky.yaml

	# Tear the cluster down.
	sky down mycluster

	# NOTE: If you don't want to lose the data on the attached disk, you can stop
	# the cluster instead of terminating it: `sky stop mycluster`
	resources:
	  # Target infrastructure. Optional; when omitted, the cheapest cloud is picked automatically.
	  infra: k8s
	  # Request a single NVIDIA RTX4080 GPU.
	  accelerators: RTX4080:1

	# Optional project directory holding the codebase.
	# Its contents are synced to ~/sky_workdir/ on the cluster.
	workdir: .

	# Setup commands, invoked under the workdir so they can use its files.
	# Typically something like: pip install -r requirements.txt
	# `|` is the YAML literal block scalar indicator (it must not be escaped).
	setup: |
	  echo "Running setup."

	# Task commands, invoked under the workdir so they can use its files.
	# Typically the actual workload, such as a training run.
	run: |
	  echo "Hello, SkyPilot!"
	  conda env list
	# Official docs: https://docs.skypilot.co/en/latest/docs/index.html
	#
	# SkyPilot: Key Features & Advantages
	# - Unified ML orchestration for any public cloud or Kubernetes.
	# - Automatic GPU/CPU selection & cost-saving (spot/preemptible support, multi-cloud failover).
	# - Seamless multi-node/distributed training with simple YAML—no code changes needed.
	# - Direct SSH/VSCode access to jobs.
	# - Autostop for idle clusters—no cost leakage.
	# - RBAC, SSO, and team dashboard available in managed mode.
	# - Reproducible, infrastructure-as-code interface.
	#
	# Prerequisites:
	# 1. kubectl installation: https://kubernetes.io/docs/tasks/tools/
	# 2. minikube installation: https://minikube.sigs.k8s.io/docs/start/
	# 3. helm installation: https://helm.sh/docs/intro/install/
	# 4. Cluster Initialization: `minikube start --driver docker --container-runtime docker --gpus all`
	# 5. Check that all GPUs are mounted properly: `minikube ssh nvidia-smi`

	# Register the SkyPilot and NVIDIA chart repositories, then refresh them.
	helm repo add skypilot https://helm.skypilot.co
	helm repo add nvidia https://nvidia.github.io/gpu-operator
	helm repo update

	# SkyPilot needs a GPU label (e.g. nvidia.com/gpu.product) to recognize GPUs,
	# so install the nvidia/gpu-operator chart into its own namespace.
	helm install nvidia/gpu-operator \
	  --generate-name \
	  --namespace gpu-operator \
	  --create-namespace \
	  --wait
	# Watch the pods in the gpu-operator namespace.
	kubectl get pods --namespace gpu-operator --watch

	# Variables used throughout the rest of the guide.
	# NAMESPACE: namespace the API server is deployed in.
	NAMESPACE=skypilot
	# RELEASE_NAME: name of the helm release; must be unique within the namespace.
	RELEASE_NAME=skypilot
	# Basic username/password HTTP auth (an OAuth2 proxy can be used instead).
	WEB_USERNAME=skypilot
	# Placeholder value: replace it with a real password before deploying.
	WEB_PASSWORD=yourpassword
	# Build the credentials entry passed to the chart below. The arguments are
	# quoted so a username or password containing spaces or glob characters is
	# passed to htpasswd intact.
	AUTH_STRING=$(htpasswd -nb "$WEB_USERNAME" "$WEB_PASSWORD")
	# Deploy (or upgrade) the API server. Every expansion is quoted so each value
	# reaches helm as exactly one argument, without word splitting or globbing.
	helm upgrade --install "$RELEASE_NAME" skypilot/skypilot-nightly --devel \
	  --namespace "$NAMESPACE" \
	  --create-namespace \
	  --set ingress.authCredentials="$AUTH_STRING"

	# Check that all deployments are running.
	# Use the NAMESPACE chosen for the deployment rather than a hard-coded name, so
	# these checks keep working when NAMESPACE is changed; fall back to the
	# original literal `skypilot` when the variable is not set in this shell.
	helm list -n "${NAMESPACE:-skypilot}"
	kubectl get pods -n "${NAMESPACE:-skypilot}" --watch

	# Start a minikube tunnel for the service, bound to address 0.0.0.0.
	minikube tunnel --bind-address "0.0.0.0"

	# Then open http://ip-address
	# and log in with WEB_USERNAME and WEB_PASSWORD.

	# To terminate the minikube cluster, run `minikube delete --all`.