@sozercan
Created March 7, 2025 17:51
$ kubectl describe pod qwen-coder-7b-instruct-7f74c4446-q5r94-small-group-worker-ncw9w
Name: qwen-coder-7b-instruct-7f74c4446-q5r94-small-group-worker-ncw9w
Namespace: default
Priority: 0
Service Account: default
Node: aks-gpu-13753585-vmss000002/10.224.0.5
Start Time: Thu, 06 Mar 2025 06:46:19 +0000
Labels:       app.kubernetes.io/created-by=kuberay-operator
              app.kubernetes.io/name=kuberay
              ray.io/cluster=qwen-coder-7b-instruct-7f74c4446-q5r94
              ray.io/group=small-group
              ray.io/identifier=qwen-coder-7b-instruct-7f74c4446-q5r94-worker
              ray.io/is-ray-node=yes
              ray.io/node-type=worker
Annotations:  ray.io/ft-enabled: false
              ray.io/overwrite-container-cmd: true
Status: Running
IP: 10.244.1.158
IPs:
  IP:  10.244.1.158
Controlled By: RayCluster/qwen-coder-7b-instruct-7f74c4446-q5r94
Init Containers:
  wait-gcs-ready:
    Container ID:  containerd://39894a9b25a2486ccb1ff7375cfc12160bc9f0809b96074f3b489784a74b6fcf
    Image:         vllm/vllm-openai:v0.7.1
    Image ID:      docker.io/vllm/vllm-openai@sha256:9cd69b577cf26df32aceb74577ea7f6749618a72e630f654ecb10dbfb23e3de4
    Port:          <none>
    Host Port:     <none>
    Command:
      /bin/bash
      -lc
      --
    Args:
      SECONDS=0
      while true; do
        if (( SECONDS <= 120 )); then
          if ray health-check --address qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc.default.svc.cluster.local:6379 > /dev/null 2>&1; then
            echo "GCS is ready."
            break
          fi
          echo "$SECONDS seconds elapsed: Waiting for GCS to be ready."
        else
          if ray health-check --address qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc.default.svc.cluster.local:6379; then
            echo "GCS is ready. Any error messages above can be safely ignored."
            break
          fi
          echo "$SECONDS seconds elapsed: Still waiting for GCS to be ready. For troubleshooting, refer to the FAQ at https://github.com/ray-project/kuberay/blob/master/docs/guidance/FAQ.md."
        fi
        sleep 5
      done
    State:          Terminated
      Reason:       Completed
      Exit Code:    0
      Started:      Thu, 06 Mar 2025 06:46:20 +0000
      Finished:     Thu, 06 Mar 2025 06:46:26 +0000
    Ready:          True
    Restart Count:  0
    Limits:
      cpu:     200m
      memory:  256Mi
    Requests:
      cpu:     200m
      memory:  256Mi
    Environment:
      FQ_RAY_IP:  qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc.default.svc.cluster.local
      RAY_IP:     qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-k6cz8 (ro)
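The wait-gcs-ready init container above polls the head service's GCS port, quietly for the first 120 seconds and verbosely after that. The same loop can be exercised locally with a hypothetical `gcs_ready` stub standing in for `ray health-check` (here it succeeds on the third attempt), so the timeout logic is testable without a cluster:

```shell
#!/usr/bin/env bash
# Stub for `ray health-check --address <head-svc>:6379`.
# Hypothetical: succeeds on the third call to exercise the loop's happy path.
ATTEMPTS=0
gcs_ready() {
  ATTEMPTS=$(( ATTEMPTS + 1 ))
  (( ATTEMPTS >= 3 ))
}

SECONDS=0          # bash builtin: auto-increments once per second
while true; do
  if (( SECONDS <= 120 )); then
    # Quiet phase: suppress health-check output for the first two minutes.
    if gcs_ready > /dev/null 2>&1; then
      echo "GCS is ready."
      break
    fi
    echo "$SECONDS seconds elapsed: Waiting for GCS to be ready."
  else
    # After two minutes, let the health-check print its errors for debugging.
    if gcs_ready; then
      echo "GCS is ready."
      break
    fi
    echo "$SECONDS seconds elapsed: Still waiting for GCS to be ready."
  fi
  sleep 1          # the real init container sleeps 5s between attempts
done
```

Because the stub succeeds well within the 120-second window, only the quiet branch runs here; against a slow-starting head pod the loop would cross into the verbose branch and surface the health-check errors.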
Containers:
  ray-worker:
    Container ID:   containerd://fbb77818006de245f9cd5f77071263e19cbe80682c2b5e95722418e1f0ee59f6
    Image:          vllm/vllm-openai:v0.7.1
    Image ID:       docker.io/vllm/vllm-openai@sha256:9cd69b577cf26df32aceb74577ea7f6749618a72e630f654ecb10dbfb23e3de4
    Port:           8080/TCP
    Host Port:      0/TCP
    State:          Running
      Started:      Thu, 06 Mar 2025 06:46:26 +0000
    Ready:          False
    Restart Count:  0
    Limits:
      cpu:             4
      nvidia.com/gpu:  1
    Requests:
      cpu:             4
      nvidia.com/gpu:  1
    Liveness:   exec [bash -c wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success] delay=30s timeout=2s period=5s #success=1 #failure=120
    Readiness:  exec [bash -c wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success] delay=10s timeout=2s period=5s #success=1 #failure=10
    Environment:
      FQ_RAY_IP:                            qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc.default.svc.cluster.local
      RAY_IP:                               qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc
      RAY_CLUSTER_NAME:                      (v1:metadata.labels['ray.io/cluster'])
      RAY_CLOUD_INSTANCE_ID:                qwen-coder-7b-instruct-7f74c4446-q5r94-small-group-worker-ncw9w (v1:metadata.name)
      RAY_NODE_TYPE_NAME:                    (v1:metadata.labels['ray.io/group'])
      KUBERAY_GEN_RAY_START_CMD:            ray start --address=qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc.default.svc.cluster.local:6379 --metrics-export-port=8080 --block --dashboard-agent-listen-port=52365 --num-cpus=4 --num-gpus=1
      RAY_PORT:                             6379
      RAY_ADDRESS:                          qwen-coder-7b-instruct-7f74c4446-q5r94-head-svc.default.svc.cluster.local:6379
      RAY_USAGE_STATS_KUBERAY_IN_USE:       1
      REDIS_PASSWORD:
      RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE:  1
    Mounts:
      /dev/shm from shared-mem (rw)
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-k6cz8 (ro)
Conditions:
  Type                        Status
  PodReadyToStartContainers   True
  Initialized                 True
  Ready                       False
  ContainersReady             False
  PodScheduled                True
Volumes:
  shared-mem:
    Type:       EmptyDir (a temporary directory that shares a pod's lifetime)
    Medium:     Memory
    SizeLimit:  <unset>
  kube-api-access-k6cz8:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:       Burstable
Node-Selectors:  <none>
Tolerations:     node.kubernetes.io/memory-pressure:NoSchedule op=Exists
                 node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                 node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
                 nvidia.com/gpu:NoSchedule op=Exists
Events:
  Type     Reason     Age               From               Message
  ----     ------     ----              ----               -------
  Normal   Scheduled  46s               default-scheduler  Successfully assigned default/qwen-coder-7b-instruct-7f74c4446-q5r94-small-group-worker-ncw9w to aks-gpu-13753585-vmss000002
  Normal   Pulled     45s               kubelet            Container image "vllm/vllm-openai:v0.7.1" already present on machine
  Normal   Created    45s               kubelet            Created container wait-gcs-ready
  Normal   Started    45s               kubelet            Started container wait-gcs-ready
  Normal   Pulled     39s               kubelet            Container image "vllm/vllm-openai:v0.7.1" already present on machine
  Normal   Created    39s               kubelet            Created container ray-worker
  Normal   Started    39s               kubelet            Started container ray-worker
  Warning  Unhealthy  1s (x6 over 26s)  kubelet            Readiness probe failed:
  Warning  Unhealthy  1s (x2 over 6s)   kubelet            Liveness probe failed:
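The Unhealthy events above are counted by the kubelet against the thresholds shown in the container spec (readiness: period=5s, #failure=10), so the pod stays Ready: False once enough consecutive probes fail. A rough sketch of that counting logic, with a hypothetical `probe` stub standing in for the `wget … | grep success` exec probe (here it always fails, as in this pod's events):

```shell
#!/usr/bin/env bash
# Hypothetical stand-in for the exec probe:
#   wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success
# Always fails here, mirroring the repeated "Readiness probe failed" events.
probe() { return 1; }

FAILURE_THRESHOLD=10   # from the Readiness line: #failure=10
consecutive_failures=0
ready=true

for tick in $(seq 1 "$FAILURE_THRESHOLD"); do
  if probe; then
    # Any success resets the consecutive-failure counter.
    consecutive_failures=0
    ready=true
  else
    consecutive_failures=$(( consecutive_failures + 1 ))
    # The kubelet marks the container NotReady once the threshold is reached.
    if (( consecutive_failures >= FAILURE_THRESHOLD )); then
      ready=false
    fi
  fi
  # A real kubelet would wait period=5s between probe attempts here.
done

echo "ready=$ready failures=$consecutive_failures"
```

With the liveness probe's much higher threshold (#failure=120 at 5s period), the same counting gives the raylet roughly ten minutes of consecutive failures before the container would be restarted, while readiness flips off far sooner.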