Deploy a vLLM-hosted LLM on K8s

In this post we walk through the steps to deploy an LLM with vLLM on GCP GKE (Google Kubernetes Engine).

1. Build a base Docker image with CUDA and Conda

This Docker image sets up the environment to host an LLM. It includes:

  1. CUDA 12.1.1
  2. Miniconda
  3. a conda env named “main”, with Python 3.10 and some Python libraries

1.1 The Dockerfile

FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04

ENV HOME /app
RUN mkdir /app
WORKDIR /app

# set bash as current shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]

##################################
# install utils
##################################
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata
RUN apt install -y software-properties-common curl jq

##################################
# install gcloud
##################################
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz
RUN mkdir -p /app/gcloud \
  && tar -C /app/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \
  && /app/gcloud/google-cloud-sdk/install.sh
ENV PATH $PATH:/app/gcloud/google-cloud-sdk/bin

##################################
# install conda
##################################
ARG DEFAULT_ENV=main

RUN curl -OL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
RUN bash Miniconda3-latest-Linux-x86_64.sh  -b -f -p /app/miniconda3/ 
RUN ln -s /app/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh
RUN echo ". /app/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc

ENV PATH /app/miniconda3/bin/:$PATH
ENV CONDA_DEFAULT_ENV ${DEFAULT_ENV}
ENV PATH /app/miniconda3/envs/${DEFAULT_ENV}/bin:$PATH

# create a conda env with name "main"
COPY main.environment.yaml environment.yaml
#RUN conda env update -f base.environment.yaml
RUN conda env create -f environment.yaml
RUN echo "conda activate main" >> ~/.bashrc
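
Because the two ENV PATH lines put the “main” env's bin directory at the front of the search path, python and pip inside the container resolve to the conda env even for non-interactive commands (the .bashrc lines only matter for interactive shells). A quick sanity check after a local build; the cuda-python:dev tag below is just a placeholder:

# build locally and confirm that python and pip come from the "main" conda env
docker build -t cuda-python:dev .
docker run --rm cuda-python:dev python -c "import sys; print(sys.executable)"
docker run --rm cuda-python:dev pip --version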

1.2 The environment YAML for the conda env “main”

name: main
channels:
  - defaults
  - conda-forge
  - nvidia
  - pytorch
dependencies:
  - python==3.10
  - pip
  - numpy
  - pandas
  - pyarrow
  - grpcio
  - grpcio-tools
  - protobuf
  - pip:
    - vllm==0.3.0
    - transformers==4.37.2
    - google-cloud-bigquery==3.17.2
    - google-cloud-storage==2.14.0
    - google-cloud-aiplatform==1.41.0
    - google-auth==2.27.0
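
Save this file as main.environment.yaml next to the Dockerfile, since the COPY instruction above expects that name. If you want to sanity-check the environment outside Docker first (assuming a local conda installation), something like this works:

# create the env locally and check that the pinned packages resolve
conda env create -f main.environment.yaml
conda run -n main python -c "import transformers; print(transformers.__version__)"
conda run -n main pip show vllm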

1.3 The build script

#!/bin/bash
set -e  # fail on any error

PROJECT=<GCP_PROJECT_ID>
ARTIFACT_REGISTRY_ROOT="us-central1-docker.pkg.dev"
ARTIFACT_REGISTRY_REPOSITORY="prod"

export NAMESPACE_ID=<K8S_NAMESPACE_ID>
export APP_NAME="cuda-python"
export APP_VERSION="0.0.1"
export DEPLOYMENT_NAME=${APP_NAME}-"deployment"

# cuda-12.1.1, python-3.10, transformers
export MODEL_IMAGE_ID=${ARTIFACT_REGISTRY_ROOT}/${PROJECT}/${ARTIFACT_REGISTRY_REPOSITORY}/${APP_NAME}:${APP_VERSION}

echo "Image: ${MODEL_IMAGE_ID}"

# build and push images
docker buildx build --platform linux/amd64 --push -t ${MODEL_IMAGE_ID} .
echo "Done"

2. Deploy a vLLM service

Now we host an LLM, using “TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ” as an example.

2.1 Dockerfile

# the base docker image
FROM us-central1-docker.pkg.dev/<GCP_PROJECT_ID>/prod/cuda-python:0.0.1

# if you have any python files and libs
COPY src/*.py .
COPY requirements.txt .

# python deps
RUN pip install -r requirements.txt

# point the HuggingFace cache at the mounted volume (assuming you have downloaded
# the model artifacts there); use ENV so the ENTRYPOINT process sees it, since a
# .bashrc export would not apply to a non-shell entrypoint
ENV HUGGINGFACE_HUB_CACHE /data/models

# run the serving service
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server", \
            "--model", "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ", \
            "--quantization", "awq", "--tensor-parallel-size", "2"]

2.2 The K8s deployment YAML

Deploy it as a K8s Deployment plus a Service:

apiVersion: apps/v1
kind: Deployment
metadata:
  name: ${DEPLOYMENT_NAME}  # name of this deployment
  namespace: ${NAMESPACE_ID}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ${APP_NAME}
  template:
    metadata:
      labels:
        app: ${APP_NAME}
    spec:
      serviceAccountName: default-editor
      automountServiceAccountToken: false
      containers:
        - image: ${MODEL_IMAGE_ID}
          name: api
          imagePullPolicy: Always
          ports:
            - containerPort: 8000  # vLLM OpenAI-compatible API server
              name: http
          env: 
            - name: GCE_METADATA_TIMEOUT
              value: "60"
            - name: APP_NAME
              value: ${APP_NAME}
            - name: APP_VERSION
              value: ${APP_VERSION}
            - name: SERVER_NAME
              value: "0.0.0.0"
          resources:
            requests:
              #ephemeral-storage: "20Gi"
              memory: "16Gi"
              cpu: "4000m"
              nvidia.com/gpu: "2"
            limits:
              memory: "32Gi"
              cpu: "8000m"
              nvidia.com/gpu: "2"
              #ephemeral-storage: "25Gi"
          volumeMounts:
            - mountPath: /data
              name: model-data
            - mountPath: /dev/shm
              name: dshm
      volumes:
        - name: model-data
          persistentVolumeClaim:
            claimName: model-data-volume-claim
        - name: dshm
          emptyDir:
            medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
        cloud.google.com/gke-nodepool: gpu-l4-standard-pool

---
apiVersion: v1
kind: Service
metadata:
  name: ${SERVICE_NAME}
  namespace: ${NAMESPACE_ID}
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 8000
  selector:
    app: ${APP_NAME}
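
Note that the deployment mounts a PersistentVolumeClaim named model-data-volume-claim at /data; it must exist in the same namespace before the pod can start. A minimal sketch to create one (the 200Gi size and the default storage class are assumptions, adjust to your model and cluster):

kubectl apply -n ${NAMESPACE_ID} -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-data-volume-claim
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 200Gi
EOF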

2.3 The build and deploy script

#!/bin/bash
set -e  # fail on any error

PROJECT=<GCP_PROJECT_ID>
ARTIFACT_REGISTRY_ROOT="us-central1-docker.pkg.dev"
ARTIFACT_REGISTRY_REPOSITORY="prod"

export NAMESPACE_ID=<K8S_NAMESPACE_ID>
export APP_NAME=<APP_NAME>
export APP_VERSION="0.0.1"
export DEPLOYMENT_NAME=${APP_NAME}-"deployment"
export HUGGINGFACE_HUB_CACHE="/data/models"
export SERVICE_NAME=${APP_NAME}-"service"
export MODEL_IMAGE_ID=${ARTIFACT_REGISTRY_ROOT}/${PROJECT}/${ARTIFACT_REGISTRY_REPOSITORY}/${APP_NAME}:${APP_VERSION}

echo "Image: ${MODEL_IMAGE_ID}"

# build and push images
docker buildx build --platform linux/amd64 --push -t ${MODEL_IMAGE_ID} .

# sometimes it is necessary to delete the existing service and deployment for a clean installation
kubectl delete service ${SERVICE_NAME} -n ${NAMESPACE_ID} || true
kubectl delete deployment ${DEPLOYMENT_NAME} -n ${NAMESPACE_ID} || true

# deploy it!
cat deployment.yaml | envsubst | kubectl apply -f -
# equivalent local commands, kept for reference:
# python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --port 8080 --tensor-parallel-size 2
# python -m vllm.entrypoints.openai.api_server --model TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ --port 8080 --quantization awq --tensor-parallel-size 2
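
Once the pod is Running, you can hit the OpenAI-compatible API that vLLM exposes, for example by port-forwarding the service from your workstation:

# forward the ClusterIP service to localhost
kubectl port-forward -n ${NAMESPACE_ID} service/${SERVICE_NAME} 8000:80 &

# list the served model
curl http://localhost:8000/v1/models

# send a completion request
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ", "prompt": "Hello, my name is", "max_tokens": 32}'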