In this post we walk through the steps to deploy an LLM with vLLM on GCP GKE (Google Kubernetes Engine).
1. Build a base Docker image with CUDA and conda
This Docker image sets up the environment for hosting an LLM:
- cuda:12.1.1
- miniconda
- a conda env named “main”, with Python 3.10 and some Python libraries
1.1 The Dockerfile
```dockerfile
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
ENV HOME /app
RUN mkdir /app
WORKDIR /app
# set bash as current shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
##################################
# install utils
##################################
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y tzdata
RUN apt install -y software-properties-common curl jq
##################################
# install gcloud
##################################
RUN curl https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz > /tmp/google-cloud-sdk.tar.gz
RUN mkdir -p /app/gcloud \
&& tar -C /app/gcloud -xvf /tmp/google-cloud-sdk.tar.gz \
&& /app/gcloud/google-cloud-sdk/install.sh
ENV PATH $PATH:/app/gcloud/google-cloud-sdk/bin
##################################
# install conda
##################################
ARG DEFAULT_ENV=main
RUN curl -OL https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
RUN bash Miniconda3-latest-Linux-x86_64.sh -b -f -p /app/miniconda3/
RUN ln -s /app/miniconda3/etc/profile.d/conda.sh /etc/profile.d/conda.sh
RUN echo ". /app/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc
ENV PATH /app/miniconda3/bin/:$PATH
ENV CONDA_DEFAULT_ENV ${DEFAULT_ENV}
ENV PATH /app/miniconda3/envs/${DEFAULT_ENV}/bin:$PATH
# create a conda env with name "main"
COPY main.environment.yaml environment.yaml
#RUN conda env update -f base.environment.yaml
RUN conda env create -f environment.yaml
RUN echo "conda activate main" >> ~/.bashrc
```
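Before pushing the base image anywhere, it can be worth a quick local smoke test. The snippet below is a minimal sketch, assuming a local tag of cuda-python:0.0.1 and a machine with the NVIDIA container toolkit installed:

```bash
# build the base image locally (local tag, not the Artifact Registry path used later)
docker build -t cuda-python:0.0.1 .

# check the "main" conda env, the Python version, and GPU visibility
docker run --rm --gpus all cuda-python:0.0.1 bash -lc '
  conda env list
  python --version
  nvidia-smi
'
```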
1.2 The environment YAML for the conda env “main”
```yaml
name: main
channels:
  - defaults
  - conda-forge
  - nvidia
  - pytorch
dependencies:
  - python==3.10
  - pip
  - numpy
  - pandas
  - pyarrow
  - grpcio
  - grpcio-tools
  - protobuf
  - pip:
      - vllm==0.3.0
      - transformers==4.37.2
      - google-cloud-bigquery==3.17.2
      - google-cloud-storage==2.14.0
      - google-cloud-aiplatform==1.41.0
      - google-auth==2.27.0
```
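It can also help to confirm that the pinned libraries import cleanly inside the “main” env and that the CUDA-enabled torch build pulled in by vllm actually sees a GPU. A quick check, again assuming the local cuda-python:0.0.1 tag from above:

```bash
# import the key packages and report whether torch can reach a GPU
docker run --rm --gpus all cuda-python:0.0.1 python -c '
import torch, transformers, vllm
print("torch", torch.__version__, "cuda available:", torch.cuda.is_available())
print("transformers", transformers.__version__, "vllm", vllm.__version__)
'
```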
1.3 The build script
```bash
#!/bin/bash
set -e # fail on any errors
PROJECT=<GCP_PROJECT_ID>
ARTIFACT_REGISTRY_ROOT="us-central1-docker.pkg.dev"
ARTIFACT_REGISTRY_REPOSITORY="prod"
export NAMESPACE_ID=<K8S_NAMESPACE_ID>
export APP_NAME="cuda-python"
export APP_VERSION="0.0.1"
export DEPLOYMENT_NAME=${APP_NAME}-"deployment"
# cuda-12.1.1, python-3.10, transformers
export MODEL_IMAGE_ID=${ARTIFACT_REGISTRY_ROOT}/${PROJECT}/${ARTIFACT_REGISTRY_REPOSITORY}/${APP_NAME}:${APP_VERSION}
echo "Image: ${MODEL_IMAGE_ID}"
# build and push images
docker buildx build --platform linux/amd64 --push -t ${MODEL_IMAGE_ID} .
echo "Done"
```
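The script assumes the Artifact Registry repository already exists and that the local Docker client is authorized to push to it. If not, a one-time setup along these lines should be enough (the repository name "prod" and region match the variables above; <GCP_PROJECT_ID> stays a placeholder):

```bash
# one-time setup: create the Docker repository and authorize docker pushes to it
gcloud artifacts repositories create prod \
  --repository-format=docker \
  --location=us-central1 \
  --project=<GCP_PROJECT_ID>
gcloud auth configure-docker us-central1-docker.pkg.dev
```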
2. Deploy a vLLM service
Now we host an LLM, using “TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ” as an example; the AWQ-quantized model is served with vLLM across two GPUs (--tensor-parallel-size 2).
2.1 The Dockerfile
```dockerfile
# the base docker image
FROM us-central1-docker.pkg.dev/<GCP_PROJECT_ID>/prod/cuda-python:0.0.1
# if you have any python files and libs
COPY src/*.py .
COPY requirements.txt .
# python deps
RUN pip install -r requirements.txt
# assuming the model artifacts were already downloaded from HuggingFace into /data/models
# (use ENV rather than .bashrc so the exec-form ENTRYPOINT below actually sees the variable)
ENV HUGGINGFACE_HUB_CACHE /data/models
# run the serving service
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server", \
"--model", "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ", \
"--quantization", "awq", "--tensor-parallel-size", "2"]
```
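This Dockerfile assumes the Mixtral weights are already cached under /data/models, the PVC path mounted by the deployment below. One way to warm that cache is to run something like the following from a pod or one-off job that mounts the same volume; the snapshot_download call is just a sketch of the idea:

```bash
# pre-download the model into the shared HuggingFace cache on the PVC
export HUGGINGFACE_HUB_CACHE=/data/models
python -c "
from huggingface_hub import snapshot_download
snapshot_download('TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ')
"
```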
2.2 The K8s deployment YAML
We deploy the image as a Kubernetes Deployment and expose it with a ClusterIP Service.
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ${DEPLOYMENT_NAME} # name of this deployment
  namespace: ${NAMESPACE_ID}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: ${APP_NAME}
  template:
    metadata:
      labels:
        app: ${APP_NAME}
    spec:
      serviceAccountName: default-editor
      automountServiceAccountToken: false
      containers:
        - image: ${MODEL_IMAGE_ID}
          name: api
          imagePullPolicy: Always
          ports:
            - containerPort: 8000 # vllm OpenAI-compatible API server
              name: http
          env:
            - name: GCE_METADATA_TIMEOUT
              value: "60"
            - name: APP_NAME
              value: ${APP_NAME}
            - name: APP_VERSION
              value: ${APP_VERSION}
            - name: SERVER_NAME
              value: "0.0.0.0"
          resources:
            requests:
              #ephemeral-storage: "20Gi"
              memory: "16Gi"
              cpu: "4000m"
              nvidia.com/gpu: "2"
            limits:
              memory: "32Gi"
              cpu: "8000m"
              nvidia.com/gpu: "2"
              #ephemeral-storage: "25Gi"
          volumeMounts:
            - mountPath: /data
              name: model-data
            - mountPath: /dev/shm
              name: dshm
      volumes:
        - name: model-data # must match the volumeMounts entry above
          persistentVolumeClaim:
            claimName: model-data-volume-claim
        - name: dshm
          emptyDir:
            medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
        cloud.google.com/gke-nodepool: gpu-l4-standard-pool
---
apiVersion: v1
kind: Service
metadata:
  name: ${SERVICE_NAME}
  namespace: ${NAMESPACE_ID}
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 8000
  selector:
    app: ${APP_NAME}
```
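Once the pod is up, the Service exposes vLLM's OpenAI-compatible API inside the cluster. One quick way to try it from a workstation is to port-forward the Service and call the /v1 endpoints (the variable names are the same placeholders used in the YAML above):

```bash
# forward local port 8000 to the Service's port 80 (which targets the pod's 8000)
kubectl port-forward svc/${SERVICE_NAME} 8000:80 -n ${NAMESPACE_ID} &

# list the served model, then send a small completion request
curl http://localhost:8000/v1/models
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ",
        "prompt": "San Francisco is a",
        "max_tokens": 32
      }'
```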
2.3 The build and deploy script
```bash
#!/bin/bash
set -e # fail on any errors
PROJECT=<GCP_PROJECT_ID>
ARTIFACT_REGISTRY_ROOT="us-central1-docker.pkg.dev"
ARTIFACT_REGISTRY_REPOSITORY="prod"
export NAMESPACE_ID=<K8S_NAMESPACE_ID>
export APP_NAME=<APP_NAME>
export APP_VERSION="0.0.1"
export DEPLOYMENT_NAME=${APP_NAME}-"deployment"
export HUGGINGFACE_HUB_CACHE="/data/models"
export SERVICE_NAME=${APP_NAME}-"service"
export MODEL_IMAGE_ID=${ARTIFACT_REGISTRY_ROOT}/${PROJECT}/${ARTIFACT_REGISTRY_REPOSITORY}/${APP_NAME}:${APP_VERSION}
echo "Image: ${MODEL_IMAGE_ID}"
# build and push images
docker buildx build --platform linux/amd64 --push -t ${MODEL_IMAGE_ID} .
# sometimes it is necessary to delete the existing service and deployment for a clean installation
kubectl delete service ${SERVICE_NAME} -n ${NAMESPACE_ID} || true
kubectl delete deployment ${DEPLOYMENT_NAME} -n ${NAMESPACE_ID} || true
# deploy it!
cat deployment.yaml | envsubst | kubectl apply -f -
# python -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --port 8080 --tensor-parallel-size 2
#python -m vllm.entrypoints.openai.api_server --model TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ --port 8080 --quantization awq --tensor-parallel-size 2
```
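The first start can take a while, since vLLM has to load the Mixtral weights and initialize tensor parallelism across both GPUs. A few commands that are handy for watching the rollout (same variables as in the script above):

```bash
# wait for the deployment to become ready, then follow the model-loading logs
kubectl rollout status deployment/${DEPLOYMENT_NAME} -n ${NAMESPACE_ID}
kubectl get pods -n ${NAMESPACE_ID} -l app=${APP_NAME}
kubectl logs -f deployment/${DEPLOYMENT_NAME} -n ${NAMESPACE_ID}
```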