llama v3
46 words
One minute
1
2
3
|
# Start vLLM's OpenAI API server module for the
# gradientai/Llama-3-8B-Instruct-Gradient-4194k checkpoint.
#   --host/--port             listen on 0.0.0.0:8000
#   --dtype                   bfloat16
#   --gpu-memory-utilization  0.80
# NOTE(review): the model name suggests a very long (~4194k token) context
# window; an explicit --max-model-len may be needed to fit the KV cache on
# the target GPU -- confirm against the deployment hardware.
python3 -m vllm.entrypoints.openai.api_server \
  --model gradientai/Llama-3-8B-Instruct-Gradient-4194k \
  --dtype bfloat16 \
  --gpu-memory-utilization 0.80 \
  --host 0.0.0.0 \
  --port 8000
|
1
2
3
4
5
6
|
# Start vLLM's OpenAI API server module for the AWQ-quantized
# casperhansen/llama-3-70b-instruct-awq checkpoint.
#   --host/--port             listen on 0.0.0.0:8000
#   --quantization            awq, with --dtype float16
#   --tensor-parallel-size    4
#   --gpu-memory-utilization  0.80
# NOTE(review): a tensor-parallel size of 4 presumably requires four GPUs
# visible to the process -- confirm on the target host.
python3 -m vllm.entrypoints.openai.api_server \
  --model casperhansen/llama-3-70b-instruct-awq \
  --quantization awq \
  --dtype float16 \
  --tensor-parallel-size 4 \
  --gpu-memory-utilization 0.80 \
  --host 0.0.0.0 \
  --port 8000
|