
Set up llama2 inference

In this post we set up Llama 2 (13B) inference with Hugging Face Transformers, sharding the model across multiple GPUs.

Llama 2

0. libs

pip install -U torch transformers accelerate sentencepiece
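Before grabbing the weights, it's worth confirming PyTorch can actually see your GPUs; the 13B model in float16 needs roughly 26 GB of VRAM in total, so a single small card won't fit it:

import torch

# should print True and the number of visible GPUs
print(torch.cuda.is_available(), torch.cuda.device_count())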

1. download

Get approved for access, then download all files (except for the pytorch_model-0000X-of-00003.bin shards; the safetensors files cover the same weights) from here:

https://huggingface.co/meta-llama/Llama-2-13b-hf/tree/main

# on the pod (big-0): create the target directory
mkdir -p llama2/models/13b/

# from your workstation: copy the downloaded files into the pod
kubectl cp /Users/$USER/Downloads/. big-0:/home/jovyan/llama2/models/13b/
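If the pod has direct internet access, you can script the download instead of copying files by hand. A minimal sketch using huggingface_hub (the token placeholder is yours to fill in; local_dir matches the path used in the next step):

from huggingface_hub import snapshot_download

# skip the pytorch_model-*.bin shards; the safetensors files cover the same weights
snapshot_download(
    repo_id="meta-llama/Llama-2-13b-hf",
    local_dir="/home/jovyan/llama2/models/13b",
    ignore_patterns=["pytorch_model-*.bin"],
    token="hf_...",  # your access token for the gated repo
)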

2. code


from transformers import LlamaForCausalLM, LlamaTokenizer
import transformers
import torch

model_path = '/home/jovyan/llama2/models/13b'

tokenizer = LlamaTokenizer.from_pretrained(model_path)

# device_map='auto' lets accelerate split the model across all available GPUs
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map='auto',
)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")
