import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
# Example script: load the JetMoE-8B chat model, run one chat prompt through it,
# and print the generated reply. Runs on the GPU when one is available and
# falls back to the CPU otherwise.

# Initialize the model and tokenizer.
# NOTE: trust_remote_code=True lets Transformers run model code shipped in the
# model repository; only use it with repositories you trust.
model_name = "jetmoe/jetmoe-8b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
    trust_remote_code=True,
)

# Pick the device once so the model and its inputs always end up in the same
# place, whichever branch is taken.
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = model.to(device)
    print("Using GPU:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead.")

# Encode input context
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
print(tokenized_chat)

# Move the input IDs to the model's device. This is defined on both the GPU
# and the CPU path, so generate() below never sees an undefined name.
input_ids = tokenized_chat.to(device)

# Generate text
output = model.generate(
    input_ids,
    max_length=500,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
)

# Bring the result back to the CPU for decoding (a no-op when already there).
output = output.cpu()

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
# Example script: load the JetMoE-8B chat model, run one chat prompt through it,
# and print the generated reply. Runs on the GPU when one is available and
# falls back to the CPU otherwise.

# Initialize the model and tokenizer.
# NOTE: trust_remote_code=True lets Transformers run model code shipped in the
# model repository; only use it with repositories you trust.
model_name = "jetmoe/jetmoe-8b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
    trust_remote_code=True,
)

# Pick the device once so the model and its inputs always end up in the same
# place, whichever branch is taken.
if torch.cuda.is_available():
    device = torch.device("cuda")
    model = model.to(device)
    print("Using GPU:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead.")

# Encode input context
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
tokenized_chat = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
print(tokenized_chat)

# Move the input IDs to the model's device. This is defined on both the GPU
# and the CPU path, so generate() below never sees an undefined name.
input_ids = tokenized_chat.to(device)

# Generate text
output = model.generate(
    input_ids,
    max_length=500,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
)

# Bring the result back to the CPU for decoding (a no-op when already there).
output = output.cpu()

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)
📚 ドキュメント
モデルの詳細
JetMoE-8Bは24個のブロックから構成されています。各ブロックには、Mixture of Attention heads (MoA) とMixture of MLP Experts (MoE) の2つのMoEレイヤーがあります。各MoAとMoEレイヤーには8つのエキスパートがあり、各入力トークンに対して2つのエキスパートがアクティブになります。総計80億個のパラメータを持ち、アクティブパラメータは2.2Bです。JetMoE-8Bは、公開データセットからの1.25Tトークンで訓練され、学習率は $5.0 \times 10^{-4}$、グローバルバッチサイズは4Mトークンです。