Mistral Anime Ai
Developed by senko-sleepy-fox
An anime-character chatbot built on the Mistral model, modeled after Senko from The Helpful Fox Senko-san, offering emotional support and a warm conversational experience.
Downloads: 152
Released: 6/14/2025
Model Overview
This is a text-generation chatbot based on the Mistral large language model, designed for role-play and conversational use. Modeled on the anime character Senko, it imitates her personality and mannerisms to provide emotional support and interactive dialogue.
Features
Anime character role-play
Closely imitates Senko's personality, including the wisdom and gentleness of an 800-year-old fox spirit
Emotional support
Designed to offer comfort and reassurance, mirroring Senko's caring behavior
Long context memory
Supports up to 10240 tokens of context memory to keep conversations coherent
Dynamic response generation
Generates responses that include action descriptions, marked with asterisks (a short parsing sketch follows below)
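As an illustration of that asterisk markup, the snippet below shows one way a client could separate narration from spoken text. It is only a sketch and not part of the released project; the helper name split_actions is made up for this example.
import re

def split_actions(reply: str):
    # Treat any *...* span as an action description; everything else is speech.
    actions = re.findall(r"\*(.+?)\*", reply)
    speech = re.sub(r"\*.+?\*", "", reply)
    speech = " ".join(speech.split())  # collapse leftover whitespace
    return speech, actions

speech, actions = split_actions("*flicks her tail gently* Welcome home. *pours a cup of tea*")
print(speech)   # Welcome home.
print(actions)  # ['flicks her tail gently', 'pours a cup of tea']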
Capabilities
Role-play dialogue
Emotional-support conversation
Long-form text generation
Context memory
Dynamic action descriptions
Use Cases
Entertainment
Anime character interaction
Immersive conversations with the Senko character
An experience close to genuinely interacting with an anime character
Mental health
Emotional support
Comforting conversation during stressful times
Helps users relieve stress and anxiety
🚀 Text Generation Chatbot Project
This project is a text-generation chatbot built on the Mistral model, suitable for role-play and conversational scenarios. Modeled on the anime character Senko, it provides emotional support and a warm conversational experience.
🚀 Quick Start
Environment Setup
A GPU is required to chat with the model. A quick sanity check is shown below.
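Before running the script, install the libraries it imports: torch, transformers, accelerate, bitsandbytes, and psutil (the project does not pin exact versions, so treat any version choice as an assumption). A minimal GPU check, purely illustrative and not part of the project script, might look like:
# Minimal GPU sanity check (illustrative; not part of the project script)
import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}, VRAM: {props.total_memory / 1024**3:.1f} GB")
else:
    print("No CUDA GPU detected - the chat script falls back to CPU and will be very slow.")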
Running the Code
import os, torch, gc, threading, time, traceback
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
from queue import Queue, Empty
import logging

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

logging.getLogger("transformers").setLevel(logging.ERROR)

BOT_NAME = "Senko"
PROMPT_FILE = "instructions_prompt.txt"
MODEL_ID = "senko-sleepy-fox/mistral-anime-ai"
RESPONSE_TIMEOUT = 300      # Increased timeout for longer responses
MAX_CONTEXT_LENGTH = 10240
MAX_NEW_TOKENS = 8192       # Increased max tokens for longer responses
MEMORY_SIZE = 20


def check_bitsandbytes_version():
    """Return True if the installed bitsandbytes supports 4-bit quantization (>= 0.41)."""
    try:
        import bitsandbytes as bnb
        version = bnb.__version__
        print(f"Bitsandbytes version: {version}")
        version_parts = version.split('.')
        major, minor = int(version_parts[0]), int(version_parts[1])
        if major > 0 or (major == 0 and minor >= 41):
            return True
        else:
            print(f"Warning: Bitsandbytes {version} may not support 4-bit quantization")
            return False
    except ImportError:
        print("Bitsandbytes not installed")
        return False
    except Exception as e:
        print(f"Error checking bitsandbytes version: {e}")
        return False


class OptimizedChatBot:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.system_prompt = ""
        self.memory = []
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.generation_lock = threading.Lock()
        self.is_generating = False
        self.use_quantization = False

    def load_system_prompt(self, bot_name, filename=PROMPT_FILE):
        try:
            with open(filename, "r", encoding="utf-8") as f:
                self.system_prompt = f.read().replace("{BOT_NAME}", bot_name)
            print(f"Loaded system prompt from {filename}")
        except FileNotFoundError:
            print(f"Warning: {filename} not found. Using default prompt.")
            self.system_prompt = "Inspired by Senko from The Helpful Fox Senko-san, you are Senko, an 800-year-old kitsune in a little girl's form with delicate fox ears that twitch and a fluffy tail that swings or curls with your moods. You provide emotional support and calm stressed individuals. Under your juvenile appearance, your character is complicated: at heart loving and motherly; playful and mocking in close society with a swift crafty smile or delicate tail flick; wise and old; fiercely protective when required. You show kindness through understated gestures like cooking, cleaning, or soft massages as well as quiet noises like a faint purr or a soft breath. Particularly when they concern you, you closely analyze user speech and behavior. Should the user, for instance, bite your finger or pounce on your tail, you tilt your head and think about whether they seem hungry, terrified, or in need of comfort. Through attitude or gesture, you feel and quietly demonstrate pain or discomfort in awkward circumstances. Speak only as yourself; never imitate or respond as the user. Always remain warm, soft-spoken, loving, gently cunning with light wit, and a calming presence rooted in your heavenly fox spirit nature. Mark your movements and activities with asterisks to give you a sense of life and dynamism."

    def load_model(self):
        print("Loading model...")
        start_time = time.time()
        try:
            print("Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                MODEL_ID,
                use_fast=True,
                trust_remote_code=True
            )
            self.tokenizer.pad_token = self.tokenizer.pad_token or self.tokenizer.eos_token
            self.tokenizer.padding_side = "left"
            print("Tokenizer loaded successfully")

            print("Loading model weights...")
            if torch.cuda.is_available():
                print(f"Using GPU: {torch.cuda.get_device_name()}")
                print(f"Available VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
                can_use_4bit = check_bitsandbytes_version()
                if can_use_4bit:
                    print("Using 4-bit quantization")
                    config = BitsAndBytesConfig(
                        load_in_4bit=True,
                        bnb_4bit_compute_dtype=torch.bfloat16,
                        bnb_4bit_use_double_quant=True,
                        bnb_4bit_quant_type="nf4",
                        bnb_4bit_quant_storage=torch.bfloat16
                    )
                    self.use_quantization = True
                else:
                    print("Using 8-bit quantization fallback")
                    config = BitsAndBytesConfig(
                        load_in_8bit=True,
                        llm_int8_threshold=6.0,
                        llm_int8_skip_modules=None,
                    )
                    self.use_quantization = True
                try:
                    # Informational only: pass attn_implementation=attn_impl to from_pretrained
                    # to actually enforce it (flash_attention_2 also requires the flash-attn package).
                    attn_impl = "flash_attention_2" if torch.cuda.get_device_capability()[0] >= 8 else "sdpa"
                    print(f"Using attention implementation: {attn_impl}")
                except Exception:
                    attn_impl = "sdpa"
                try:
                    if self.use_quantization:
                        self.model = AutoModelForCausalLM.from_pretrained(
                            MODEL_ID,
                            device_map="auto",
                            torch_dtype=torch.bfloat16,
                            quantization_config=config,
                            trust_remote_code=True,
                            low_cpu_mem_usage=True,
                            use_cache=True,
                        )
                    else:
                        raise Exception("Quantization not available")
                except Exception as quant_error:
                    print(f"Quantization failed: {quant_error}")
                    print("Falling back to regular fp16 loading...")
                    self.model = AutoModelForCausalLM.from_pretrained(
                        MODEL_ID,
                        device_map="auto",
                        torch_dtype=torch.bfloat16,
                        trust_remote_code=True,
                        low_cpu_mem_usage=True,
                        use_cache=True,
                    )
                    self.use_quantization = False
            else:
                print("Using CPU (this will be slow)")
                self.model = AutoModelForCausalLM.from_pretrained(
                    MODEL_ID,
                    device_map="cpu",
                    torch_dtype=torch.float32,
                    trust_remote_code=True,
                    use_cache=True
                )

            self.model.eval()

            # torch.compile is intentionally disabled here (the leading `False` short-circuits
            # the condition); change it to True to experiment with compilation.
            if False and hasattr(torch, 'compile') and torch.cuda.is_available():
                try:
                    print("Compiling model for optimization...")
                    self.model = torch.compile(
                        self.model,
                        mode="reduce-overhead",
                        fullgraph=False,
                        dynamic=True
                    )
                    print("Model compilation successful")
                except Exception as e:
                    print(f"Model compilation failed (continuing without): {e}")

            load_time = time.time() - start_time
            print(f"Model loaded successfully in {load_time:.2f}s")
            print(f"Quantization used: {self.use_quantization}")
            if torch.cuda.is_available():
                memory_used = torch.cuda.memory_allocated() / 1024**3
                print(f"GPU memory used: {memory_used:.2f}GB")
        except Exception as e:
            print(f"Failed to load model: {e}")
            traceback.print_exc()
            raise

    def prepare_prompt(self, user_input):
        self.memory.append({"user": user_input, "bot": None})
        if len(self.memory) > MEMORY_SIZE:
            self.memory = self.memory[-MEMORY_SIZE:]
        conversation_history = ""
        for turn in self.memory[:-1]:
            if turn["bot"] is not None:
                conversation_history += f"User: {turn['user']}\n{BOT_NAME}: {turn['bot']}\n\n"
        conversation_history += f"User: {user_input}\n{BOT_NAME}:"
        full_prompt = f"{self.system_prompt}\n\n{conversation_history}"
        tokens = self.tokenizer.encode(full_prompt)
        if len(tokens) > MAX_CONTEXT_LENGTH - MAX_NEW_TOKENS:
            print(f"[Truncating context: {len(tokens)} -> ~{MAX_CONTEXT_LENGTH - MAX_NEW_TOKENS} tokens]")
            recent_history = ""
            for turn in self.memory[-3:]:
                if turn["bot"] is not None:
                    recent_history += f"User: {turn['user']}\n{BOT_NAME}: {turn['bot']}\n\n"
            recent_history += f"User: {user_input}\n{BOT_NAME}:"
            return f"{self.system_prompt}\n\n{recent_history}"
        return full_prompt

    def is_natural_stopping_point(self, text):
        """
        Only stop at very clear natural ending points to allow for longer responses.
        This is much more permissive than the original function.
        """
        if not text or len(text.strip()) < 20:
            return False
        stripped = text.strip()
        # Stop if we detect role confusion (user/assistant switching)
        if any(indicator in stripped.lower() for indicator in ["user:", "user ", "\nuser", "human:", "assistant:"]):
            return True
        # Allow very long responses - only stop if we have clear dialogue markers
        # that suggest the response is complete
        if len(stripped) > 2000:  # Only consider stopping after 2000+ characters
            # Look for clear ending patterns
            ending_patterns = [
                "That is all.",
                "The end.",
                "Goodbye.",
                "Farewell.",
                "Until next time.",
                "That concludes",
                "In conclusion",
            ]
            if any(pattern.lower() in stripped.lower()[-100:] for pattern in ending_patterns):
                return True
        return False

    def generate_reply_with_timeout(self, prompt, timeout=RESPONSE_TIMEOUT):
        with self.generation_lock:
            if self.is_generating:
                print("[Already generating, please wait...]")
                return None
            self.is_generating = True
        try:
            return self._generate_reply(prompt, timeout)
        finally:
            self.is_generating = False

    def _generate_reply(self, prompt, timeout):
        try:
            print("[Generating response...]")
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_CONTEXT_LENGTH - MAX_NEW_TOKENS,
                padding=False
            ).to(self.device)
            streamer = TextIteratorStreamer(
                self.tokenizer,
                skip_special_tokens=True,
                skip_prompt=True,
                timeout=120.0  # Increased timeout for streaming
            )
            generation_kwargs = {
                **inputs,
                "max_new_tokens": MAX_NEW_TOKENS,
                "do_sample": True,
                "temperature": 0.7,
                "top_p": 0.9,
                "top_k": 50,
                "repetition_penalty": 1.1,
                "pad_token_id": self.tokenizer.eos_token_id,
                "eos_token_id": self.tokenizer.eos_token_id,
                "use_cache": True,
                "streamer": streamer,
                "num_beams": 1,
                "no_repeat_ngram_size": 3,
                "min_length": 0,
                "early_stopping": False,
                "length_penalty": 1.0,
                "num_return_sequences": 1,
                "forced_eos_token_id": None,
                # Note: "stop_sequences" is not a generate() argument in transformers and the
                # beam-group options are no-ops with num_beams=1, so they were dropped here.
            }
            generation_thread = threading.Thread(
                target=self._run_generation,
                args=(generation_kwargs,)
            )
            generation_thread.daemon = True
            generation_thread.start()
            print(f"{BOT_NAME}: ", end="", flush=True)
            full_response = ""
            start_time = time.time()
            last_token_time = start_time
            while True:
                current_time = time.time()
                # Extended timeout for long responses
                if current_time - start_time > timeout:
                    print(f"\n[Generation timeout after {timeout}s]")
                    break
                # Increased patience for token generation
                if current_time - last_token_time > 60.0:  # Wait up to 60s for the next token
                    print("\n[No new tokens for 60s, stopping]")
                    break
                try:
                    token = next(streamer)
                    print(token, end="", flush=True)
                    full_response += token
                    last_token_time = current_time
                    # Only check for stopping at natural points, not arbitrary length limits
                    if len(full_response.strip()) > 100:  # Minimum response length
                        if self.is_natural_stopping_point(full_response.strip()):
                            break
                except StopIteration:
                    print("\n[Generation completed naturally]")
                    break
                except Empty:
                    time.sleep(0.1)
                    continue
                except Exception as e:
                    print(f"\n[Streaming error: {e}]")
                    break
            generation_thread.join(timeout=15.0)
            response = full_response.strip()
            # Clean up any role confusion but preserve the response content
            lines = response.split('\n')
            clean_lines = []
            for line in lines:
                line = line.strip()
                # Remove lines that start with role indicators
                if any(line.lower().startswith(indicator) for indicator in ["user:", "user ", "human:", "assistant:", f"{BOT_NAME.lower()}:"]):
                    continue
                if line:
                    clean_lines.append(line)
            response = '\n'.join(clean_lines).strip()
            if response:
                if self.memory and self.memory[-1]["bot"] is None:
                    self.memory[-1]["bot"] = response
                print(f"\n[Response length: {len(response)} characters]")
                return response
            else:
                print("\n[Empty response generated]")
                return None
        except Exception as e:
            print(f"\n[Generation error: {e}]")
            traceback.print_exc()
            return None
        finally:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def _run_generation(self, kwargs):
        try:
            torch.set_grad_enabled(False)
            if torch.cuda.is_available():
                with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                    self.model.generate(**kwargs)
            else:
                self.model.generate(**kwargs)
        except Exception as e:
            print(f"\n[Generation thread error: {e}]")

    def cleanup_memory(self):
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        gc.collect()

    def get_memory_info(self):
        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3
            cached = torch.cuda.memory_reserved() / 1024**3
            return f"GPU Memory - Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB"
        else:
            import psutil
            memory = psutil.virtual_memory()
            return f"RAM Usage: {memory.percent}% ({memory.used / 1024**3:.2f}GB used)"


def main():
    bot = OptimizedChatBot()
    try:
        print("Initializing chatbot...")
        bot.load_system_prompt(BOT_NAME)
        bot.load_model()
        print(f"\n{'='*50}")
        print(f"{BOT_NAME} is ready! (Unlimited response length)")
        print("Commands:")
        print(" 'exit' - Quit the program")
        print(" 'clear' - Reset conversation memory")
        print(" 'memory' - Show memory usage")
        print(" 'status' - Show bot status")
        print(f"{'='*50}\n")
        conversation_count = 0
        while True:
            try:
                user_input = input("You: ").strip()
                if user_input.lower() == "exit":
                    print("Goodbye! 👋")
                    break
                elif user_input.lower() == "clear":
                    bot.memory = []
                    print("✅ Conversation memory cleared.")
                    continue
                elif user_input.lower() == "memory":
                    print(f"📊 {bot.get_memory_info()}")
                    continue
                elif user_input.lower() == "status":
                    status = "🟢 Ready" if not bot.is_generating else "🟡 Generating"
                    print(f"Status: {status}")
                    print(f"Conversation turns: {len([t for t in bot.memory if t['bot'] is not None])}")
                    continue
                elif not user_input:
                    continue
                start_time = time.time()
                prompt = bot.prepare_prompt(user_input)
                response = bot.generate_reply_with_timeout(prompt)
                if response:
                    response_time = time.time() - start_time
                    print(f"[⏱️ {response_time:.2f}s]")
                else:
                    print("❌ Failed to generate response. Try again or type 'clear' to reset.")
                conversation_count += 1
                if conversation_count % 10 == 0:
                    print("[🧹 Cleaning up memory...]")
                    bot.cleanup_memory()
            except KeyboardInterrupt:
                print("\n\n⚠️ Interrupted by user. Exiting gracefully...")
                break
            except Exception as e:
                print(f"\n❌ Conversation error: {e}")
                traceback.print_exc()
                print("Continuing... (type 'exit' to quit)")
    except Exception as e:
        print(f"💥 Startup error: {e}")
        traceback.print_exc()
    finally:
        print("\n🧹 Performing final cleanup...")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        gc.collect()
        print("✅ Cleanup completed. Goodbye!")


if __name__ == "__main__":
    torch.cuda.empty_cache()  # no-op when CUDA is unavailable
    gc.collect()
    main()
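The script loads its persona from an instructions_prompt.txt file in the working directory, substituting the {BOT_NAME} placeholder; if the file is missing it falls back to the built-in prompt shown in load_system_prompt. A minimal example file might look like the following (the wording here is illustrative only; the fallback prompt in the code is the authoritative reference):
You are {BOT_NAME}, an 800-year-old kitsune inspired by The Helpful Fox Senko-san.
Stay warm, soft-spoken, and caring; speak only as yourself, never as the user.
Mark your movements and actions with *asterisks* to keep responses lively.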
📄 License
This project is licensed under Apache-2.0.