Uploaded model
- Developed by: Agnuxo
- License: apache-2.0
- Finetuned from model: unsloth/qwen2-0.5b-bnb-4bit
This Qwen2 model was trained 2x faster with Unsloth and Hugging Face's TRL library.
How the MOE System Works
This model is a core component of a larger Multi-Expert Question Answering System. Here's a breakdown of the system's functionality:
- Model Loading: The system loads the "director" LLM and keeps other expert LLMs (e.g., for programming, biology, mathematics) ready for use.
- Expert Routing: When a user asks a question, the system:
  - First uses keyword matching to identify the relevant domain.
  - If no keywords match, consults the director LLM to classify the question's category.
- Dynamic Expert Loading: The system loads the chosen expert LLM into memory, optimizing resource usage by releasing any previously active expert.
- Response Generation: The selected expert LLM receives the question and generates a tailored answer.
- Chat Interface: A user-friendly chat interface facilitates interaction with the MOE system.
This MOE approach improves efficiency, since only one expert needs to be resident in memory at a time, and lets each answer come from a domain-specialized model rather than a single general-purpose LLM. The routing step is sketched below.
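The following is a minimal, self-contained sketch of that two-stage routing (keyword match first, director fallback). The names `route_question` and `classify_with_director` are illustrative placeholders; the full implementation appears in the Code Example section.

# Minimal routing sketch: try keyword matching first, then fall back to the
# director LLM. `classify_with_director` is a placeholder for a call that asks
# the director model to name a category.
ROUTING_KEYWORDS = {
    "biology": ["cell", "dna", "protein", "genetics"],
    "mathematics": ["equation", "integral", "derivative", "algebra"],
    "programming": ["python", "code", "api", "debugging"],
}

def route_question(question, classify_with_director):
    question_lower = question.lower()
    # Stage 1: cheap keyword matching
    for expert, keywords in ROUTING_KEYWORDS.items():
        if any(keyword in question_lower for keyword in keywords):
            return expert
    # Stage 2: ask the director LLM to classify the question
    category = classify_with_director(question).strip().lower()
    # Unknown categories are answered by the director itself
    return category if category in ROUTING_KEYWORDS else "director"

In the full system, `classify_with_director` corresponds to a `generate` call on the director model with a classification prompt.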
Repository and Additional Information
- Full Code: https://huggingface.co/Agnuxo/Qwen2-1.5B-Instruct_MOE_Director_16bit/resolve/main/MOE-LLMs3.py
- GitHub Repository: https://github.com/Agnuxo1/NEBULA
Code Example
The following code demonstrates the implementation of the Multi-Expert Question Answering System:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Global generation parameters for each model
MODEL_PARAMS = {
    "director": {
        "temperature": 0.7,  # Adjust as needed
        "max_tokens": 25     # Adjust as needed
    },
    "programming": {
        "temperature": 0.5,
        "max_tokens": 200
    },
    "biology": {
        "temperature": 0.5,
        "max_tokens": 200
    },
    "mathematics": {
        "temperature": 0.5,
        "max_tokens": 200
    }
}
# Model configuration
MODEL_CONFIG = {
    "director": {
        "name": "Agnuxo/Qwen2_0.5B_Spanish_English_raspberry_pi_16bit",
        "task": "text-generation",
    },
    "programming": {
        "name": "Qwen/Qwen2-1.5B-Instruct",
        "task": "text-generation",
    },
    "biology": {
        "name": "Agnuxo/Qwen2-1.5B-Instruct_MOE_BIOLOGY_assistant_16bit",
        "task": "text-generation",
    },
    "mathematics": {
        "name": "Qwen/Qwen2-Math-1.5B-Instruct",
        "task": "text-generation",
    }
}
# Keywords for each subject (English and Spanish)
KEYWORDS = {
    "biology": ["cell", "DNA", "protein", "evolution", "genetics", "ecosystem", "organism", "metabolism", "photosynthesis", "microbiology", "célula", "ADN", "proteína", "evolución", "genética", "ecosistema", "organismo", "metabolismo", "fotosíntesis", "microbiología"],
    "mathematics": ["math", "mathematics", "equation", "integral", "derivative", "function", "geometry", "algebra", "statistics", "probability", "ecuación", "integral", "derivada", "función", "geometría", "álgebra", "estadística", "probabilidad"],
    "programming": ["python", "java", "C++", "HTML", "script", "code", "dataset", "API", "framework", "debugging", "algorithm", "compiler", "database", "CSS", "JSON", "XML", "encryption", "IDE", "repository", "Git", "version control", "front-end", "back-end", "stack trace", "REST", "machine learning"]
}
class MOELLM:
    def __init__(self):
        self.current_expert = None
        self.current_model = None
        self.current_tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        self.load_director_model()

    def load_director_model(self):
        """Loads the director model."""
        print("Loading director model...")
        model_name = MODEL_CONFIG["director"]["name"]
        self.director_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.director_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(self.device)
        print("Director model loaded.")

    def load_expert_model(self, expert):
        """Dynamically loads an expert model, releasing memory from the previous model."""
        if expert not in MODEL_CONFIG:
            raise ValueError(f"Unknown expert: {expert}")
        if self.current_expert != expert:
            print(f"Loading expert model: {expert}...")
            # Free memory from the current model if it exists
            if self.current_model is not None:
                del self.current_model
                del self.current_tokenizer
                torch.cuda.empty_cache()
            model_config = MODEL_CONFIG[expert]
            self.current_tokenizer = AutoTokenizer.from_pretrained(model_config["name"])
            self.current_model = AutoModelForCausalLM.from_pretrained(model_config["name"], torch_dtype=torch.float16).to(self.device)
            self.current_expert = expert
            print(f"{expert.capitalize()} model loaded.")

    def determine_expert_by_keywords(self, question):
        """Determines the expert based on keywords in the question."""
        question_lower = question.lower()
        for expert, keywords in KEYWORDS.items():
            # Compare in lowercase so keywords such as "DNA" or "C++" still match
            if any(keyword.lower() in question_lower for keyword in keywords):
                return expert
        return None

    def determine_expert(self, question):
        """Determines which expert should answer the question."""
        expert = self.determine_expert_by_keywords(question)
        if expert:
            print(f"Expert determined by keyword: {expert}")
            return expert
        # No keyword matched: fall back to the director model for classification
        prompt = f"Classify the following question into one of these categories: programming, biology, mathematics. Question: {question}\nCategory:"
        response = self.director_model.generate(
            **self.director_tokenizer(prompt, return_tensors="pt").to(self.device),
            max_new_tokens=MODEL_PARAMS["director"]["max_tokens"],
            temperature=MODEL_PARAMS["director"]["temperature"],
            do_sample=True,
            num_return_sequences=1
        )
        response_text = self.director_tokenizer.decode(response[0], skip_special_tokens=True)
        expert = response_text.split(":")[-1].strip().lower()
        if expert not in MODEL_CONFIG:
            expert = "director"
        print(f"Redirecting question to: {expert}")
        return expert

    def generate_response(self, question, expert):
        """Generates a response using the appropriate model."""
        try:
            if expert == "director":
                # The director is already resident; no need to load it as an expert
                model = self.director_model
                tokenizer = self.director_tokenizer
            else:
                self.load_expert_model(expert)
                model = self.current_model
                tokenizer = self.current_tokenizer
            prompt = f"Answer the following question as an expert in {expert}: {question}\nAnswer:"
            response = model.generate(
                **tokenizer(prompt, return_tensors="pt").to(self.device),
                max_new_tokens=MODEL_PARAMS[expert]["max_tokens"],
                temperature=MODEL_PARAMS[expert]["temperature"],
                do_sample=True,
                num_return_sequences=1
            )
            response_text = tokenizer.decode(response[0], skip_special_tokens=True)
            return response_text.split("Answer:")[-1].strip()
        except Exception as e:
            print(f"Error generating response: {str(e)}")
            return "Sorry, there was an error processing your request. Please try again."

    def chat_interface(self):
        """Simple chat interface."""
        print("Welcome to the MOE-LLM chat. Type 'exit' to quit.")
        while True:
            question = input("\nYou: ")
            if question.lower() in ['exit', 'quit']:
                break
            try:
                expert = self.determine_expert(question)
                response = self.generate_response(question, expert)
                print(f"\n{expert.capitalize()}: {response}")
            except Exception as e:
                print(f"Error in chat: {str(e)}")
                print("Please try asking another question.")


if __name__ == "__main__":
    moe_llm = MOELLM()
    moe_llm.chat_interface()
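For standalone experimentation, the director model can also be loaded on its own with the transformers `pipeline` API. The snippet below is a minimal sketch, assuming the model id of this repository (taken from the Full Code link above); adjust the device and generation settings as needed.

from transformers import pipeline

# Load only the director model for question classification.
director = pipeline(
    "text-generation",
    model="Agnuxo/Qwen2-1.5B-Instruct_MOE_Director_16bit",  # assumed repo id, see link above
)

prompt = (
    "Classify the following question into one of these categories: "
    "programming, biology, mathematics. "
    "Question: What is a Python decorator?\nCategory:"
)
result = director(prompt, max_new_tokens=10)
print(result[0]["generated_text"])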