Update README.md
README.md
CHANGED
@@ -20,3 +20,192 @@ tags:
This qwen2 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
## How the MOE System Works

This model is a core component of a larger Multi-Expert Question Answering System. Here's a breakdown of the system's functionality:

1. **Model Loading:** The system loads the "director" LLM and keeps other expert LLMs (e.g., for programming, biology, mathematics) ready for use.
2. **Expert Routing:** When a user asks a question, the system either:
   - uses keyword matching to identify the relevant domain, or
   - consults the director LLM to classify the question's category (see the routing sketch below).
3. **Dynamic Expert Loading:** The system loads the chosen expert LLM into memory, releasing any previously active expert to conserve resources.
4. **Response Generation:** The selected expert LLM receives the question and generates a tailored answer.
5. **Chat Interface:** A simple chat interface lets the user interact with the MOE system.

This MOE approach enhances efficiency and accuracy compared to relying on a single, general-purpose LLM.
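As a quick illustration of the routing step, here is a minimal, self-contained sketch of the keyword-first decision with a director fallback. The function name and trimmed keyword lists are illustrative only; the actual implementation appears in the Code Example section below.

```python
# Illustrative routing sketch (hypothetical names, trimmed keyword lists).
KEYWORDS = {
    "biology": ["cell", "dna", "protein"],
    "mathematics": ["equation", "integral", "algebra"],
    "programming": ["python", "code", "api"],
}

def route(question: str) -> str:
    """Return the expert for a question, or 'director' when no keyword matches."""
    q = question.lower()
    for expert, words in KEYWORDS.items():
        if any(word in q for word in words):
            return expert  # step 2a: keyword match
    # Step 2b: no keyword matched, so the director LLM would classify the question.
    return "director"

print(route("How do I parse JSON in Python?"))      # -> programming
print(route("Explain how DNA replication works."))  # -> biology
```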
### Repository and Additional Information

- Full Code: https://huggingface.co/Agnuxo/Qwen2-1.5B-Instruct_MOE_Director_16bit/resolve/main/MOE-LLMs3.py
- GitHub Repository: https://github.com/Agnuxo1/NEBULA
## Code Example

The following code demonstrates the implementation of the Multi-Expert Question Answering System:
```python
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Global parameters for each model
MODEL_PARAMS = {
    "director": {
        "temperature": 0.7,  # Adjust as needed
        "max_tokens": 25     # Adjust as needed
    },
    "programming": {
        "temperature": 0.5,
        "max_tokens": 200
    },
    "biology": {
        "temperature": 0.5,
        "max_tokens": 200
    },
    "mathematics": {
        "temperature": 0.5,
        "max_tokens": 200
    }
}

# Model configuration
MODEL_CONFIG = {
    "director": {
        "name": "Agnuxo/Qwen2_0.5B_Spanish_English_raspberry_pi_16bit",
        "task": "text-generation",
    },
    "programming": {
        "name": "Qwen/Qwen2-1.5B-Instruct",
        "task": "text-generation",
    },
    "biology": {
        "name": "Agnuxo/Qwen2-1.5B-Instruct_MOE_BIOLOGY_assistant_16bit",
        "task": "text-generation",
    },
    "mathematics": {
        "name": "Qwen/Qwen2-Math-1.5B-Instruct",
        "task": "text-generation",
    }
}

# Keywords for each subject (English and Spanish)
KEYWORDS = {
    "biology": ["cell", "DNA", "protein", "evolution", "genetics", "ecosystem", "organism", "metabolism", "photosynthesis", "microbiology", "célula", "ADN", "proteína", "evolución", "genética", "ecosistema", "organismo", "metabolismo", "fotosíntesis", "microbiología"],
    "mathematics": ["math", "mathematics", "equation", "integral", "derivative", "function", "geometry", "algebra", "statistics", "probability", "ecuación", "integral", "derivada", "función", "geometría", "álgebra", "estadística", "probabilidad"],
    "programming": ["python", "java", "C++", "HTML", "script", "code", "Dataset", "API", "framework", "debugging", "algorithm", "compiler", "database", "CSS", "JSON", "XML", "encryption", "IDE", "repository", "Git", "version control", "front-end", "back-end", "stack trace", "REST", "machine learning"]
}


class MOELLM:
    def __init__(self):
        self.current_expert = None
        self.current_model = None
        self.current_tokenizer = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        self.load_director_model()

    def load_director_model(self):
        """Loads the director model."""
        print("Loading director model...")
        model_name = MODEL_CONFIG["director"]["name"]
        self.director_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.director_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to(self.device)
        print("Director model loaded.")

    def load_expert_model(self, expert):
        """Dynamically loads an expert model, releasing memory from the previous model."""
        if expert not in MODEL_CONFIG:
            raise ValueError(f"Unknown expert: {expert}")

        if self.current_expert != expert:
            print(f"Loading expert model: {expert}...")

            # Free memory from the current model if it exists
            if self.current_model:
                del self.current_model
                del self.current_tokenizer
                torch.cuda.empty_cache()

            model_config = MODEL_CONFIG[expert]
            self.current_tokenizer = AutoTokenizer.from_pretrained(model_config["name"])
            self.current_model = AutoModelForCausalLM.from_pretrained(model_config["name"], torch_dtype=torch.float16).to(self.device)
            self.current_expert = expert

            print(f"{expert.capitalize()} model loaded.")

    def determine_expert_by_keywords(self, question):
        """Determines the expert based on keywords in the question."""
        question_lower = question.lower()
        for expert, keywords in KEYWORDS.items():
            # Compare case-insensitively so keywords like "DNA" or "API" still match
            if any(keyword.lower() in question_lower for keyword in keywords):
                return expert
        return None

    def determine_expert(self, question):
        """Determines which expert should answer the question."""
        expert = self.determine_expert_by_keywords(question)
        if expert:
            print(f"Expert determined by keyword: {expert}")
            return expert

        prompt = f"Classify the following question into one of these categories: programming, biology, mathematics. Question: {question}\nCategory:"
        response = self.director_model.generate(
            **self.director_tokenizer(prompt, return_tensors="pt").to(self.device),
            max_new_tokens=MODEL_PARAMS["director"]["max_tokens"],
            temperature=MODEL_PARAMS["director"]["temperature"],
            do_sample=True,  # temperature only takes effect when sampling is enabled
            num_return_sequences=1
        )
        response_text = self.director_tokenizer.decode(response[0], skip_special_tokens=True)
        expert = response_text.split(":")[-1].strip().lower()
        if expert not in MODEL_CONFIG:
            expert = "director"
        print(f"Redirecting question to: {expert}")
        return expert

    def generate_response(self, question, expert):
        """Generates a response using the appropriate model."""
        try:
            self.load_expert_model(expert)
            prompt = f"Answer the following question as an expert in {expert}: {question}\nAnswer:"

            if expert == "director":
                model = self.director_model
                tokenizer = self.director_tokenizer
            else:
                model = self.current_model
                tokenizer = self.current_tokenizer

            response = model.generate(
                **tokenizer(prompt, return_tensors="pt").to(self.device),
                max_new_tokens=MODEL_PARAMS[expert]["max_tokens"],
                temperature=MODEL_PARAMS[expert]["temperature"],
                do_sample=True,  # temperature only takes effect when sampling is enabled
                num_return_sequences=1
            )
            response_text = tokenizer.decode(response[0], skip_special_tokens=True)
            return response_text.split("Answer:")[-1].strip()
        except Exception as e:
            print(f"Error generating response: {str(e)}")
            return "Sorry, there was an error processing your request. Please try again."

    def chat_interface(self):
        """Simple chat interface."""
        print("Welcome to the MOE-LLM chat. Type 'exit' to quit.")
        while True:
            question = input("\nYou: ")
            if question.lower() in ['exit', 'quit']:
                break

            try:
                expert = self.determine_expert(question)
                response = self.generate_response(question, expert)
                print(f"\n{expert.capitalize()}: {response}")
            except Exception as e:
                print(f"Error in chat: {str(e)}")
                print("Please try asking another question.")


if __name__ == "__main__":
    moe_llm = MOELLM()
    moe_llm.chat_interface()
```
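Running the script directly (e.g. `python MOE-LLMs3.py`, the filename from the Full Code link above) starts the interactive chat loop. To answer a single question programmatically instead, a minimal usage sketch follows; it assumes the `MOELLM` class above is already defined or imported, that the configured model repositories can be downloaded, and that enough GPU/CPU memory is available.

```python
# Usage sketch: route and answer one question without the chat interface.
# Assumes the MOELLM class and configuration from the example above are in scope.
moe = MOELLM()                                    # loads the director model
question = "What is the derivative of x**2?"
expert = moe.determine_expert(question)           # keyword- or director-based routing
answer = moe.generate_response(question, expert)  # loads the expert and generates an answer
print(f"[{expert}] {answer}")
```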