DawnC committed on
Commit
b91a1e2
1 Parent(s): e0dbe8c

Upload smart_breed_matcher.py

Browse files
Files changed (1) hide show
  1. smart_breed_matcher.py +392 -0
smart_breed_matcher.py ADDED
@@ -0,0 +1,392 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import torch
3
+ import re
4
+ import numpy as np
5
+ from typing import List, Dict, Tuple, Optional
6
+ from dataclasses import dataclass
7
+ from breed_health_info import breed_health_info
8
+ from breed_noise_info import breed_noise_info
9
+ from dog_database import dog_data
10
+ from scoring_calculation_system import UserPreferences
11
+ from sentence_transformers import SentenceTransformer, util
12
+
13
class SmartBreedMatcher:
    """Recommend dog breeds from a free-text user description.

    Each entry of ``dog_data`` is an indexable row; this class reads index 1
    (breed name), 2 (size), 4 (temperament), 7 (exercise needs) and
    9 (description). Text similarity is computed with a sentence-embedding
    model, and health/noise information comes from the module-level
    ``breed_health_info`` and ``breed_noise_info`` mappings.
    """

    # Numeric scale used to compare noise levels; unknown levels are treated
    # as moderate.
    _NOISE_LEVELS = {
        'Low': 1,
        'Moderate': 2,
        'High': 3,
        'Unknown': 2,
    }

    # Keywords that indicate a severe health problem (0.1 deduction each).
    # Plain substring matching is intentional so that, for example,
    # "progressive" also matches inside longer phrases.
    _SEVERE_CONDITION_PATTERNS = tuple(
        re.compile(re.escape(keyword))
        for keyword in (
            'cancer', 'cardiomyopathy', 'epilepsy', 'dysplasia',
            'bloat', 'progressive', 'syndrome',
        )
    )

    # Keywords that indicate a moderate health problem (0.05 deduction each).
    # "ear" is matched as a whole word: a bare substring test would also fire
    # on "heart", "early" or "years". The others stay substring matches so
    # that "thyroid" still matches "hypothyroidism".
    _MODERATE_CONDITION_PATTERNS = (
        re.compile('allergies'),
        re.compile('infections'),
        re.compile('thyroid'),
        re.compile('luxation'),
        re.compile('skin problems'),
        re.compile(r'\bears?\b'),
    )

    def __init__(self, dog_data: List[Tuple]):
        """Store the breed rows and load the sentence-embedding model.

        Args:
            dog_data: Breed rows, indexed as described in the class docstring.
        """
        self.dog_data = dog_data
        self.model = SentenceTransformer('all-mpnet-base-v2')
        self._embedding_cache = {}

    def _get_cached_embedding(self, text: str) -> torch.Tensor:
        """Return the embedding for ``text``, encoding it at most once.

        ``convert_to_tensor=True`` makes the return value match the annotated
        type and avoids re-converting the cached value on every comparison.
        """
        if text not in self._embedding_cache:
            self._embedding_cache[text] = self.model.encode(
                text, convert_to_tensor=True
            )
        return self._embedding_cache[text]

    def _text_similarity(self, text_a: str, text_b: str) -> float:
        """Return the cosine similarity between the embeddings of two texts."""
        return float(util.pytorch_cos_sim(
            self._get_cached_embedding(text_a),
            self._get_cached_embedding(text_b),
        ))

    def _categorize_breeds(self) -> Dict:
        """Group breed names into coarse categories by description keywords.

        A breed goes into the first category (in the order checked below)
        whose keyword occurs as a substring of its lower-cased description;
        breeds that match nothing are left out.

        Returns:
            Mapping of category name to the list of matching breed names.
        """
        categories = {
            'working_dogs': [],
            'herding_dogs': [],
            'hunting_dogs': [],
            'companion_dogs': [],
            'guard_dogs': [],
        }

        # Checked in this order; the first hit wins.
        rules = (
            ('herding_dogs', ('herding', 'shepherd', 'cattle', 'flock')),
            ('hunting_dogs', ('hunting', 'hunt', 'retriever', 'pointer')),
            ('companion_dogs', ('companion', 'toy', 'family', 'lap')),
            ('guard_dogs', ('guard', 'protection', 'watchdog')),
            ('working_dogs', ('working', 'draft', 'cart')),
        )

        for breed_info in self.dog_data:
            description = breed_info[9].lower()
            for category, keywords in rules:
                if any(word in description for word in keywords):
                    categories[category].append(breed_info[1])
                    break

        return categories

    @staticmethod
    def _breed_features(breed: Tuple) -> Dict:
        """Extract the fields used for breed-to-breed comparison from a row."""
        return {
            'breed_name': breed[1],
            'size': breed[2],
            'temperament': breed[4],
            'exercise': breed[7],
            'description': breed[9],
        }

    def find_similar_breeds(self, breed_name: str, top_n: int = 5) -> List[Tuple[str, float]]:
        """Return the breeds most similar to ``breed_name``.

        Args:
            breed_name: Name of the reference breed (row index 1).
            top_n: Maximum number of results.

        Returns:
            ``(breed name, similarity)`` pairs sorted by descending
            similarity. Empty when the breed is unknown or ``top_n`` is not
            positive.
        """
        target_breed = next(
            (breed for breed in self.dog_data if breed[1] == breed_name), None
        )
        if not target_breed:
            return []
        # A negative top_n would otherwise slice from the end and return
        # almost every breed.
        if top_n <= 0:
            return []

        target_features = self._breed_features(target_breed)
        similarities = [
            (
                breed[1],
                self._calculate_breed_similarity(
                    target_features, self._breed_features(breed)
                ),
            )
            for breed in self.dog_data
            if breed[1] != breed_name
        ]

        return sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]

    def _calculate_breed_similarity(self, breed1_features: Dict, breed2_features: Dict) -> float:
        """Return a weighted similarity between two breeds.

        Combines description and temperament embedding similarity, size and
        exercise agreement (1.0 when equal, otherwise 0.5), health-score
        closeness and noise-level closeness.

        Args:
            breed1_features: Feature dict with keys ``breed_name``, ``size``,
                ``temperament``, ``exercise`` and ``description``.
            breed2_features: Feature dict with the same keys.
        """
        description_similarity = self._text_similarity(
            breed1_features['description'], breed2_features['description']
        )
        temperament_similarity = self._text_similarity(
            breed1_features['temperament'], breed2_features['temperament']
        )

        size_similarity = 1.0 if breed1_features['size'] == breed2_features['size'] else 0.5
        exercise_similarity = 1.0 if breed1_features['exercise'] == breed2_features['exercise'] else 0.5

        health_score1 = self._calculate_health_score(breed1_features['breed_name'])
        health_score2 = self._calculate_health_score(breed2_features['breed_name'])
        health_similarity = 1.0 - abs(health_score1 - health_score2)

        noise_similarity = self._calculate_noise_similarity(
            breed1_features['breed_name'],
            breed2_features['breed_name'],
        )

        weights = {
            'description': 0.25,
            'temperament': 0.20,
            'exercise': 0.2,
            'size': 0.05,
            'health': 0.15,
            'noise': 0.15,
        }

        return (
            description_similarity * weights['description'] +
            temperament_similarity * weights['temperament'] +
            exercise_similarity * weights['exercise'] +
            size_similarity * weights['size'] +
            health_similarity * weights['health'] +
            noise_similarity * weights['noise']
        )

    def _calculate_final_scores(self, breed_name: str, base_scores: Dict,
                                smart_score: float, is_preferred: bool,
                                similarity_score: float = 0.0) -> Dict:
        """Combine base, smart-match and bonus scores into a final score.

        Args:
            breed_name: Breed name (not used in the calculation).
            base_scores: Per-dimension base scores; ``'overall'`` is used as
                the base score and defaults to 0.7 when absent.
            smart_score: Smart-matching score.
            is_preferred: Whether this is the breed the user asked for.
            similarity_score: Similarity to the requested breed (0-1).

        Returns:
            Dict with ``final_score``, ``base_score``, ``bonus_score`` and
            ``scores`` (the adjusted copy of ``base_scores``), all rounded to
            four decimals.
        """
        weights = {
            'base': 0.6,
            'smart': 0.25,
            'bonus': 0.15,
        }

        base_score = base_scores.get('overall', 0.7)

        bonus_score = 0.0
        if is_preferred:
            # The requested breed gets the top bonus.
            bonus_score = 0.95
        elif similarity_score > 0:
            # Similar breeds get a partial bonus, with similarity capped at 0.8.
            bonus_score = min(0.8, similarity_score) * 0.95

        final_score = (
            base_score * weights['base'] +
            smart_score * weights['smart'] +
            bonus_score * weights['bonus']
        )

        scores = base_scores.copy()

        if is_preferred:
            # Lift every non-overall score by up to 10%, capped at 1.0.
            for key in scores:
                if key != 'overall':
                    scores[key] = min(1.0, scores[key] * 1.1)
        elif similarity_score > 0:
            # Lift every non-overall score by up to 5%, capped at 0.95.
            boost_factor = 1.0 + (similarity_score * 0.05)
            for key in scores:
                if key != 'overall':
                    scores[key] = min(0.95, scores[key] * boost_factor)

        return {
            'final_score': round(final_score, 4),
            'base_score': round(base_score, 4),
            'bonus_score': round(bonus_score, 4),
            'scores': {k: round(v, 4) for k, v in scores.items()},
        }

    @classmethod
    def _score_health_notes(cls, health_notes: str) -> float:
        """Return the unclamped health score implied by free-text notes.

        Starts at 1.0 and deducts 0.1 per distinct severe keyword and 0.05
        per distinct moderate keyword found in the lower-cased notes.
        """
        notes = health_notes.lower()
        severe_count = sum(
            1 for pattern in cls._SEVERE_CONDITION_PATTERNS if pattern.search(notes)
        )
        moderate_count = sum(
            1 for pattern in cls._MODERATE_CONDITION_PATTERNS if pattern.search(notes)
        )

        health_score = 1.0
        health_score -= (severe_count * 0.1)
        health_score -= (moderate_count * 0.05)
        return health_score

    def _calculate_health_score(self, breed_name: str) -> float:
        """Return a health score for a breed, clamped to ``[0.3, 1.0]``.

        Returns 0.5 when the breed has no entry in ``breed_health_info`` or
        the entry has no ``health_notes``. When the instance has a
        ``user_preferences`` attribute, the score is reduced further for
        households with children and for high health sensitivity.
        """
        info = breed_health_info.get(breed_name)
        if info is None:
            return 0.5
        raw_notes = info.get('health_notes')
        if raw_notes is None:
            return 0.5

        health_notes = raw_notes.lower()
        health_score = self._score_health_notes(health_notes)

        # ``user_preferences`` is not set by __init__; it is only honoured
        # when something else has attached it to the instance.
        preferences = getattr(self, 'user_preferences', None)
        if preferences is not None:
            if preferences.has_children:
                if 'requires frequent' in health_notes or 'regular monitoring' in health_notes:
                    health_score *= 0.9

            if preferences.health_sensitivity == 'high':
                health_score *= 0.9

        return max(0.3, min(1.0, health_score))

    def _calculate_noise_similarity(self, breed1: str, breed2: str) -> float:
        """Return how close two breeds' noise levels are, from 0.0 to 1.0.

        Levels are looked up in ``breed_noise_info``; missing or unrecognised
        levels count as moderate.
        """
        noise1 = breed_noise_info.get(breed1, {}).get('noise_level', 'Unknown')
        noise2 = breed_noise_info.get(breed2, {}).get('noise_level', 'Unknown')

        level1 = self._NOISE_LEVELS.get(noise1, 2)
        level2 = self._NOISE_LEVELS.get(noise2, 2)

        # The largest possible gap is 2 (Low vs High), so divide by 2.
        return 1.0 - (abs(level1 - level2) / 2)

    def _general_matching(self, description: str, top_n: int = 10) -> List[Dict]:
        """Rank all breeds against a description when no breed was named.

        The score combines the similarity of the user description to each
        breed's description and temperament with a noise term and a health
        term.

        Returns:
            Up to ``top_n`` match dicts sorted by descending ``score``.
        """
        matches = []

        for breed in self.dog_data:
            breed_name = breed[1]

            desc_similarity = self._text_similarity(description, breed[9])
            temp_similarity = self._text_similarity(description, breed[4])

            # NOTE(review): this compares the breed with itself, so the noise
            # term is always 1.0 and adds the same constant to every score.
            # Kept as-is because the intended comparison target is not
            # visible here.
            noise_similarity = self._calculate_noise_similarity(breed_name, breed_name)
            health_score = self._calculate_health_score(breed_name)
            # Closeness of the health score to a reference value of 0.8.
            health_similarity = 1.0 - abs(health_score - 0.8)

            weights = {
                'description': 0.35,
                'temperament': 0.25,
                'noise': 0.2,
                'health': 0.2,
            }

            final_score = (
                desc_similarity * weights['description'] +
                temp_similarity * weights['temperament'] +
                noise_similarity * weights['noise'] +
                health_similarity * weights['health']
            )

            matches.append({
                'breed': breed_name,
                'score': final_score,
                'is_preferred': False,
                'similarity': final_score,
                'reason': "Matched based on description, temperament, noise level, and health score"
            })

        return sorted(matches, key=lambda x: -x['score'])[:top_n]

    def _detect_breed_preference(self, description: str) -> Optional[str]:
        """Return the breed named in ``description``, or ``None``.

        Breed names are compared case-insensitively with underscores treated
        as spaces and must appear as whole words. When several breed names
        match (e.g. "bull terrier" inside "staffordshire bull terrier"), the
        longest one wins; ties keep the first breed in ``dog_data`` order.
        """
        description_lower = description.lower()

        best_name = None
        best_length = 0
        for breed_info in self.dog_data:
            breed_name = breed_info[1]
            normalized_breed = breed_name.lower().replace('_', ' ').strip()
            # An empty name would produce a pattern that matches anywhere.
            if not normalized_breed or len(normalized_breed) <= best_length:
                continue

            pattern = rf"\b{re.escape(normalized_breed)}\b"
            if re.search(pattern, description_lower):
                best_name = breed_name
                best_length = len(normalized_breed)

        return best_name

    @staticmethod
    def _match_sort_key(match: Dict) -> Tuple:
        """Return the sort key for a match dict (smaller sorts first).

        Order: priority ascending (1 is best), preferred breed first, score
        descending, then breed name ascending. Falls back to ``score`` when
        a match has no ``final_score``.
        """
        score = match.get('final_score', match.get('score', 0))
        return (
            match.get('priority', 3),
            not match.get('is_preferred', False),
            -float(score),
            match.get('breed', ''),
        )

    def _build_match_entry(self, breed_name: str, scores: Dict, score: float,
                           is_preferred: bool, priority: int, reason: str) -> Dict:
        """Assemble the result dict for a preferred or similar breed."""
        return {
            'breed': breed_name,
            'score': score,
            'final_score': scores['final_score'],
            'base_score': scores['base_score'],
            'bonus_score': scores['bonus_score'],
            'is_preferred': is_preferred,
            'priority': priority,
            'health_score': self._calculate_health_score(breed_name),
            'noise_level': breed_noise_info.get(breed_name, {}).get('noise_level', 'Unknown'),
            'reason': reason,
        }

    def match_user_preference(self, description: str, top_n: int = 10) -> List[Dict]:
        """Return the best breed matches for a user's description.

        If the description names a known breed, that breed is listed first
        (priority 1), followed by the breeds most similar to it (priority 2).
        Otherwise every breed is ranked by general similarity (priority 3).

        Args:
            description: Free-text user description.
            top_n: Maximum number of matches to return.

        Returns:
            Match dicts ordered best-first.
        """
        if top_n <= 0:
            return []

        preferred_breed = self._detect_breed_preference(description)

        if preferred_breed:
            # The requested breed gets the top base and smart scores.
            preferred_scores = self._calculate_final_scores(
                preferred_breed,
                {'overall': 1.0},
                smart_score=1.0,
                is_preferred=True,
                similarity_score=1.0,
            )
            matches = [self._build_match_entry(
                preferred_breed,
                preferred_scores,
                score=1.0,
                is_preferred=True,
                priority=1,
                reason="Directly matched your preferred breed",
            )]

            similar_breeds = self.find_similar_breeds(preferred_breed, top_n=top_n - 1)
            for breed_name, similarity in similar_breeds:
                similar_scores = self._calculate_final_scores(
                    breed_name,
                    # Base score kept slightly below the preferred breed's.
                    {'overall': similarity * 0.9},
                    smart_score=similarity,
                    is_preferred=False,
                    similarity_score=similarity,
                )
                matches.append(self._build_match_entry(
                    breed_name,
                    similar_scores,
                    # Never outrank the preferred breed.
                    score=min(0.95, similarity),
                    is_preferred=False,
                    priority=2,
                    reason=f"Similar to {preferred_breed}",
                ))
        else:
            matches = self._general_matching(description, top_n)
            for match in matches:
                match['priority'] = 3
                # Give general matches the same ranking field as the others.
                match.setdefault('final_score', match['score'])

        return sorted(matches, key=self._match_sort_key)[:top_n]