harismlnaslm committed on
Commit
ef903ff
·
1 Parent(s): e80082d

Add fixed app and dataset loading script

Browse files
Files changed (2) hide show
  1. app_fixed.py +284 -0
  2. fix_dataset_loading.sh +33 -0
app_fixed.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Textilindo AI Assistant - Hugging Face Spaces
4
+ """
5
+
6
+ from flask import Flask, request, jsonify, render_template
7
+ import os
8
+ import json
9
+ import requests
10
+ from difflib import SequenceMatcher
11
+ import logging
12
+
13
+ # Setup logging
14
+ logging.basicConfig(level=logging.INFO)
15
+ logger = logging.getLogger(__name__)
16
+
17
+ app = Flask(__name__)
18
+
19
def load_system_prompt(default_text, base_dir=None):
    """Return the system prompt stored in ``configs/system_prompt.md``.

    If the file contains a triple-double-quoted section, the stripped text
    between the first and the last delimiter is used.  Otherwise every line
    whose first non-blank character is ``#`` is dropped and the remaining
    text is used.

    Args:
        default_text: Prompt returned when the file is missing, cannot be
            read, or yields no usable text.
        base_dir: Directory that contains the ``configs`` folder.  Defaults
            to the directory of this module.

    Returns:
        The extracted prompt, or ``default_text``.
    """
    if base_dir is None:
        base_dir = os.path.dirname(__file__)
    md_path = os.path.join(base_dir, 'configs', 'system_prompt.md')

    try:
        with open(md_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        # A missing prompt file is the normal "not configured" case.
        return default_text
    except (OSError, UnicodeDecodeError) as e:
        # Still best-effort, but leave a trace instead of failing silently.
        logger.warning(f"Could not read system prompt from {md_path}: {e}")
        return default_text

    start = content.find('"""')
    end = content.rfind('"""')
    if start != -1 and end > start:
        # An empty (or overlapping-delimiter) section must not produce an
        # empty prompt, so fall back to the default in that case.
        return content[start + 3:end].strip() or default_text

    kept_lines = [
        line for line in content.splitlines()
        if not line.strip().startswith('#')
    ]
    return '\n'.join(kept_lines).strip() or default_text
41
+
42
class TextilindoAI:
    """Answer helper backed by instruction/output examples loaded from JSONL.

    The examples are loaded once at construction time and are used to pick
    the entries most similar to a user query.
    """

    # Weights and cut-off used by find_relevant_context().
    _INSTRUCTION_WEIGHT = 0.7
    _OUTPUT_WEIGHT = 0.3
    _MIN_SCORE = 0.1

    def __init__(self):
        # A non-empty SYSTEM_PROMPT environment variable wins; otherwise the
        # prompt file (or the built-in text) is used.  Using `or` also avoids
        # reading the file when the variable is set, and keeps an empty
        # variable from producing an empty prompt.
        self.system_prompt = os.getenv('SYSTEM_PROMPT') or load_system_prompt(
            "You are Textilindo AI Assistant. Be concise, helpful, and use Indonesian."
        )
        # List of dict records parsed from the *.jsonl files.
        self.dataset = self.load_all_datasets()

    def load_all_datasets(self):
        """Load every ``*.jsonl`` file from the first data directory found.

        Candidate directories are tried in order; the first one that is an
        existing directory is used.  Blank lines are ignored, and lines that
        are not valid JSON objects are logged and skipped.

        Returns:
            A list of dict records (empty when nothing could be loaded).
        """
        dataset = []

        # Try multiple possible data directory paths
        possible_data_dirs = [
            "data",
            "./data",
            "/app/data",
            os.path.join(os.path.dirname(__file__), "data"),
        ]
        # isdir (not exists) so that a regular file named "data" is skipped.
        data_dir = next((d for d in possible_data_dirs if os.path.isdir(d)), None)
        if data_dir is None:
            logger.warning("No data directory found in any of the expected locations")
            return dataset
        logger.info(f"Found data directory: {data_dir}")

        try:
            # Sorted so the load order does not depend on the filesystem.
            filenames = sorted(os.listdir(data_dir))
        except OSError as e:
            logger.error(f"Error reading data directory {data_dir}: {e}")
            return dataset

        for filename in filenames:
            if not filename.endswith('.jsonl'):
                continue
            filepath = os.path.join(data_dir, filename)
            loaded = 0  # records contributed by this file only
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    for line_num, line in enumerate(f, 1):
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            record = json.loads(line)
                        except json.JSONDecodeError as e:
                            logger.warning(f"Invalid JSON in {filename} line {line_num}: {e}")
                            continue
                        if not isinstance(record, dict):
                            # Every consumer calls .get() on the records.
                            logger.warning(f"Skipping non-object record in {filename} line {line_num}")
                            continue
                        dataset.append(record)
                        loaded += 1
            except (OSError, UnicodeDecodeError) as e:
                # Records read before the failure are kept.
                logger.error(f"Error loading {filename}: {e}")
                continue
            logger.info(f"Loaded {filename}: {loaded} examples")

        logger.info(f"Total examples loaded: {len(dataset)}")
        return dataset

    @staticmethod
    def _field_text(example, key):
        """Return ``example[key]`` lower-cased, or ``''`` if it is not a string."""
        value = example.get(key) if isinstance(example, dict) else None
        return value.lower() if isinstance(value, str) else ''

    def find_relevant_context(self, user_query, top_k=3):
        """Return up to ``top_k`` dataset examples most similar to the query.

        Similarity is a weighted sum of the ``SequenceMatcher`` ratios of the
        lower-cased query against the example's ``instruction`` and
        ``output``.  Examples scoring at or below the cut-off are dropped.

        Args:
            user_query: The user's question.
            top_k: Maximum number of examples to return.

        Returns:
            A list of dataset records, best match first.
        """
        if not self.dataset or top_k <= 0:
            return []

        query = user_query.lower()  # loop-invariant
        scores = []
        for i, example in enumerate(self.dataset):
            instruction = self._field_text(example, 'instruction')
            output = self._field_text(example, 'output')

            instruction_score = SequenceMatcher(None, query, instruction).ratio()
            output_score = SequenceMatcher(None, query, output).ratio()
            combined_score = (
                (instruction_score * self._INSTRUCTION_WEIGHT)
                + (output_score * self._OUTPUT_WEIGHT)
            )
            scores.append((combined_score, i))

        # Highest score first; ties resolve to the higher dataset index.
        scores.sort(reverse=True)
        return [
            self.dataset[idx]
            for score, idx in scores[:top_k]
            if score > self._MIN_SCORE
        ]

    def create_context_prompt(self, user_query, relevant_examples):
        """Build a few-shot prompt from the examples followed by the query.

        Args:
            user_query: The user's question.
            relevant_examples: Records with ``instruction`` / ``output`` keys.

        Returns:
            ``user_query`` unchanged when there are no examples, otherwise
            the multi-line prompt text.
        """
        if not relevant_examples:
            return user_query

        context_parts = [
            "Berikut adalah beberapa contoh pertanyaan dan jawaban tentang Textilindo:",
            "",
        ]
        for i, example in enumerate(relevant_examples, 1):
            instruction = example.get('instruction', '')
            output = example.get('output', '')
            context_parts.extend([
                f"Contoh {i}:",
                f"Pertanyaan: {instruction}",
                f"Jawaban: {output}",
                "",
            ])
        context_parts.extend([
            "Berdasarkan contoh di atas, jawab pertanyaan berikut:",
            f"Pertanyaan: {user_query}",
            "Jawaban:",
        ])
        return "\n".join(context_parts)

    def chat(self, message, max_tokens=300, temperature=0.7):
        """Return a response dict for ``message``.

        Args:
            message: The user's question.
            max_tokens: Accepted for API compatibility; currently unused.
            temperature: Accepted for API compatibility; currently unused.

        Returns:
            A dict with ``success``, ``response``, ``context_used`` and
            ``relevant_examples_count``.
        """
        relevant_examples = self.find_relevant_context(message, 3)

        # For now, return a simple response.
        # In production, this would call your HF Space inference endpoint with
        # the prompt built by create_context_prompt(message, relevant_examples).
        response = f"Terima kasih atas pertanyaan Anda: {message}. Saya akan membantu Anda dengan informasi tentang Textilindo."

        return {
            "success": True,
            "response": response,
            "context_used": bool(relevant_examples),
            "relevant_examples_count": len(relevant_examples),
        }
167
+
168
+ # Initialize AI
169
+ ai = TextilindoAI()
170
+
171
@app.route('/health', methods=['GET'])
def health_check():
    """Report service status together with the size of the loaded dataset."""
    example_count = len(ai.dataset)
    payload = {
        "status": "healthy",
        "service": "Textilindo AI Assistant",
        "dataset_loaded": example_count > 0,
        "dataset_size": example_count,
    }
    return jsonify(payload)
180
+
181
@app.route('/chat', methods=['POST'])
def chat():
    """Main chat endpoint.

    Expects a JSON object body with a required string ``message`` and the
    optional ``max_tokens`` and ``temperature`` values, which are passed
    through to ``ai.chat``.

    Returns:
        The ``ai.chat`` result as JSON (HTTP 500 if it reports failure),
        HTTP 400 for a missing/invalid body or message, and HTTP 500 when
        processing raises.
    """
    # silent=True returns None for a missing, malformed or non-JSON body
    # instead of raising, so client mistakes are answered with 400, not 500.
    data = request.get_json(silent=True)
    if not data or not isinstance(data, dict):
        return jsonify({
            "success": False,
            "error": "No JSON data provided"
        }), 400

    message = data.get('message', '')
    if not isinstance(message, str) or not message.strip():
        return jsonify({
            "success": False,
            "error": "Message is required"
        }), 400
    message = message.strip()

    # Optional parameters
    max_tokens = data.get('max_tokens', 300)
    temperature = data.get('temperature', 0.7)

    try:
        result = ai.chat(message, max_tokens, temperature)
    except Exception:
        # Keep the traceback in the server log; do not leak internals to
        # the client.
        logger.exception("Error in chat endpoint")
        return jsonify({
            "success": False,
            "error": "Internal server error"
        }), 500

    if result.get("success"):
        return jsonify(result)
    return jsonify(result), 500
218
+
219
@app.route('/stats', methods=['GET'])
def get_stats():
    """Get dataset and system statistics.

    Counts the dataset examples per ``metadata.topic``; examples without a
    usable topic are counted under ``'unknown'``.

    Returns:
        JSON with the total example count, the per-topic counts and static
        system information, or HTTP 500 if building the response fails.
    """
    try:
        topics = {}
        for example in ai.dataset:
            metadata = example.get('metadata') if isinstance(example, dict) else None
            # metadata may be absent, null or not an object in the data files.
            topic = metadata.get('topic') if isinstance(metadata, dict) else None
            # Normalise to a string so the value is hashable and all JSON
            # keys share one type.
            topic = 'unknown' if topic is None else str(topic)
            topics[topic] = topics.get(topic, 0) + 1

        return jsonify({
            "success": True,
            "dataset": {
                "total_examples": len(ai.dataset),
                "topics": topics,
                "topics_count": len(topics)
            },
            "system": {
                "api_version": "1.0.0",
                "status": "operational"
            }
        })

    except Exception:
        # Keep the traceback in the server log; do not leak internals to
        # the client.
        logger.exception("Error in stats endpoint")
        return jsonify({
            "success": False,
            "error": "Internal server error"
        }), 500
248
+
249
@app.route('/', methods=['GET'])
def root():
    """Describe the API: service info, endpoints, /chat usage and dataset size."""
    endpoints = {
        "GET /": "API documentation (this endpoint)",
        "GET /health": "Health check",
        "POST /chat": "Chat with AI",
        "GET /stats": "Dataset and system statistics",
    }
    chat_body = {
        "message": "string (required)",
        "max_tokens": "integer (optional, default: 300)",
        "temperature": "float (optional, default: 0.7)",
    }
    chat_usage = {
        "method": "POST",
        "url": "/chat",
        "body": chat_body,
    }
    document = {
        "service": "Textilindo AI Assistant",
        "version": "1.0.0",
        "description": "AI-powered customer service for Textilindo",
        "endpoints": endpoints,
        "usage": {"chat": chat_usage},
        "dataset_size": len(ai.dataset),
    }
    return jsonify(document)
275
+
276
if __name__ == '__main__':
    # Script entry point: start the Flask server on all interfaces.
    logger.info("Starting Textilindo AI Assistant...")
    logger.info(f"Dataset loaded: {len(ai.dataset)} examples")

    # The port can be overridden with the PORT environment variable so the
    # app can run on hosts that expect a different port; 8080 stays the
    # default.  A non-numeric PORT fails loudly at startup.
    app.run(
        debug=False,
        host='0.0.0.0',
        port=int(os.getenv('PORT', '8080'))
    )
fix_dataset_loading.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Replace app.py with app_fixed.py, commit the change, and force-push the
# local lora-training branch to the main branch of the "hf" remote.

# Stop at the first failing command so that a failed commit or push is never
# followed by the "fix complete" message.
set -euo pipefail

echo "🔧 Fixing dataset loading issue..."

# Copy the fixed app
if [ -f "app_fixed.py" ]; then
    cp app_fixed.py app.py
    echo "✅ Updated app.py with fixed dataset loading"
else
    echo "❌ app_fixed.py not found" >&2
    exit 1
fi

# Add and commit changes
echo "📝 Committing dataset loading fix..."
git add app.py
# `git commit` exits non-zero when nothing is staged (e.g. on a re-run), so
# only commit when the index actually differs from HEAD.
if git diff --cached --quiet; then
    echo "No staged changes; skipping commit"
else
    git commit -m "Fix dataset loading - handle multiple data directory paths

- Try multiple possible data directory locations
- Add better error handling for JSON parsing
- Improve logging for dataset loading
- Handle different working directories on HF Spaces"
fi

# Push to Hugging Face Space
# NOTE: --force overwrites whatever is on the remote main branch.
echo "🚀 Pushing fix to Hugging Face Space..."
git push hf lora-training:main --force

echo "✅ Dataset loading fix complete!"
echo ""
echo "📋 Next steps:"
echo "1. Check your HF Space build status: https://huggingface.co/spaces/harismlnaslm/Textilindo"
echo "2. Test the /stats endpoint to verify dataset is loaded"
echo "3. Try the /chat endpoint with a question"