Spaces:

harismlnaslm
/

textilindo-ai-assistant

Build error

App Files Files Community

harismlnaslm committed on Oct 24

Commit

ef903ff

1 Parent(s): e80082d

Add fixed app and dataset loading script

Browse files

Files changed (2) hide show

app_fixed.py +284 -0
fix_dataset_loading.sh +33 -0

app_fixed.py ADDED Viewed

	@@ -0,0 +1,284 @@

+#!/usr/bin/env python3
+"""
+Textilindo AI Assistant - Hugging Face Spaces
+"""
+from flask import Flask, request, jsonify, render_template
+import os
+import json
+import requests
+from difflib import SequenceMatcher
+import logging
+# Setup logging: configure the root logger at INFO level and create this module's logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Flask application object; the @app.route decorators in this file register handlers on it
+app = Flask(__name__)
+def load_system_prompt(default_text):
+    """Load system prompt from configs/system_prompt.md if available"""
+    try:
+        base_dir = os.path.dirname(__file__)
+        md_path = os.path.join(base_dir, 'configs', 'system_prompt.md')
+        if not os.path.exists(md_path):
+            return default_text
+        with open(md_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        start = content.find('"""')
+        end = content.rfind('"""')
+        if start != -1 and end != -1 and end > start:
+            return content[start+3:end].strip()
+        lines = []
+        for line in content.splitlines():
+            if line.strip().startswith('#'):
+                continue
+            lines.append(line)
+        cleaned = '\n'.join(lines).strip()
+        return cleaned or default_text
+    except Exception:
+        return default_text
+class TextilindoAI:
+    def __init__(self):
+        self.system_prompt = os.getenv(
+            'SYSTEM_PROMPT',
+            load_system_prompt("You are Textilindo AI Assistant. Be concise, helpful, and use Indonesian.")
+        )
+        self.dataset = self.load_all_datasets()
+    def load_all_datasets(self):
+        """Load all available datasets"""
+        dataset = []
+        # Try multiple possible data directory paths
+        possible_data_dirs = [
+            "data",
+            "./data",
+            "/app/data",
+            os.path.join(os.path.dirname(__file__), "data")
+        ]
+        data_dir = None
+        for dir_path in possible_data_dirs:
+            if os.path.exists(dir_path):
+                data_dir = dir_path
+                logger.info(f"Found data directory: {data_dir}")
+                break
+        if not data_dir:
+            logger.warning("No data directory found in any of the expected locations")
+            return dataset
+        # Load all JSONL files
+        try:
+            for filename in os.listdir(data_dir):
+                if filename.endswith('.jsonl'):
+                    filepath = os.path.join(data_dir, filename)
+                    try:
+                        with open(filepath, 'r', encoding='utf-8') as f:
+                            for line_num, line in enumerate(f, 1):
+                                line = line.strip()
+                                if line:
+                                    try:
+                                        data = json.loads(line)
+                                        dataset.append(data)
+                                    except json.JSONDecodeError as e:
+                                        logger.warning(f"Invalid JSON in {filename} line {line_num}: {e}")
+                                        continue
+                        logger.info(f"Loaded {filename}: {len([d for d in dataset if d.get('instruction')])} examples")
+                    except Exception as e:
+                        logger.error(f"Error loading {filename}: {e}")
+        except Exception as e:
+            logger.error(f"Error reading data directory {data_dir}: {e}")
+        logger.info(f"Total examples loaded: {len(dataset)}")
+        return dataset
+    def find_relevant_context(self, user_query, top_k=3):
+        """Find most relevant examples from dataset"""
+        if not self.dataset:
+            return []
+        scores = []
+        for i, example in enumerate(self.dataset):
+            instruction = example.get('instruction', '').lower()
+            output = example.get('output', '').lower()
+            query = user_query.lower()
+            instruction_score = SequenceMatcher(None, query, instruction).ratio()
+            output_score = SequenceMatcher(None, query, output).ratio()
+            combined_score = (instruction_score * 0.7) + (output_score * 0.3)
+            scores.append((combined_score, i))
+        scores.sort(reverse=True)
+        relevant_examples = []
+        for score, idx in scores[:top_k]:
+            if score > 0.1:
+                relevant_examples.append(self.dataset[idx])
+        return relevant_examples
+    def create_context_prompt(self, user_query, relevant_examples):
+        """Create a prompt with relevant context"""
+        if not relevant_examples:
+            return user_query
+        context_parts = []
+        context_parts.append("Berikut adalah beberapa contoh pertanyaan dan jawaban tentang Textilindo:")
+        context_parts.append("")
+        for i, example in enumerate(relevant_examples, 1):
+            instruction = example.get('instruction', '')
+            output = example.get('output', '')
+            context_parts.append(f"Contoh {i}:")
+            context_parts.append(f"Pertanyaan: {instruction}")
+            context_parts.append(f"Jawaban: {output}")
+            context_parts.append("")
+        context_parts.append("Berdasarkan contoh di atas, jawab pertanyaan berikut:")
+        context_parts.append(f"Pertanyaan: {user_query}")
+        context_parts.append("Jawaban:")
+        return "\n".join(context_parts)
+    def chat(self, message, max_tokens=300, temperature=0.7):
+        """Generate response using Hugging Face Spaces"""
+        relevant_examples = self.find_relevant_context(message, 3)
+        if relevant_examples:
+            enhanced_prompt = self.create_context_prompt(message, relevant_examples)
+            context_used = True
+        else:
+            enhanced_prompt = message
+            context_used = False
+        # For now, return a simple response
+        # In production, this would call your HF Space inference endpoint
+        response = f"Terima kasih atas pertanyaan Anda: {message}. Saya akan membantu Anda dengan informasi tentang Textilindo."
+        return {
+            "success": True,
+            "response": response,
+            "context_used": context_used,
+            "relevant_examples_count": len(relevant_examples)
+        }
+# Initialize AI
+# Module-level instance shared by the route handlers below; constructing it
+# resolves the system prompt and loads the datasets at import time.
+ai = TextilindoAI()
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Health check endpoint"""
+    return jsonify({
+        "status": "healthy",
+        "service": "Textilindo AI Assistant",
+        "dataset_loaded": len(ai.dataset) > 0,
+        "dataset_size": len(ai.dataset)
+    })
+@app.route('/chat', methods=['POST'])
+def chat():
+    """Main chat endpoint"""
+    try:
+        data = request.get_json()
+        if not data:
+            return jsonify({
+                "success": False,
+                "error": "No JSON data provided"
+            }), 400
+        message = data.get('message', '').strip()
+        if not message:
+            return jsonify({
+                "success": False,
+                "error": "Message is required"
+            }), 400
+        # Optional parameters
+        max_tokens = data.get('max_tokens', 300)
+        temperature = data.get('temperature', 0.7)
+        # Process chat
+        result = ai.chat(message, max_tokens, temperature)
+        if result["success"]:
+            return jsonify(result)
+        else:
+            return jsonify(result), 500
+    except Exception as e:
+        logger.error(f"Error in chat endpoint: {e}")
+        return jsonify({
+            "success": False,
+            "error": f"Internal server error: {str(e)}"
+        }), 500
+@app.route('/stats', methods=['GET'])
+def get_stats():
+    """Get dataset and system statistics"""
+    try:
+        topics = {}
+        for example in ai.dataset:
+            metadata = example.get('metadata', {})
+            topic = metadata.get('topic', 'unknown')
+            topics[topic] = topics.get(topic, 0) + 1
+        return jsonify({
+            "success": True,
+            "dataset": {
+                "total_examples": len(ai.dataset),
+                "topics": topics,
+                "topics_count": len(topics)
+            },
+            "system": {
+                "api_version": "1.0.0",
+                "status": "operational"
+            }
+        })
+    except Exception as e:
+        logger.error(f"Error in stats endpoint: {e}")
+        return jsonify({
+            "success": False,
+            "error": f"Internal server error: {str(e)}"
+        }), 500
+@app.route('/', methods=['GET'])
+def root():
+    """API root endpoint with documentation"""
+    return jsonify({
+        "service": "Textilindo AI Assistant",
+        "version": "1.0.0",
+        "description": "AI-powered customer service for Textilindo",
+        "endpoints": {
+            "GET /": "API documentation (this endpoint)",
+            "GET /health": "Health check",
+            "POST /chat": "Chat with AI",
+            "GET /stats": "Dataset and system statistics"
+        },
+        "usage": {
+            "chat": {
+                "method": "POST",
+                "url": "/chat",
+                "body": {
+                    "message": "string (required)",
+                    "max_tokens": "integer (optional, default: 300)",
+                    "temperature": "float (optional, default: 0.7)"
+                }
+            }
+        },
+        "dataset_size": len(ai.dataset)
+    })
+if __name__ == '__main__':
+    logger.info("Starting Textilindo AI Assistant...")
+    logger.info(f"Dataset loaded: {len(ai.dataset)} examples")
+    app.run(
+        debug=False,
+        host='0.0.0.0',
+        port=8080
+    )

fix_dataset_loading.sh ADDED Viewed

	@@ -0,0 +1,33 @@

+#!/bin/bash
+# Promote app_fixed.py to app.py, commit it, and push it to the "hf" remote.
+# Abort on the first failing command so a failed commit or push is never
+# followed by the success banner.
+set -euo pipefail
+echo "🔧 Fixing dataset loading issue..."
+# Copy the fixed app
+if [ -f "app_fixed.py" ]; then
+    cp app_fixed.py app.py
+    echo "✅ Updated app.py with fixed dataset loading"
+else
+    echo "❌ app_fixed.py not found" >&2
+    exit 1
+fi
+# Add and commit changes
+echo "📝 Committing dataset loading fix..."
+git add app.py
+# git commit exits non-zero when nothing is staged; skip it in that case so a
+# re-run can still push.
+if git diff --cached --quiet; then
+    echo "ℹ️  Nothing to commit, app.py is already up to date"
+else
+    git commit -m "Fix dataset loading - handle multiple data directory paths
+- Try multiple possible data directory locations
+- Add better error handling for JSON parsing
+- Improve logging for dataset loading
+- Handle different working directories on HF Spaces"
+fi
+# Push to Hugging Face Space
+# NOTE(review): --force overwrites whatever is on the remote main branch;
+# confirm this is intended before running.
+echo "🚀 Pushing fix to Hugging Face Space..."
+git push hf lora-training:main --force
+echo "✅ Dataset loading fix complete!"
+echo ""
+echo "📋 Next steps:"
+echo "1. Check your HF Space build status: https://huggingface.co/spaces/harismlnaslm/Textilindo"
+echo "2. Test the /stats endpoint to verify dataset is loaded"
+echo "3. Try the /chat endpoint with a question"