Eleanor Zheng
committed on
Commit
·
7daf5b1
1
Parent(s):
e707907
Separate pages into different files.
Browse files- app.py +18 -510
- pages/about.py +79 -0
- pages/manage_documents.py +69 -0
- pages/search_uni.py +88 -0
- pages/upload_documents.py +70 -0
- styles.css +126 -0
- test_system.py +2 -2
- utils/display.py +79 -0
- rag_system.py → utils/rag_system.py +0 -0
- utils/translations.py +100 -0
app.py
CHANGED
@@ -1,9 +1,25 @@
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
from urllib.parse import urlparse, parse_qs
|
4 |
-
from rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
|
5 |
from datetime import datetime
|
6 |
import uuid
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
# Configure Streamlit page
|
9 |
st.set_page_config(
|
@@ -13,138 +29,6 @@ st.set_page_config(
|
|
13 |
initial_sidebar_state="expanded"
|
14 |
)
|
15 |
|
16 |
-
# Custom CSS - Dark theme compatible
|
17 |
-
st.markdown("""
|
18 |
-
<style>
|
19 |
-
.main-header {
|
20 |
-
text-align: center;
|
21 |
-
padding: 2rem 0;
|
22 |
-
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
23 |
-
color: white;
|
24 |
-
margin: -1rem -1rem 2rem -1rem;
|
25 |
-
border-radius: 10px;
|
26 |
-
box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
|
27 |
-
}
|
28 |
-
|
29 |
-
.stApp {
|
30 |
-
background: var(--background-color);
|
31 |
-
}
|
32 |
-
|
33 |
-
/* Dark theme compatible containers */
|
34 |
-
.query-result {
|
35 |
-
background: rgba(255, 255, 255, 0.05);
|
36 |
-
backdrop-filter: blur(10px);
|
37 |
-
border: 1px solid rgba(255, 255, 255, 0.1);
|
38 |
-
padding: 1.5rem;
|
39 |
-
border-radius: 15px;
|
40 |
-
margin: 1rem 0;
|
41 |
-
color: var(--text-color);
|
42 |
-
}
|
43 |
-
|
44 |
-
.source-doc {
|
45 |
-
background: rgba(31, 119, 180, 0.1);
|
46 |
-
backdrop-filter: blur(5px);
|
47 |
-
padding: 1rem;
|
48 |
-
border-left: 4px solid #1f77b4;
|
49 |
-
border-radius: 8px;
|
50 |
-
margin: 0.5rem 0;
|
51 |
-
color: var(--text-color);
|
52 |
-
}
|
53 |
-
|
54 |
-
.share-link {
|
55 |
-
background: rgba(46, 204, 113, 0.1);
|
56 |
-
backdrop-filter: blur(5px);
|
57 |
-
padding: 1rem;
|
58 |
-
border-radius: 10px;
|
59 |
-
border-left: 4px solid #2ecc71;
|
60 |
-
color: var(--text-color);
|
61 |
-
}
|
62 |
-
|
63 |
-
/* Model indicator boxes */
|
64 |
-
.model-info {
|
65 |
-
background: rgba(52, 152, 219, 0.15);
|
66 |
-
backdrop-filter: blur(10px);
|
67 |
-
padding: 15px;
|
68 |
-
border-radius: 12px;
|
69 |
-
border-left: 4px solid #3498db;
|
70 |
-
margin: 10px 0;
|
71 |
-
}
|
72 |
-
|
73 |
-
/* Language selection enhancement */
|
74 |
-
.language-selection {
|
75 |
-
background: rgba(155, 89, 182, 0.1);
|
76 |
-
backdrop-filter: blur(10px);
|
77 |
-
padding: 15px;
|
78 |
-
border-radius: 12px;
|
79 |
-
border-left: 4px solid #9b59b6;
|
80 |
-
margin: 10px 0;
|
81 |
-
}
|
82 |
-
|
83 |
-
/* Upload area enhancement */
|
84 |
-
.stFileUploader {
|
85 |
-
background: rgba(230, 126, 34, 0.1);
|
86 |
-
backdrop-filter: blur(10px);
|
87 |
-
padding: 20px;
|
88 |
-
border-radius: 15px;
|
89 |
-
border: 2px dashed #e67e22;
|
90 |
-
}
|
91 |
-
|
92 |
-
.stFileUploader label {
|
93 |
-
font-size: 1.2rem;
|
94 |
-
font-weight: bold;
|
95 |
-
color: var(--text-color);
|
96 |
-
}
|
97 |
-
|
98 |
-
/* Button enhancements */
|
99 |
-
.stButton > button {
|
100 |
-
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
101 |
-
color: white;
|
102 |
-
border: none;
|
103 |
-
border-radius: 10px;
|
104 |
-
padding: 0.6rem 1.5rem;
|
105 |
-
font-weight: 600;
|
106 |
-
transition: all 0.3s ease;
|
107 |
-
box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
|
108 |
-
}
|
109 |
-
|
110 |
-
.stButton > button:hover {
|
111 |
-
transform: translateY(-2px);
|
112 |
-
box-shadow: 0 6px 20px 0 rgba(31, 38, 135, 0.5);
|
113 |
-
}
|
114 |
-
|
115 |
-
/* Sidebar enhancements */
|
116 |
-
.css-1d391kg {
|
117 |
-
background: rgba(255, 255, 255, 0.02);
|
118 |
-
backdrop-filter: blur(10px);
|
119 |
-
}
|
120 |
-
|
121 |
-
/* Info boxes */
|
122 |
-
.stInfo {
|
123 |
-
background: rgba(52, 152, 219, 0.1);
|
124 |
-
backdrop-filter: blur(10px);
|
125 |
-
border-left: 4px solid #3498db;
|
126 |
-
}
|
127 |
-
|
128 |
-
.stSuccess {
|
129 |
-
background: rgba(46, 204, 113, 0.1);
|
130 |
-
backdrop-filter: blur(10px);
|
131 |
-
border-left: 4px solid #2ecc71;
|
132 |
-
}
|
133 |
-
|
134 |
-
.stWarning {
|
135 |
-
background: rgba(241, 196, 15, 0.1);
|
136 |
-
backdrop-filter: blur(10px);
|
137 |
-
border-left: 4px solid #f1c40f;
|
138 |
-
}
|
139 |
-
|
140 |
-
.stError {
|
141 |
-
background: rgba(231, 76, 60, 0.1);
|
142 |
-
backdrop-filter: blur(10px);
|
143 |
-
border-left: 4px solid #e74c3c;
|
144 |
-
}
|
145 |
-
</style>
|
146 |
-
""", unsafe_allow_html=True)
|
147 |
-
|
148 |
def main():
|
149 |
# Check for shared query in URL
|
150 |
query_params = st.query_params
|
@@ -182,388 +66,12 @@ def main():
|
|
182 |
upload_documents_page()
|
183 |
elif st.session_state.current_page == "🗂 Manage Documents":
|
184 |
manage_documents_page()
|
185 |
-
elif st.session_state.current_page == "ℹ️ About":
|
186 |
about_page()
|
187 |
else:
|
188 |
search_page()
|
189 |
|
190 |
|
191 |
-
def upload_documents_page():
|
192 |
-
st.header("📄 Upload University Documents")
|
193 |
-
st.write("Upload PDF documents containing university admission requirements, fees, and program information.")
|
194 |
-
|
195 |
-
col1, col2 = st.columns(2)
|
196 |
-
|
197 |
-
with col1:
|
198 |
-
university_name = st.text_input("🏫 University Name", placeholder="e.g., National University of Singapore")
|
199 |
-
country = st.selectbox(
|
200 |
-
"🌏 Country",
|
201 |
-
["", "Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei", "Cambodia", "Laos", "Myanmar"]
|
202 |
-
)
|
203 |
-
|
204 |
-
with col2:
|
205 |
-
document_type = st.selectbox(
|
206 |
-
"📋 Document Type",
|
207 |
-
["admission_requirements", "tuition_fees", "program_catalog", "application_guide", "scholarship_info"]
|
208 |
-
)
|
209 |
-
language = st.selectbox(
|
210 |
-
"🌐 Primary Language",
|
211 |
-
["English", "Chinese", "Malay", "Thai", "Indonesian", "Vietnamese", "Filipino", "Other"]
|
212 |
-
)
|
213 |
-
|
214 |
-
# File upload
|
215 |
-
uploaded_files = st.file_uploader(
|
216 |
-
"Choose PDF files",
|
217 |
-
accept_multiple_files=True,
|
218 |
-
type=['pdf'],
|
219 |
-
help="Select one or more PDF files to upload"
|
220 |
-
)
|
221 |
-
|
222 |
-
if uploaded_files and st.button("🚀 Process Documents", type="primary"):
|
223 |
-
if not university_name or not country:
|
224 |
-
st.error("Please provide university name and country.")
|
225 |
-
return
|
226 |
-
|
227 |
-
with st.spinner("Processing documents... This may take a few minutes."):
|
228 |
-
try:
|
229 |
-
# Initialize document ingestion
|
230 |
-
doc_ingestion = DocumentIngestion()
|
231 |
-
|
232 |
-
# Process documents
|
233 |
-
documents = doc_ingestion.process_documents(
|
234 |
-
uploaded_files, university_name, country, document_type
|
235 |
-
)
|
236 |
-
|
237 |
-
if documents:
|
238 |
-
# Create or update vector store
|
239 |
-
vectorstore = doc_ingestion.create_vector_store(documents)
|
240 |
-
|
241 |
-
if vectorstore:
|
242 |
-
st.success(f"✅ Successfully processed {len(documents)} documents!")
|
243 |
-
st.info(f"Documents from {university_name} ({country}) have been added to the knowledge base.")
|
244 |
-
|
245 |
-
# Show processed files
|
246 |
-
with st.expander("📋 Processed Files"):
|
247 |
-
for doc in documents:
|
248 |
-
st.write(f"• **{doc.metadata['source']}**")
|
249 |
-
st.write(f" - University: {doc.metadata['university']}")
|
250 |
-
st.write(f" - Country: {doc.metadata['country']}")
|
251 |
-
st.write(f" - Type: {doc.metadata['document_type']}")
|
252 |
-
st.write("---")
|
253 |
-
else:
|
254 |
-
st.error("No documents were successfully processed.")
|
255 |
-
|
256 |
-
except Exception as e:
|
257 |
-
st.error(f"Error processing documents: {str(e)}")
|
258 |
-
|
259 |
-
def manage_documents_page():
|
260 |
-
st.header("🗂 Manage Documents in Database")
|
261 |
-
st.write("View and delete documents currently stored in the Chroma vector database.")
|
262 |
-
|
263 |
-
from rag_system import DocumentIngestion
|
264 |
-
doc_ingestion = DocumentIngestion()
|
265 |
-
vectorstore = doc_ingestion.load_existing_vectorstore()
|
266 |
-
|
267 |
-
if not vectorstore:
|
268 |
-
st.warning("No vector store found. Upload documents first.")
|
269 |
-
return
|
270 |
-
|
271 |
-
# Get all documents (chunks) in the vectorstore
|
272 |
-
try:
|
273 |
-
# Chroma stores documents as chunks, but we want to show original metadata
|
274 |
-
# We'll group by file_id to show unique documents
|
275 |
-
collection = vectorstore._collection
|
276 |
-
all_docs = collection.get(include=["metadatas", "documents"]) # Removed 'ids'
|
277 |
-
metadatas = all_docs["metadatas"]
|
278 |
-
ids = all_docs["ids"] # ids are always returned
|
279 |
-
documents = all_docs["documents"]
|
280 |
-
|
281 |
-
# Group by file_id
|
282 |
-
doc_map = {}
|
283 |
-
for meta, doc_id, doc_text in zip(metadatas, ids, documents):
|
284 |
-
file_id = meta.get("file_id", doc_id)
|
285 |
-
if file_id not in doc_map:
|
286 |
-
doc_map[file_id] = {
|
287 |
-
"source": meta.get("source", "Unknown"),
|
288 |
-
"university": meta.get("university", "Unknown"),
|
289 |
-
"country": meta.get("country", "Unknown"),
|
290 |
-
"document_type": meta.get("document_type", "Unknown"),
|
291 |
-
"upload_timestamp": meta.get("upload_timestamp", "Unknown"),
|
292 |
-
"file_id": file_id,
|
293 |
-
"chunks": []
|
294 |
-
}
|
295 |
-
doc_map[file_id]["chunks"].append(doc_text)
|
296 |
-
|
297 |
-
if not doc_map:
|
298 |
-
st.info("No documents found in the database.")
|
299 |
-
return
|
300 |
-
|
301 |
-
st.subheader("Current Documents:")
|
302 |
-
for file_id, info in doc_map.items():
|
303 |
-
with st.expander(f"{info['source']} ({info['university']}, {info['country']})"):
|
304 |
-
st.write(f"**Type:** {info['document_type']}")
|
305 |
-
st.write(f"**Uploaded:** {info['upload_timestamp']}")
|
306 |
-
st.write(f"**File ID:** {file_id}")
|
307 |
-
st.write(f"**Chunks:** {len(info['chunks'])}")
|
308 |
-
if st.button(f"🗑️ Delete Document", key=f"del_{file_id}"):
|
309 |
-
# Delete all chunks with this file_id
|
310 |
-
ids_to_delete = [doc_id for meta, doc_id in zip(metadatas, ids) if meta.get("file_id", doc_id) == file_id]
|
311 |
-
vectorstore._collection.delete(ids=ids_to_delete)
|
312 |
-
st.success(f"Deleted document: {info['source']}")
|
313 |
-
st.rerun()
|
314 |
-
|
315 |
-
# Add Delete All button
|
316 |
-
if doc_map:
|
317 |
-
if st.button("🗑️ Delete All Documents", key="del_all_docs", type="secondary"):
|
318 |
-
all_ids = list(ids)
|
319 |
-
vectorstore._collection.delete(ids=all_ids)
|
320 |
-
st.success("All documents deleted.")
|
321 |
-
st.rerun()
|
322 |
-
except Exception as e:
|
323 |
-
st.error(f"Error loading documents: {str(e)}")
|
324 |
-
|
325 |
-
def search_page():
|
326 |
-
st.header("🔍 Search University Information")
|
327 |
-
|
328 |
-
# --- Language selection ---
|
329 |
-
col1, col2 = st.columns([3, 1])
|
330 |
-
with col1:
|
331 |
-
st.write("Ask about admissions, fees, scholarships, and programs.")
|
332 |
-
with col2:
|
333 |
-
response_language = st.selectbox(
|
334 |
-
"Language",
|
335 |
-
["English", "中文 (Chinese)", "Bahasa Malaysia", "ไทย (Thai)",
|
336 |
-
"Bahasa Indonesia", "Tiếng Việt (Vietnamese)"],
|
337 |
-
key="response_language"
|
338 |
-
)
|
339 |
-
|
340 |
-
language_map = {
|
341 |
-
"English": "English",
|
342 |
-
"中文 (Chinese)": "Chinese",
|
343 |
-
"Bahasa Malaysia": "Malay",
|
344 |
-
"ไทย (Thai)": "Thai",
|
345 |
-
"Bahasa Indonesia": "Indonesian",
|
346 |
-
"Tiếng Việt (Vietnamese)": "Vietnamese"
|
347 |
-
}
|
348 |
-
selected_lang = language_map[response_language]
|
349 |
-
|
350 |
-
if selected_lang != "English":
|
351 |
-
st.info(f"🌐 Responses will be in **{selected_lang}**")
|
352 |
-
|
353 |
-
# --- Query input ---
|
354 |
-
query = st.text_area(
|
355 |
-
"Your question:",
|
356 |
-
height=80,
|
357 |
-
placeholder="e.g., Master's in Malaysia under 40,000 RMB/year",
|
358 |
-
)
|
359 |
-
|
360 |
-
# --- Example queries ---
|
361 |
-
with st.expander("💡 See Example Queries"):
|
362 |
-
tab1, tab2 = st.tabs(["🧠 Complex Queries", "⚡ Simple Queries"])
|
363 |
-
|
364 |
-
complex_examples = [
|
365 |
-
"Show me universities in Malaysia for master's degrees with tuition under 40,000 RMB per year",
|
366 |
-
"专科毕业,无雅思,想在马来西亚读硕士,学费不超过4万人民币/年",
|
367 |
-
"Compare engineering programs in Thailand and Singapore under $15,000 per year",
|
368 |
-
"Find MBA programs in ASEAN with GMAT requirements and scholarships available"
|
369 |
-
]
|
370 |
-
simple_examples = [
|
371 |
-
"What does IELTS stand for?",
|
372 |
-
"Translate 'application deadline' to Chinese",
|
373 |
-
"What is the difference between bachelor and master degree?",
|
374 |
-
"How to say 'university' in Thai?"
|
375 |
-
]
|
376 |
-
|
377 |
-
for ex in complex_examples:
|
378 |
-
if tab1.button(ex, key=f"complex_{hash(ex)}"):
|
379 |
-
st.session_state.example_query = ex
|
380 |
-
|
381 |
-
for ex in simple_examples:
|
382 |
-
if tab2.button(ex, key=f"simple_{hash(ex)}"):
|
383 |
-
st.session_state.example_query = ex
|
384 |
-
|
385 |
-
# --- Use clicked example ---
|
386 |
-
if 'example_query' in st.session_state:
|
387 |
-
query = st.session_state.example_query
|
388 |
-
st.info(f"📝 Using example: {query}")
|
389 |
-
del st.session_state.example_query
|
390 |
-
# Optionally auto-trigger search
|
391 |
-
|
392 |
-
# --- Optional filters ---
|
393 |
-
with st.expander("🔧 Advanced Filters"):
|
394 |
-
col1, col2, col3 = st.columns(3)
|
395 |
-
budget_range = col1.select_slider(
|
396 |
-
"Budget (USD/year)",
|
397 |
-
options=["Any", "<10k", "10k-20k", "20k-30k", "30k-40k", ">40k"],
|
398 |
-
value="Any"
|
399 |
-
)
|
400 |
-
study_level = col2.multiselect(
|
401 |
-
"Study Level", ["Diploma", "Bachelor", "Master", "PhD"]
|
402 |
-
)
|
403 |
-
preferred_countries = col3.multiselect(
|
404 |
-
"Preferred Countries",
|
405 |
-
["Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei"]
|
406 |
-
)
|
407 |
-
|
408 |
-
# --- Search button ---
|
409 |
-
if st.button("🔍 Search", type="primary", disabled=not query.strip()):
|
410 |
-
st.success("Searching...") # Placeholder for your RAGSystem logic
|
411 |
-
|
412 |
-
def display_query_result(result, show_share_link=False):
|
413 |
-
"""Display query results in a formatted way."""
|
414 |
-
st.markdown('<div class="query-result">', unsafe_allow_html=True)
|
415 |
-
|
416 |
-
# Show which model was used
|
417 |
-
if result.get("model_used"):
|
418 |
-
st.info(f"🤖 **Model Used:** {result['model_used']}")
|
419 |
-
|
420 |
-
st.subheader("🎯 Answer")
|
421 |
-
st.write(result["answer"])
|
422 |
-
|
423 |
-
# Share link
|
424 |
-
if show_share_link and result.get("query_id"):
|
425 |
-
st.markdown("---")
|
426 |
-
current_url = st.get_option("browser.serverAddress") or "localhost:8501"
|
427 |
-
share_url = f"http://{current_url}?share={result['query_id']}"
|
428 |
-
st.markdown(f"""
|
429 |
-
<div class="share-link">
|
430 |
-
<strong>🔗 Share this result:</strong><br>
|
431 |
-
<code>{share_url}</code>
|
432 |
-
</div>
|
433 |
-
""", unsafe_allow_html=True)
|
434 |
-
|
435 |
-
if st.button("📋 Copy Share Link"):
|
436 |
-
st.code(share_url)
|
437 |
-
|
438 |
-
# Source documents
|
439 |
-
if result.get("source_documents"):
|
440 |
-
st.markdown("---")
|
441 |
-
st.subheader("📚 Sources")
|
442 |
-
for i, doc in enumerate(result["source_documents"], 1):
|
443 |
-
with st.expander(f"Source {i}: {doc.metadata.get('source', 'Unknown')}"):
|
444 |
-
col1, col2 = st.columns([1, 2])
|
445 |
-
with col1:
|
446 |
-
st.write(f"**University:** {doc.metadata.get('university', 'Unknown')}")
|
447 |
-
st.write(f"**Country:** {doc.metadata.get('country', 'Unknown')}")
|
448 |
-
st.write(f"**Type:** {doc.metadata.get('document_type', 'Unknown')}")
|
449 |
-
with col2:
|
450 |
-
st.write("**Relevant Content:**")
|
451 |
-
content_preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
|
452 |
-
st.write(content_preview)
|
453 |
-
|
454 |
-
st.markdown('</div>', unsafe_allow_html=True)
|
455 |
-
|
456 |
-
def display_shared_query(query_id):
|
457 |
-
"""Display a shared query result."""
|
458 |
-
st.header("🔗 Shared Query Result")
|
459 |
-
|
460 |
-
result_data = load_shared_query(query_id)
|
461 |
-
|
462 |
-
if result_data:
|
463 |
-
st.info(f"**Original Question:** {result_data['question']}")
|
464 |
-
st.write(f"**Language:** {result_data['language']}")
|
465 |
-
st.write(f"**Date:** {result_data['timestamp'][:10]}")
|
466 |
-
|
467 |
-
# Create a mock result object for display
|
468 |
-
mock_result = {
|
469 |
-
"answer": result_data["answer"],
|
470 |
-
"source_documents": [
|
471 |
-
type('MockDoc', (), {
|
472 |
-
'metadata': source,
|
473 |
-
'page_content': source.get('content_preview', '')
|
474 |
-
})() for source in result_data.get('sources', [])
|
475 |
-
]
|
476 |
-
}
|
477 |
-
|
478 |
-
display_query_result(mock_result, show_share_link=False)
|
479 |
-
|
480 |
-
if st.button("🔍 Ask Your Own Question"):
|
481 |
-
st.experimental_set_query_params()
|
482 |
-
st.experimental_rerun()
|
483 |
-
else:
|
484 |
-
st.error("❌ Shared query not found or has expired.")
|
485 |
-
if st.button("🏠 Go to Home"):
|
486 |
-
st.experimental_set_query_params()
|
487 |
-
st.experimental_rerun()
|
488 |
-
|
489 |
-
def about_page():
|
490 |
-
st.header("ℹ️ About PanSea University Search")
|
491 |
-
|
492 |
-
col1, col2 = st.columns([2, 1])
|
493 |
-
|
494 |
-
with col1:
|
495 |
-
st.markdown("""
|
496 |
-
### 🎯 Problem We Solve
|
497 |
-
|
498 |
-
Prospective students worldwide seeking to study abroad face difficulty finding accurate, up-to-date university admission requirements. Information is scattered across PDFs, brochures, and outdated agency websites. Many waste time applying to unsuitable programs due to missing criteria and pay high agent fees.
|
499 |
-
|
500 |
-
### 💡 Our Solution
|
501 |
-
|
502 |
-
PanSea is an LLM-powered, RAG-based study search platform powered by **SEA-LION models** that ingests official admissions documents from ASEAN universities. Students can query in any ASEAN language and receive:
|
503 |
-
|
504 |
-
- 📋 **Ranked program matches** with detailed requirements
|
505 |
-
- 💰 **Tuition fees and costs**
|
506 |
-
- 📅 **Application deadlines and windows**
|
507 |
-
- 🎓 **Entry requirements and prerequisites**
|
508 |
-
- 📖 **Source citations** from official documents
|
509 |
-
|
510 |
-
### 🤖 AI Models Used
|
511 |
-
|
512 |
-
- **SEA-LION v3.5 Reasoning Model**: For complex university search queries requiring multi-step reasoning
|
513 |
-
- **SEA-LION v3 Instruct Model**: For translation and simple question-answering
|
514 |
-
- **Automatic Model Selection**: The system intelligently chooses the appropriate model based on query complexity
|
515 |
-
|
516 |
-
### 🌏 Supported Languages
|
517 |
-
|
518 |
-
- English
|
519 |
-
- 中文 (Chinese)
|
520 |
-
- Bahasa Malaysia
|
521 |
-
- ไทย (Thai)
|
522 |
-
- Bahasa Indonesia
|
523 |
-
- Tiếng Việt (Vietnamese)
|
524 |
-
- Filipino
|
525 |
-
|
526 |
-
### 🔧 How It Works
|
527 |
-
|
528 |
-
1. **📄 Document Ingestion**: Upload official PDF documents from universities
|
529 |
-
2. **🔍 AI Processing**: Our system processes and indexes the content
|
530 |
-
3. **❓ Natural Language Queries**: Ask questions in your preferred language
|
531 |
-
4. **🎯 Intelligent Answers**: Get relevant, sourced responses
|
532 |
-
5. **🔗 Share Results**: Generate shareable links for your queries
|
533 |
-
""")
|
534 |
-
|
535 |
-
with col2:
|
536 |
-
st.markdown("""
|
537 |
-
### 📊 Features
|
538 |
-
|
539 |
-
✅ **Multi-language support**
|
540 |
-
✅ **PDF document ingestion**
|
541 |
-
✅ **Intelligent search & retrieval**
|
542 |
-
✅ **Source citations**
|
543 |
-
✅ **Shareable query results**
|
544 |
-
✅ **Advanced filtering**
|
545 |
-
✅ **Real-time processing**
|
546 |
-
|
547 |
-
### 🏛️ Target Universities
|
548 |
-
|
549 |
-
- 🇸🇬 Singapore
|
550 |
-
- 🇲🇾 Malaysia
|
551 |
-
- 🇹🇭 Thailand
|
552 |
-
- 🇮🇩 Indonesia
|
553 |
-
- 🇵🇭 Philippines
|
554 |
-
- 🇻🇳 Vietnam
|
555 |
-
- 🇧🇳 Brunei
|
556 |
-
- 🇰🇭 Cambodia
|
557 |
-
- 🇱🇦 Laos
|
558 |
-
- 🇲🇲 Myanmar
|
559 |
-
|
560 |
-
### 🚀 Get Started
|
561 |
-
|
562 |
-
1. Go to **Upload Documents** to add university PDFs
|
563 |
-
2. Use **Search Universities** to ask questions
|
564 |
-
3. Share your results with others!
|
565 |
-
""")
|
566 |
-
|
567 |
if __name__ == "__main__":
|
568 |
# Check if SEA-LION API key is set
|
569 |
if not os.getenv("SEA_LION_API_KEY"):
|
|
|
1 |
import streamlit as st
|
2 |
import os
|
3 |
from urllib.parse import urlparse, parse_qs
|
4 |
+
from utils.rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
|
5 |
from datetime import datetime
|
6 |
import uuid
|
7 |
+
from utils.translations import translations
|
8 |
+
from pathlib import Path
|
9 |
+
from pages.search_uni import search_page
|
10 |
+
from pages.upload_documents import upload_documents_page
|
11 |
+
from pages.manage_documents import manage_documents_page
|
12 |
+
from pages.about import about_page
|
13 |
+
from utils.display import display_shared_query
|
14 |
+
|
15 |
+
# Load external CSS
|
16 |
+
def load_css(file_name):
    """Inject the contents of a CSS file into the current Streamlit page.

    The stylesheet is wrapped in a ``<style>`` tag and emitted through
    ``st.markdown`` with ``unsafe_allow_html=True``.

    Args:
        file_name: Path (string or path-like) of the stylesheet to load.

    Returns:
        None. When ``file_name`` does not point to a regular file, nothing is
        rendered and the function returns silently.

    NOTE(review): this issues a Streamlit command (``st.markdown``). Streamlit
    versions that require ``st.set_page_config`` to be the first command on a
    page would need this to be called after it -- confirm against the
    Streamlit version in use.
    """
    css_file = Path(file_name)
    # is_file() (rather than exists()) also skips a directory that happens to
    # share the name, which the subsequent read would otherwise fail on.
    if not css_file.is_file():
        return
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    css = css_file.read_text(encoding="utf-8")
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
|
21 |
+
|
22 |
+
load_css("styles.css")
|
23 |
|
24 |
# Configure Streamlit page
|
25 |
st.set_page_config(
|
|
|
29 |
initial_sidebar_state="expanded"
|
30 |
)
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
def main():
|
33 |
# Check for shared query in URL
|
34 |
query_params = st.query_params
|
|
|
66 |
upload_documents_page()
|
67 |
elif st.session_state.current_page == "🗂 Manage Documents":
|
68 |
manage_documents_page()
|
69 |
+
elif st.session_state.current_page == "ℹ️ About Top.Edu":
|
70 |
about_page()
|
71 |
else:
|
72 |
search_page()
|
73 |
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
if __name__ == "__main__":
|
76 |
# Check if SEA-LION API key is set
|
77 |
if not os.getenv("SEA_LION_API_KEY"):
|
pages/about.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def about_page():
    """Render the About page.

    Shows a page header followed by two columns of static Markdown laid out
    with a 2:1 width ratio: the wider column carries the project overview,
    the narrower one the feature list, target countries and quick-start steps.
    """
    st.header("ℹ️ About PanSea University Search")

    overview_col, summary_col = st.columns([2, 1])

    # Wide column: problem statement, solution, models, languages, workflow.
    overview_col.markdown("""
        ### 🎯 Problem We Solve

        Prospective students worldwide seeking to study abroad face difficulty finding accurate, up-to-date university admission requirements. Information is scattered across PDFs, brochures, and outdated agency websites. Many waste time applying to unsuitable programs due to missing criteria and pay high agent fees.

        ### 💡 Our Solution

        PanSea is an LLM-powered, RAG-based study search platform powered by **SEA-LION models** that ingests official admissions documents from ASEAN universities. Students can query in any ASEAN language and receive:

        - 📋 **Ranked program matches** with detailed requirements
        - 💰 **Tuition fees and costs**
        - 📅 **Application deadlines and windows**
        - 🎓 **Entry requirements and prerequisites**
        - 📖 **Source citations** from official documents

        ### 🤖 AI Models Used

        - **SEA-LION v3.5 Reasoning Model**: For complex university search queries requiring multi-step reasoning
        - **SEA-LION v3 Instruct Model**: For translation and simple question-answering
        - **Automatic Model Selection**: The system intelligently chooses the appropriate model based on query complexity

        ### 🌏 Supported Languages

        - English
        - 中文 (Chinese)
        - Bahasa Malaysia
        - ไทย (Thai)
        - Bahasa Indonesia
        - Tiếng Việt (Vietnamese)
        - Filipino

        ### 🔧 How It Works

        1. **📄 Document Ingestion**: Upload official PDF documents from universities
        2. **🔍 AI Processing**: Our system processes and indexes the content
        3. **❓ Natural Language Queries**: Ask questions in your preferred language
        4. **🎯 Intelligent Answers**: Get relevant, sourced responses
        5. **🔗 Share Results**: Generate shareable links for your queries
        """)

    # Narrow column: feature checklist, covered countries, getting started.
    summary_col.markdown("""
        ### 📊 Features

        ✅ **Multi-language support**
        ✅ **PDF document ingestion**
        ✅ **Intelligent search & retrieval**
        ✅ **Source citations**
        ✅ **Shareable query results**
        ✅ **Advanced filtering**
        ✅ **Real-time processing**

        ### 🏛️ Target Universities

        - 🇸🇬 Singapore
        - 🇲🇾 Malaysia
        - 🇹🇭 Thailand
        - 🇮🇩 Indonesia
        - 🇵🇭 Philippines
        - 🇻🇳 Vietnam
        - 🇧🇳 Brunei
        - 🇰🇭 Cambodia
        - 🇱🇦 Laos
        - 🇲🇲 Myanmar

        ### 🚀 Get Started

        1. Go to **Upload Documents** to add university PDFs
        2. Use **Search Universities** to ask questions
        3. Share your results with others!
        """)
|
pages/manage_documents.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from utils.rag_system import DocumentIngestion
|
3 |
+
|
4 |
+
|
5 |
+
def _group_chunks_by_file(metadatas, ids, documents):
    """Group vector-store chunks into one summary entry per uploaded file.

    Chunks are keyed by the ``file_id`` value in their metadata; a chunk with
    no ``file_id`` falls back to its own chunk id and is therefore listed as a
    document of its own.

    Args:
        metadatas: Per-chunk metadata mappings (entries may be ``None``).
        ids: Per-chunk ids, parallel to ``metadatas``.
        documents: Per-chunk text, parallel to ``metadatas``.

    Returns:
        Dict mapping file id to a dict with the display fields (``source``,
        ``university``, ``country``, ``document_type``, ``upload_timestamp``,
        ``file_id``), the chunk texts (``chunks``) and the ids of those chunks
        (``chunk_ids``). Display fields come from the first chunk seen for
        each file.
    """
    doc_map = {}
    for meta, doc_id, doc_text in zip(metadatas, ids, documents):
        # Tolerate a missing (None) metadata entry instead of raising
        # AttributeError on .get().
        meta = meta or {}
        file_id = meta.get("file_id", doc_id)
        if file_id not in doc_map:
            doc_map[file_id] = {
                "source": meta.get("source", "Unknown"),
                "university": meta.get("university", "Unknown"),
                "country": meta.get("country", "Unknown"),
                "document_type": meta.get("document_type", "Unknown"),
                "upload_timestamp": meta.get("upload_timestamp", "Unknown"),
                "file_id": file_id,
                "chunks": [],
                "chunk_ids": [],
            }
        doc_map[file_id]["chunks"].append(doc_text)
        # Remember which chunk ids belong to the file so a delete does not
        # have to rescan every chunk in the collection.
        doc_map[file_id]["chunk_ids"].append(doc_id)
    return doc_map


def manage_documents_page():
    """Render the page that lists stored documents and lets the user delete them.

    Loads the existing vector store through ``DocumentIngestion``, groups its
    chunks by ``file_id`` and shows one expander per document with a delete
    button, plus a button that deletes every chunk. After a deletion the
    script is rerun so the listing reflects the new state. Any exception
    raised while reading or deleting is reported with ``st.error``.
    """
    st.header("🗂 Manage Documents in Database")
    st.write("View and delete documents currently stored in the Chroma vector database.")

    doc_ingestion = DocumentIngestion()
    vectorstore = doc_ingestion.load_existing_vectorstore()

    if not vectorstore:
        st.warning("No vector store found. Upload documents first.")
        return

    try:
        # The store holds chunks, not whole files; the underlying collection
        # is read directly so chunks can be regrouped per uploaded file.
        collection = vectorstore._collection
        all_docs = collection.get(include=["metadatas", "documents"])
        ids = all_docs["ids"]  # read alongside the included fields
        doc_map = _group_chunks_by_file(
            all_docs["metadatas"], ids, all_docs["documents"]
        )

        if not doc_map:
            st.info("No documents found in the database.")
            return

        st.subheader("Current Documents:")
        for file_id, info in doc_map.items():
            with st.expander(f"{info['source']} ({info['university']}, {info['country']})"):
                st.write(f"**Type:** {info['document_type']}")
                st.write(f"**Uploaded:** {info['upload_timestamp']}")
                st.write(f"**File ID:** {file_id}")
                st.write(f"**Chunks:** {len(info['chunks'])}")
                # The key must be unique per document because every button
                # shares the same label.
                if st.button("🗑️ Delete Document", key=f"del_{file_id}"):
                    collection.delete(ids=info["chunk_ids"])
                    st.success(f"Deleted document: {info['source']}")
                    st.rerun()

        # doc_map is known to be non-empty here (see the early return above).
        if st.button("🗑️ Delete All Documents", key="del_all_docs", type="secondary"):
            collection.delete(ids=list(ids))
            st.success("All documents deleted.")
            st.rerun()
    except Exception as e:
        st.error(f"Error loading documents: {str(e)}")
|
pages/search_uni.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
def search_page():
    """Render the "Search University Information" page.

    Top to bottom the page shows: a response-language selector, the
    free-text question box, clickable example queries, optional advanced
    filters and the search button. The search itself is still a
    placeholder that only displays a "Searching..." message.

    Clicking an example copies it into the question box through a button
    callback. The text therefore survives the rerun caused by the later
    "Search" click instead of being lost after a single script run.
    """
    st.header("🔍 Search University Information")

    # --- Language selection ---
    col1, col2 = st.columns([3, 1])
    with col1:
        st.write("Ask about admissions, fees, scholarships, and programs.")
    with col2:
        response_language = st.selectbox(
            "Language",
            ["English", "中文 (Chinese)", "Bahasa Malaysia", "ไทย (Thai)",
             "Bahasa Indonesia", "Tiếng Việt (Vietnamese)"],
            key="response_language"
        )

    # Maps the selectbox label to the language name shown in the notice below.
    language_map = {
        "English": "English",
        "中文 (Chinese)": "中文",
        "Bahasa Malaysia": "Malay",
        "ไทย (Thai)": "ไทย",
        "Bahasa Indonesia": "Indonesian",
        "Tiếng Việt (Vietnamese)": "Tiếng Việt"
    }
    selected_lang = language_map[response_language]

    if selected_lang != "English":
        st.info(f"🌐 Responses will be in **{selected_lang}**")

    def _use_example(example):
        # Button callbacks run before the rerun re-creates the widgets, which
        # is the only point where a keyed widget's state may be overwritten.
        st.session_state["search_query"] = example
        st.session_state["example_query"] = example

    # --- Query input ---
    query = st.text_area(
        "Your question:",
        height=80,
        placeholder="e.g., Master's in Malaysia under 40,000 RMB/year",
        key="search_query",
    )

    # --- Example queries ---
    with st.expander("💡 See Example Queries"):
        tab1, tab2 = st.tabs(["🧠 Complex Queries", "⚡ Simple Queries"])

        complex_examples = [
            "Show me universities in Malaysia for master's degrees with tuition under 40,000 RMB per year",
            "专科毕业,无雅思,想在马来西亚读硕士,学费不超过4万人民币/年",
            "Compare engineering programs in Thailand and Singapore under $15,000 per year",
            "Find MBA programs in ASEAN with GMAT requirements and scholarships available"
        ]
        simple_examples = [
            "What does IELTS stand for?",
            "Translate 'application deadline' to Chinese",
            "What is the difference between bachelor and master degree?",
            "How to say 'university' in Thai?"
        ]

        # Index-based keys are deterministic; hash() of a str varies between
        # interpreter processes because of hash randomisation.
        for index, ex in enumerate(complex_examples):
            tab1.button(ex, key=f"complex_{index}", on_click=_use_example, args=(ex,))

        for index, ex in enumerate(simple_examples):
            tab2.button(ex, key=f"simple_{index}", on_click=_use_example, args=(ex,))

    # --- Use clicked example ---
    # Also honours an example_query placed in session state by other code.
    if 'example_query' in st.session_state:
        query = st.session_state.example_query
        st.info(f"📝 Using example: {query}")
        del st.session_state.example_query

    # --- Optional filters ---
    # NOTE(review): the filter values are collected but not yet consumed by
    # the placeholder search below.
    with st.expander("🔧 Advanced Filters"):
        col1, col2, col3 = st.columns(3)
        budget_range = col1.select_slider(
            "Budget (USD/year)",
            options=["Any", "<10k", "10k-20k", "20k-30k", "30k-40k", ">40k"],
            value="Any"
        )
        study_level = col2.multiselect(
            "Study Level", ["Diploma", "Bachelor", "Master", "PhD"]
        )
        preferred_countries = col3.multiselect(
            "Preferred Countries",
            ["Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei"]
        )

    # --- Search button ---
    if st.button("🔍 Search", type="primary", disabled=not query.strip()):
        st.success("Searching...")  # Placeholder for your RAGSystem logic
|
pages/upload_documents.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from utils.rag_system import DocumentIngestion
|
3 |
+
|
4 |
+
def upload_documents_page():
    """Render the "Upload University Documents" page.

    Collects the university name, country, document type and primary
    language, accepts one or more PDF uploads and, when "Process Documents"
    is pressed, runs them through ``DocumentIngestion`` and reports the
    outcome. Any exception raised during ingestion is shown as an error
    message instead of propagating.
    """
    st.header("📄 Upload University Documents")
    st.write("Upload PDF documents containing university admission requirements, fees, and program information.")

    col1, col2 = st.columns(2)

    with col1:
        university_name = st.text_input("🏫 University Name", placeholder="e.g., National University of Singapore")
        country = st.selectbox(
            "🌏 Country",
            ["", "Singapore", "Malaysia", "Thailand", "Indonesia", "Philippines", "Vietnam", "Brunei", "Cambodia", "Laos", "Myanmar"]
        )

    with col2:
        document_type = st.selectbox(
            "📋 Document Type",
            ["admission_requirements", "tuition_fees", "program_catalog", "application_guide", "scholarship_info"]
        )
        # NOTE(review): the selected language is not passed to the ingestion
        # call below; confirm whether process_documents should receive it.
        language = st.selectbox(
            "🌐 Primary Language",
            ["English", "Chinese", "Malay", "Thai", "Indonesian", "Vietnamese", "Filipino", "Other"]
        )

    # File upload
    uploaded_files = st.file_uploader(
        "Choose PDF files",
        accept_multiple_files=True,
        type=['pdf'],
        help="Select one or more PDF files to upload"
    )

    if uploaded_files and st.button("🚀 Process Documents", type="primary"):
        # Reject names that consist only of whitespace as well as empty ones.
        university_name = university_name.strip()
        if not university_name or not country:
            st.error("Please provide university name and country.")
            return

        with st.spinner("Processing documents... This may take a few minutes."):
            try:
                # Initialize document ingestion
                doc_ingestion = DocumentIngestion()

                # Process documents
                documents = doc_ingestion.process_documents(
                    uploaded_files, university_name, country, document_type
                )

                if documents:
                    # Create or update vector store
                    vectorstore = doc_ingestion.create_vector_store(documents)

                    if vectorstore:
                        st.success(f"✅ Successfully processed {len(documents)} documents!")
                        st.info(f"Documents from {university_name} ({country}) have been added to the knowledge base.")

                        # Show processed files
                        with st.expander("📋 Processed Files"):
                            for doc in documents:
                                st.write(f"• **{doc.metadata['source']}**")
                                st.write(f"  - University: {doc.metadata['university']}")
                                st.write(f"  - Country: {doc.metadata['country']}")
                                st.write(f"  - Type: {doc.metadata['document_type']}")
                                st.write("---")
                    else:
                        # Previously this case produced no feedback at all.
                        st.error("Documents were processed but could not be added to the knowledge base.")
                else:
                    st.error("No documents were successfully processed.")

            except Exception as e:
                # Top-level UI boundary: surface the failure to the user.
                st.error(f"Error processing documents: {str(e)}")
|
styles.css
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Gradient banner shown at the top of the app */
.main-header {
    text-align: center;
    padding: 2rem 0;
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    margin: -1rem -1rem 2rem -1rem;
    border-radius: 10px;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

.stApp {
    background: var(--background-color);
}

/* Dark theme compatible containers */
.query-result {
    background: rgba(255, 255, 255, 0.05);
    backdrop-filter: blur(10px);
    border: 1px solid rgba(255, 255, 255, 0.1);
    padding: 1.5rem;
    border-radius: 15px;
    margin: 1rem 0;
    color: var(--text-color);
}

.source-doc {
    background: rgba(31, 119, 180, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-left: 4px solid #1f77b4;
    border-radius: 8px;
    margin: 0.5rem 0;
    color: var(--text-color);
}

.share-link {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(5px);
    padding: 1rem;
    border-radius: 10px;
    border-left: 4px solid #2ecc71;
    color: var(--text-color);
}

/* Model indicator boxes */
.model-info {
    background: rgba(52, 152, 219, 0.15);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #3498db;
    margin: 10px 0;
}

/* Language selection enhancement */
.language-selection {
    background: rgba(155, 89, 182, 0.1);
    backdrop-filter: blur(10px);
    padding: 15px;
    border-radius: 12px;
    border-left: 4px solid #9b59b6;
    margin: 10px 0;
}

/* Upload area enhancement */
.stFileUploader {
    background: rgba(230, 126, 34, 0.1);
    backdrop-filter: blur(10px);
    padding: 20px;
    border-radius: 15px;
    border: 2px dashed #e67e22;
}

.stFileUploader label {
    font-size: 1.2rem;
    font-weight: bold;
    color: var(--text-color);
}

/* Button enhancements */
.stButton > button {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    border: none;
    border-radius: 10px;
    padding: 0.6rem 1.5rem;
    font-weight: 600;
    transition: all 0.3s ease;
    box-shadow: 0 4px 15px 0 rgba(31, 38, 135, 0.37);
}

.stButton > button:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 20px 0 rgba(31, 38, 135, 0.5);
}

/* Sidebar enhancements */
/* .css-1d391kg is a generated class name that changes between Streamlit
   releases; the data-testid selector keeps the rule working when it does. */
.css-1d391kg,
section[data-testid="stSidebar"] {
    background: rgba(255, 255, 255, 0.02);
    backdrop-filter: blur(10px);
}

/* Info boxes */
.stInfo {
    background: rgba(52, 152, 219, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #3498db;
}

.stSuccess {
    background: rgba(46, 204, 113, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #2ecc71;
}

.stWarning {
    background: rgba(241, 196, 15, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #f1c40f;
}

.stError {
    background: rgba(231, 76, 60, 0.1);
    backdrop-filter: blur(10px);
    border-left: 4px solid #e74c3c;
}
|
test_system.py
CHANGED
@@ -37,7 +37,7 @@ def test_imports():
|
|
37 |
return False
|
38 |
|
39 |
try:
|
40 |
-
from rag_system import DocumentIngestion, RAGSystem
|
41 |
print("✅ RAG system modules imported successfully")
|
42 |
except ImportError as e:
|
43 |
print(f"❌ Failed to import RAG system: {e}")
|
@@ -107,7 +107,7 @@ def test_basic_functionality():
|
|
107 |
print("\n⚡ Testing basic functionality...")
|
108 |
|
109 |
try:
|
110 |
-
from rag_system import DocumentIngestion, SEALionLLM
|
111 |
|
112 |
# Test document ingestion initialization
|
113 |
doc_ingestion = DocumentIngestion()
|
|
|
37 |
return False
|
38 |
|
39 |
try:
|
40 |
+
from utils.rag_system import DocumentIngestion, RAGSystem
|
41 |
print("✅ RAG system modules imported successfully")
|
42 |
except ImportError as e:
|
43 |
print(f"❌ Failed to import RAG system: {e}")
|
|
|
107 |
print("\n⚡ Testing basic functionality...")
|
108 |
|
109 |
try:
|
110 |
+
from utils.rag_system import DocumentIngestion, SEALionLLM
|
111 |
|
112 |
# Test document ingestion initialization
|
113 |
doc_ingestion = DocumentIngestion()
|
utils/display.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from utils.rag_system import DocumentIngestion, RAGSystem, save_query_result, load_shared_query
|
3 |
+
|
4 |
+
def display_query_result(result, show_share_link=False):
    """Display a query result: model used, answer, share link and sources.

    Args:
        result: Mapping with an ``"answer"`` entry and, optionally,
            ``"model_used"``, ``"query_id"`` and ``"source_documents"``.
            Each source document is read through ``.metadata`` (a mapping)
            and ``.page_content`` (text).
        show_share_link: When true and the result carries a ``"query_id"``,
            a shareable URL is rendered below the answer.
    """
    # NOTE(review): each st.markdown call renders as its own element, so this
    # opening <div> (and the closing one at the end) probably does not wrap
    # the widgets in between - confirm in the browser.
    st.markdown('<div class="query-result">', unsafe_allow_html=True)

    # Show which model was used
    if result.get("model_used"):
        st.info(f"🤖 **Model Used:** {result['model_used']}")

    st.subheader("🎯 Answer")
    st.write(result["answer"])

    # Share link
    if show_share_link and result.get("query_id"):
        st.markdown("---")
        # NOTE(review): falls back to the local default address; verify the
        # resulting URL when the app is served behind a proxy or custom host.
        current_url = st.get_option("browser.serverAddress") or "localhost:8501"
        share_url = f"http://{current_url}?share={result['query_id']}"
        st.markdown(f"""
        <div class="share-link">
            <strong>🔗 Share this result:</strong><br>
            <code>{share_url}</code>
        </div>
        """, unsafe_allow_html=True)

        # An explicit key per query avoids a duplicate-widget error when more
        # than one result is rendered during the same script run.
        if st.button("📋 Copy Share Link", key=f"copy_share_{result['query_id']}"):
            st.code(share_url)

    # Source documents
    if result.get("source_documents"):
        st.markdown("---")
        st.subheader("📚 Sources")
        for i, doc in enumerate(result["source_documents"], 1):
            with st.expander(f"Source {i}: {doc.metadata.get('source', 'Unknown')}"):
                col1, col2 = st.columns([1, 2])
                with col1:
                    st.write(f"**University:** {doc.metadata.get('university', 'Unknown')}")
                    st.write(f"**Country:** {doc.metadata.get('country', 'Unknown')}")
                    st.write(f"**Type:** {doc.metadata.get('document_type', 'Unknown')}")
                with col2:
                    st.write("**Relevant Content:**")
                    # Truncate long passages to a 300-character preview.
                    content_preview = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
                    st.write(content_preview)

    st.markdown('</div>', unsafe_allow_html=True)
|
47 |
+
|
48 |
+
def _clear_query_params_and_rerun():
    """Drop all URL query parameters and restart the script run."""
    if hasattr(st, "query_params"):
        st.query_params.clear()
    else:
        # Older Streamlit releases only offer the experimental setter.
        st.experimental_set_query_params()
    st.rerun()


def display_shared_query(query_id):
    """Display a shared query result.

    Loads the stored result for ``query_id`` with ``load_shared_query`` and
    renders it through ``display_query_result``. When nothing is found an
    error is shown instead. Both outcomes offer a button that clears the
    URL query parameters and reruns the app.

    Args:
        query_id: Identifier of the shared query to load.
    """
    st.header("🔗 Shared Query Result")

    result_data = load_shared_query(query_id)

    if result_data:
        st.info(f"**Original Question:** {result_data['question']}")
        st.write(f"**Language:** {result_data['language']}")
        # The first 10 characters of the stored timestamp are shown as the
        # date (presumably an ISO-8601 string - TODO confirm).
        st.write(f"**Date:** {result_data['timestamp'][:10]}")

        # Create a mock result object for display: each stored source dict
        # becomes an object exposing .metadata and .page_content.
        mock_result = {
            "answer": result_data["answer"],
            "source_documents": [
                type('MockDoc', (), {
                    'metadata': source,
                    'page_content': source.get('content_preview', '')
                })() for source in result_data.get('sources', [])
            ]
        }

        display_query_result(mock_result, show_share_link=False)

        if st.button("🔍 Ask Your Own Question"):
            _clear_query_params_and_rerun()
    else:
        st.error("❌ Shared query not found or has expired.")
        if st.button("🏠 Go to Home"):
            _clear_query_params_and_rerun()
|
rag_system.py → utils/rag_system.py
RENAMED
File without changes
|
utils/translations.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# translations.py
|
2 |
+
|
3 |
+
# UI strings for the search page, keyed first by language code
# ("en", "zh", "ms", "th", "id", "vi") and then by string identifier.
# Every language provides the same set of identifiers.
translations = {
    # English
    "en": {
        "search_header": "🔍 Search University Information",
        "ask_prompt": "Ask about admissions, fees, scholarships, and programs.",
        "language_label": "Language",
        "your_question": "Your question:",
        "placeholder": "e.g., Master's in Malaysia under 40,000 RMB/year",
        "example_queries": "💡 See Example Queries",
        "complex_queries": "🧠 Complex Queries",
        "simple_queries": "⚡ Simple Queries",
        "advanced_filters": "🔧 Advanced Filters",
        "budget_label": "Budget (USD/year)",
        "study_level": "Study Level",
        "preferred_countries": "Preferred Countries",
        "search_button": "🔍 Search",
        "searching_msg": "Searching..."
    },
    # Chinese
    "zh": {
        "search_header": "🔍 搜索大学信息",
        "ask_prompt": "询问入学、学费、奖学金和课程信息。",
        "language_label": "语言",
        "your_question": "你的问题:",
        "placeholder": "例如:在马来西亚读硕士,每年学费低于4万人民币",
        "example_queries": "💡 查看示例问题",
        "complex_queries": "🧠 复杂查询",
        "simple_queries": "⚡ 简单查询",
        "advanced_filters": "🔧 高级筛选",
        "budget_label": "预算 (美元/年)",
        "study_level": "学历层次",
        "preferred_countries": "首选国家",
        "search_button": "🔍 搜索",
        "searching_msg": "正在搜索..."
    },
    # Malay
    "ms": {
        "search_header": "🔍 Cari Maklumat Universiti",
        "ask_prompt": "Tanya tentang kemasukan, yuran, biasiswa, dan program.",
        "language_label": "Bahasa",
        "your_question": "Soalan anda:",
        "placeholder": "contoh: Sarjana di Malaysia bawah 40,000 RMB/tahun",
        "example_queries": "💡 Lihat Contoh Soalan",
        "complex_queries": "🧠 Soalan Kompleks",
        "simple_queries": "⚡ Soalan Mudah",
        "advanced_filters": "🔧 Penapis Lanjutan",
        "budget_label": "Bajet (USD/tahun)",
        "study_level": "Peringkat Pengajian",
        "preferred_countries": "Negara Pilihan",
        "search_button": "🔍 Cari",
        "searching_msg": "Sedang mencari..."
    },
    # Thai
    "th": {
        "search_header": "🔍 ค้นหาข้อมูลมหาวิทยาลัย",
        "ask_prompt": "สอบถามเกี่ยวกับการรับเข้าเรียน ค่าธรรมเนียม ทุนการศึกษา และหลักสูตร",
        "language_label": "ภาษา",
        "your_question": "คำถามของคุณ:",
        "placeholder": "เช่น ปริญญาโทในมาเลเซียไม่เกิน 40,000 หยวน/ปี",
        "example_queries": "💡 ดูตัวอย่างคำถาม",
        "complex_queries": "🧠 คำถามซับซ้อน",
        "simple_queries": "⚡ คำถามง่าย",
        "advanced_filters": "🔧 ตัวกรองขั้นสูง",
        "budget_label": "งบประมาณ (USD/ปี)",
        "study_level": "ระดับการศึกษา",
        "preferred_countries": "ประเทศที่ต้องการ",
        "search_button": "🔍 ค้นหา",
        "searching_msg": "กำลังค้นหา..."
    },
    # Indonesian
    "id": {
        "search_header": "🔍 Cari Informasi Universitas",
        "ask_prompt": "Tanyakan tentang penerimaan, biaya, beasiswa, dan program.",
        "language_label": "Bahasa",
        "your_question": "Pertanyaan Anda:",
        "placeholder": "contoh: Magister di Malaysia di bawah 40,000 RMB/tahun",
        "example_queries": "💡 Lihat Contoh Pertanyaan",
        "complex_queries": "🧠 Pertanyaan Kompleks",
        "simple_queries": "⚡ Pertanyaan Sederhana",
        "advanced_filters": "🔧 Filter Lanjutan",
        "budget_label": "Anggaran (USD/tahun)",
        "study_level": "Tingkat Pendidikan",
        "preferred_countries": "Negara Pilihan",
        "search_button": "🔍 Cari",
        "searching_msg": "Mencari..."
    },
    # Vietnamese
    "vi": {
        "search_header": "🔍 Tìm kiếm Thông tin Đại học",
        "ask_prompt": "Hỏi về tuyển sinh, học phí, học bổng và chương trình học.",
        "language_label": "Ngôn ngữ",
        "your_question": "Câu hỏi của bạn:",
        "placeholder": "ví dụ: Thạc sĩ tại Malaysia dưới 40,000 RMB/năm",
        "example_queries": "💡 Xem Câu hỏi Mẫu",
        "complex_queries": "🧠 Câu hỏi Phức tạp",
        "simple_queries": "⚡ Câu hỏi Đơn giản",
        "advanced_filters": "🔧 Bộ lọc Nâng cao",
        "budget_label": "Ngân sách (USD/năm)",
        "study_level": "Trình độ Học vấn",
        "preferred_countries": "Quốc gia Ưu tiên",
        "search_button": "🔍 Tìm kiếm",
        "searching_msg": "Đang tìm kiếm..."
    }
}
|