Spaces:
Runtime error
Synced repo using 'sync_with_huggingface' Github Action
app.py
CHANGED
@@ -1,42 +1,360 @@
(removed: the previous 42-line app.py; only the fragments "import streamlit as st", "import", and "st." are recoverable from this diff view)

(added: the new 360-line app.py, shown below)
import justext
import streamlit as st
from lxml import etree
# import streamlit.components.v1 as components

# File Processing pkgs
from PIL import Image
import requests
# import xml.dom.minidom
from bs4 import BeautifulSoup
# import json
import docx2txt
# import textract
from PyPDF2 import PdfFileReader
import pdfplumber
import os


# ---- LOAD ASSETS ----
img_page_icon = Image.open("images/web_icon.jpeg")

# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")


# Load CSS file
def load_css(file_path):
    with open(file_path) as f:
        st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)


# Load CSS file
load_css('styles2.css')

# ----- FUNCTIONS ----
# function to check whether the url is a sitemap or not
def check_sitemap(url):
    # Check the URL's ending
    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
        try:
            # Fetch and parse the content as XML
            response = requests.get(url)
            xml_content = etree.fromstring(response.content)
            # Check for sitemap-specific elements; sitemap tags are namespaced,
            # so compare the local name only
            if etree.QName(xml_content).localname in ('urlset', 'sitemapindex'):
                return True
        except (etree.XMLSyntaxError, requests.exceptions.RequestException):
            pass

    # Additional conditions for identifying sitemaps
    if 'sitemap' in url.lower():
        # Perform additional checks specific to the website's structure or naming conventions
        return True

    return False

# function to get urls from the sitemap and extract their data
def extract_urls_from_sitemaps(xml_url):
    # Make a GET request to the URL and extract the xml content
    response = requests.get(xml_url)

    soup = BeautifulSoup(response.text, 'xml')
    extracted_urls = []

    # check if the sitemap contains nested sitemaps
    sitemap_tags = soup.find_all('sitemap')
    if sitemap_tags:
        # Process nested sitemaps recursively
        for sitemap_tag in sitemap_tags:
            print("sitemap_tag:", sitemap_tag)
            nested_url = sitemap_tag.find('loc').text
            print('nested_url:', nested_url)
            nested_urls = extract_urls_from_sitemaps(nested_url)
            extracted_urls.extend(nested_urls)
    else:
        # Extract URLs from the current sitemap
        loc_tags = soup.find_all('loc')
        for loc_tag in loc_tags:
            # if loc_tag.parent.name != 'image':
            url = loc_tag.text
            if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
                print(f"url skipped because it is a {url.split('.')[-1]}")
            else:
                print('url:', url)
                extracted_urls.append(url)

    return extracted_urls

# function to check whether the entered url is valid
def valid_url(url):
    try:
        # Make a GET request to the URL and check the response status
        response = requests.get(url)
        if response.status_code == 200:
            return True
        # treat any non-200 response as invalid
        return False

    except requests.exceptions.RequestException:
        return False

# function to create a custom stoplist for justext
def custom_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡ଼ିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)

# function to extract data from url using justext
def extract_data_from_url_(url):
    response = requests.get(url)
    response.raise_for_status()
    page = response.content

    data_url = ""
    para = ""
    # positional jusText settings: length_low=70, length_high=140, stopwords_low=0.0,
    # stopwords_high=0.02, max_link_density=0.5, max_heading_distance=150, no_headings=False
    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            para = para + '\n' + paragraph.text

    data_url = ('\n\nFrom url:' + url + '\n' + para + '\n')

    return data_url


sitemap_data = ""

# function to get the text from pdf using PyPDF2
# (note: PdfFileReader/numPages are the legacy PyPDF2 1.x/2.x API; only the page count is returned)
def read_pdf(file):
    pdfReader = PdfFileReader(file)
    count = pdfReader.numPages
    # all_page_text = ""
    # for i in range(count):
    #     page = pdfReader.getPage(i)
    #     all_page_text += page.extractText()
    #
    # return all_page_text
    return count

# function to run the enter button
def run_function(url, documents):
    data = ""
    # Check if the user has provided a URL
    if url:
        if valid_url(url):
            data = extract_data_from_url_(url)
            st.text_area("Extracted Text", value=data, height=200)
            # return extract status, and the data extracted
            return True, data
        else:
            return False, data

    # Check if the user has provided a document
    elif documents is not None:
        for document in documents:
            document_details = {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size
            }
            st.write(document_details)

            # Extract content from the txt file
            if document.type == "text/plain":
                # Read as bytes and decode
                data += str(document.read(), "utf-8")

            # Extract content from the pdf file
            elif document.type == "application/pdf":
                # using PyPDF2
                # data += read_pdf(document)

                # using pdfplumber
                try:
                    with pdfplumber.open(document) as pdf:
                        all_text = ""
                        for page in pdf.pages:
                            text = page.extract_text()
                            # extract_text() may return None for image-only pages
                            all_text += (text or "") + "\n"
                        data += all_text
                except Exception:
                    st.write("None")

            # Extract content from the docx file
            elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
                data += docx2txt.process(document)

        # Display the extracted text content from the files
        st.write("attached")
        st.text_area("Extracted Text", value=data, height=200)
        # return extract status, and the data extracted
        return True, data

    else:
        st.error("Error: An error occurred while fetching content.")
        # return extract status, and the data extracted
        return False, data

def main():
    # ---- HEADER SECTION ----
    with st.container():
        st.subheader("Hi!! :wave:")
        st.write("##")
        st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
                    unsafe_allow_html=True)
        st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
        # st.title("Odia Generative AI")

        st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)

    # ---- BODY SECTION ----
    with st.container():
        st.subheader("Collecting monolingual data (Odia or any Indic Languages)")

        # dividing the body section into 3 columns for url, attach button and enter button
        col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
        # url/xml
        with col1:
            url_or_xml = st.text_input(label='', placeholder="Enter URL")
            is_a_sitemap = check_sitemap(url_or_xml)

        # attached files
        with col2:
            documents = st.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True)
            if not documents:
                documents = None
            else:
                for doc in documents:
                    if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                        # the document is not a supported type
                        st.error("Unsupported file: " + doc.name)

        # Initialize state of button Enter
        with col3:
            st.write('##')
            if "button_enter" not in st.session_state:
                st.session_state.button_enter = False

            if st.button("Enter"):
                st.session_state.button_enter = True
                # st.write("session state true")

        if "extracted" not in st.session_state:
            st.session_state.extracted = False
        data = ""

        # the enter button
        if st.session_state.button_enter:
            # check whether it is a sitemap or not
            if is_a_sitemap:
                if "Initial" not in st.session_state:
                    st.session_state.Initial = True
                # check whether it is the initial state
                if st.session_state.Initial == True:
                    # print("\n\n\n\n1)Initial State", st.session_state.Initial, "\n\n\n\n\n")
                    xml = url_or_xml
                    st.write("It is a sitemap")
                    stored_sitemap_urls = extract_urls_from_sitemaps(xml)
                    print('\nno. of urls: ', len(stored_sitemap_urls))

                    if stored_sitemap_urls:
                        print(stored_sitemap_urls)
                        for sitemap_url in stored_sitemap_urls:
                            if valid_url(sitemap_url):
                                print(sitemap_url)
                                # using justext to extract data
                                data = data + extract_data_from_url_(sitemap_url)
                            else:
                                st.error("Couldn't extract data from " + sitemap_url)

                        if "sitemap_data" not in st.session_state:
                            st.session_state.sitemap_data = data
                        # print("\n\n\nst.session.data ", st.session_state.sitemap_data)
                        # print("\n\n\n\nRUNNING \n\n\n\n")
                        st.session_state.Initial = False
                        print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
                        st.session_state.extracted = True
                        # st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)

                    else:
                        st.error("Error: Invalid sitemap.")

            else:
                url = url_or_xml
                st.session_state.extracted, data = run_function(url, documents)

            if st.session_state.extracted:
                if is_a_sitemap:
                    st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
                col1, col2 = st.columns([0.5, 0.5])

                with col1:
                    saved_button = False
                    if st.button("Save", key="b_save"):
                        file_name = "output.txt"

                        # Define the folder path
                        folder_path = "extracted data"

                        # Create the folder if it doesn't exist
                        os.makedirs(folder_path, exist_ok=True)

                        # Define the file path
                        file_path = os.path.join(folder_path, file_name)
                        if is_a_sitemap:
                            saved_data = st.session_state.sitemap_data
                            # Save the extracted text to the file
                            with open(file_path, "w", encoding="utf-8") as file:
                                file.write(saved_data)
                        else:
                            with open(file_path, "w", encoding="utf-8") as file:
                                file.write(data)
                        saved_button = True

                with col2:
                    if st.button("Clear"):
                        st.session_state.button_enter = False
                        st.session_state.Initial = True
                        st.session_state.extracted = False
                        if 'sitemap_data' in st.session_state:
                            del st.session_state['sitemap_data']
                        st.experimental_rerun()

                if saved_button:
                    # Confirmation message
                    st.success(f"File saved as {file_name} in the '{folder_path}' folder.")

            else:
                st.warning("Data not extracted")


if __name__ == "__main__":
    main()