sam2ai committed on
Commit
ff8635f
1 Parent(s): 6e5992a

Synced repo using 'sync_with_huggingface' Github Action

Files changed (1)
  1. app.py +359 -41
app.py CHANGED
@@ -1,42 +1,360 @@
 
 import streamlit as st
- import pandas as pd
- import numpy as np
-
- st.title('Uber pickups in NYC')
-
- DATE_COLUMN = 'date/time'
- DATA_URL = ('https://s3-us-west-2.amazonaws.com/'
-             'streamlit-demo-data/uber-raw-data-sep14.csv.gz')
-
- @st.cache_resource
- def load_data(nrows):
-     data = pd.read_csv(DATA_URL, nrows=nrows)
-     lowercase = lambda x: str(x).lower()
-     data.rename(lowercase, axis='columns', inplace=True)
-     data[DATE_COLUMN] = pd.to_datetime(data[DATE_COLUMN])
-     return data
-
- data_load_state = st.text('Loading data...')
- data = load_data(10000)
- data_load_state.text("Done! (using st.cache)")
-
- if st.checkbox('Show raw data'):
-     st.subheader('Raw data')
-     st.write(data)
-
- st.subheader('Number of pickups by hour')
- hist_values = np.histogram(data[DATE_COLUMN].dt.hour, bins=24, range=(0,24))[0]
- st.bar_chart(hist_values)
-
- # Some number in the range 0-23
- hour_to_filter = st.slider('hour', 0, 23, 17)
- filtered_data = data[data[DATE_COLUMN].dt.hour == hour_to_filter]
-
- st.subheader('Map of all pickups at %s:00' % hour_to_filter)
- st.map(filtered_data)
-
- uploaded_file = st.file_uploader("Choose a file")
- if uploaded_file is not None:
-     st.write(uploaded_file.name)
-     bytes_data = uploaded_file.getvalue()
-     st.write(len(bytes_data), "bytes")
+ import justext
 import streamlit as st
+ from lxml import etree
+ # import streamlit.components.v1 as components
+
+ # File Processing pkgs
+ from PIL import Image
+ import requests
+ # import xml.dom.minidom
+ from bs4 import BeautifulSoup
+ # import json
+ import docx2txt
+ # import textract
+ from PyPDF2 import PdfFileReader
+ import pdfplumber
+ import os
+
+
+
+ # ---- LOAD ASSETS ----
+ img_page_icon = Image.open("images/web_icon.jpeg")
+
+ # Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
+ st.set_page_config(page_title="OdiaGenAI ", page_icon=img_page_icon, layout="wide")
+
+
+ # Load CSS file
+ def load_css(file_path):
+     with open(file_path) as f:
+         st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
+
+
+ # Load CSS file
+ load_css('styles2.css')
+
+
+ # ----- FUNCTIONS ----
+ # function to check whether the url is a sitemap or not
+ def check_sitemap(url):
+     # Check the URL's ending
+     if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
+         try:
+             # Parse the content as XML
+             response = requests.get(url)
+             xml_content = etree.fromstring(response.content)
+             # Check for sitemap-specific elements
+             if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
+                 return True
+         except etree.XMLSyntaxError:
+             pass
+
+     # Additional conditions for identifying sitemaps
+     if 'sitemap' in url.lower():
+         # Perform additional checks specific to the website's structure or naming conventions
+         return True
+
+     return False
+
+
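A side note on the root-tag check above: lxml keeps the XML namespace in Element.tag, so a standards-compliant sitemap parses to '{http://www.sitemaps.org/schemas/sitemap/0.9}urlset' and the plain string comparison only matches documents served without a namespace. A minimal namespace-tolerant sketch (the helper name is_sitemap_root is illustrative and not part of the commit):

    from lxml import etree

    def is_sitemap_root(xml_bytes):
        # Compare only the local part of the root tag, ignoring any namespace.
        try:
            root = etree.fromstring(xml_bytes)
        except etree.XMLSyntaxError:
            return False
        return etree.QName(root).localname in ("urlset", "sitemapindex")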
+ # function to get urls from the site map and extract those data
+ def extract_urls_from_sitemaps(xml_url):
+     # Make a GET request to the URL and extract the xml content
+     response = requests.get(xml_url)
+
+     soup = BeautifulSoup(response.text, 'xml')
+     extracted_urls = []
+
+     # check if the sitemap contains nested sitemaps
+     sitemap_tags = soup.find_all('sitemap')
+     if sitemap_tags:
+         # Process nested sitemaps
+         for sitemap_tag in sitemap_tags:
+             print("sitemap_tags:", sitemap_tag)
+             nested_url = sitemap_tag.find('loc').text
+             print('nested_url:', nested_url)
+             nested_urls = extract_urls_from_sitemaps(nested_url)
+             extracted_urls.extend(nested_urls)
+     else:
+         # Extract URLs from the current sitemap
+         loc_tags = soup.find_all('loc')
+         for loc_tag in loc_tags:
+             # if loc_tag.parent.name != 'image':
+             url = loc_tag.text
+             if url.endswith('.pdf') or url.endswith('.jpg') or url.endswith('.jpeg'):
+                 print(f"url skipped because it is a {url.split('.')[-1]}")
+             else:
+                 print('url:', url)
+                 extracted_urls.append(url)
+
+     return extracted_urls
+
+
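extract_urls_from_sitemaps recurses through nested <sitemap> entries and flattens every <loc> URL into a single list, skipping direct links to PDFs and images. A rough usage sketch (the sitemap URL is a placeholder; extract_data_from_url_ and valid_url are the helpers defined further down in this file):

    sitemap_url = "https://example.com/sitemap.xml"  # placeholder
    urls = extract_urls_from_sitemaps(sitemap_url)
    print(len(urls), "URLs discovered")
    # Pull boilerplate-free text from the first few pages only.
    corpus = "".join(extract_data_from_url_(u) for u in urls[:5] if valid_url(u))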
+ # function to check whether the entered url is valid
+ def valid_url(url):
+     try:
+         # Make a GET request to the URL and extract the text content
+         response = requests.get(url)
+         if response.status_code == 200:
+             return True
+
+     except requests.exceptions.RequestException as e:
+         return False
+
+
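As written, valid_url returns None (falsy) rather than an explicit False on non-200 responses, and the unbounded requests.get can block the app on slow hosts. A slightly tightened sketch, assuming a 10-second timeout is acceptable for this use case:

    def valid_url(url, timeout=10):
        # Return an explicit bool and bound how long a slow host can stall the app.
        try:
            response = requests.get(url, timeout=timeout)
            return response.status_code == 200
        except requests.exceptions.RequestException:
            return False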
+ # function to create a custom stoplist for justext
+ def custom_stoplist():
+     odia_stopwords = [
+         "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
+         "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
+         "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡ଼ିକ", "ସେମାନଙ୍କର",
+         "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
+         "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
+         "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
+         "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
+         "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
+         "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
+         "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
+         "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
+         "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
+         "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
+         "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
+     ]
+     return frozenset(odia_stopwords)
+
+
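justext only bundles stoplists for the languages it ships with (justext.get_stoplists() lists them); the hard-coded frozenset above sidesteps that and mirrors the type returned by justext's own helper. For comparison, the bundled-stoplist path looks like this (English is used purely as an illustration):

    import justext
    stopwords = justext.get_stoplist("English")          # bundled frozenset of stop words
    paragraphs = justext.justext(html_bytes, stopwords)   # html_bytes: raw page content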
+ # function to extract data from url using justext
+ def extract_data_from_url_(url):
+     response = requests.get(url)
+     response.raise_for_status()
+     page = response.content
+
+     data_url = ""
+     para = ""
+     paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
+     for paragraph in paragraphs:
+         if not paragraph.is_boilerplate:
+             para = para + '\n' + paragraph.text
+
+     data_url = ('\n\nFrom url:' + url + '\n' + para + '\n')
+
+     return data_url
+
+
+ sitemap_data = ""
+
+
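Reading the positional arguments in the justext.justext call above against the justext signature, they appear to correspond to length_low=70, length_high=140, stopwords_low=0.0, stopwords_high=0.02, max_link_density=0.5, max_heading_distance=150 and no_headings=False; driving the stop-word thresholds toward zero largely disables the stop-word density test, a common workaround when the stoplist for a low-resource language is small. An equivalent keyword-argument form (a readability sketch, not part of the commit):

    paragraphs = justext.justext(
        page,
        custom_stoplist(),
        length_low=70,            # short blocks need stronger evidence to be kept
        length_high=140,          # blocks longer than this are judged mostly by length
        stopwords_low=0.0,        # near-zero thresholds effectively disable
        stopwords_high=0.02,      # the stop-word density test
        max_link_density=0.5,     # drop paragraphs that are mostly links
        max_heading_distance=150,
        no_headings=False,
    )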
+ # function to get the text from pdf using PyPDF2
+ def read_pdf(file):
+     pdfReader = PdfFileReader(file)
+     count = pdfReader.numPages
+     # all_page_text = ""
+     # for i in range(count):
+     #     page = pdfReader.getPage(i)
+     #     all_page_text += page.extractText()
+     #
+     # return all_page_text
+     return count
+
+
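PdfFileReader and numPages belong to the PyPDF2 1.x/2.x API and were removed in PyPDF2 3.x, so whether read_pdf runs depends on the version pinned for this Space. A sketch of the modern equivalent, which also restores the commented-out text extraction:

    from PyPDF2 import PdfReader

    def read_pdf(file):
        reader = PdfReader(file)
        all_page_text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer.
            all_page_text += (page.extract_text() or "") + "\n"
        return all_page_text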
+ # function to run the enter button
+ def run_function(url, documents):
+     data = ""
+     # Check if the user has provided a URL
+     if url:
+         if valid_url(url):
+             data = extract_data_from_url_(url)
+             st.text_area("Extracted Text", value=data, height=200)
+             # return extract status, and the data extracted
+             return True, data
+         else:
+             return False, data
+
+
+     # Check if the user has provided a document
+     elif documents is not None:
+         for document in documents:
+             document_details = {
+                 "filename": document.name,
+                 "filetype": document.type,
+                 "filesize": document.size
+             }
+             st.write(document_details)
+
+             # Extract content from the txt file
+             if document.type == "text/plain":
+                 # Read as bytes
+                 data += str(document.read(), "utf-8")
+
+             # Extract content from the pdf file
+             elif document.type == "application/pdf":
+                 # using PyPDF2
+                 # data += read_pdf(document)
+
+                 # using pdfplumber
+                 try:
+                     with pdfplumber.open(document) as pdf:
+                         all_text = ""
+                         for page in pdf.pages:
+                             text = page.extract_text()
+                             all_text += text + "\n"
+                         data += all_text
+                 except requests.exceptions.RequestException as e:
+                     st.write("None")
+
+             # Extract content from the docx file
+             elif document.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+                 data += docx2txt.process(document)
+
+         # Display the extracted text content from file
+         st.write("attached")
+         st.text_area("Extracted Text", value=data, height=200)
+         # return extract status, and the data extracted
+         return True, data
+
+     else:
+         st.error("Error: An error occurred while fetching content.")
+         # return extract status, and the data extracted
+         return False, data
+
+
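Two caveats in the PDF branch of run_function above: pdfplumber's extract_text() can return None for image-only pages (which would make the text + "\n" concatenation raise TypeError), and pdfplumber surfaces pdfminer parsing errors rather than requests.exceptions.RequestException, so the except clause is unlikely to ever fire. A hedged variant of just that branch:

    try:
        with pdfplumber.open(document) as pdf:
            all_text = ""
            for page in pdf.pages:
                text = page.extract_text() or ""  # guard against pages with no text layer
                all_text += text + "\n"
            data += all_text
    except Exception as exc:  # pdfplumber/pdfminer parsing errors, not RequestException
        st.error(f"Could not read {document.name}: {exc}")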
+ def main():
+     # ---- HEADER SECTION ----
+     with st.container():
+         st.subheader("Hi!! :wave:")
+         st.write("##")
+         st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
+                     unsafe_allow_html=True)
+         st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
+         # st.title("Odia Generative AI")
+
+         st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)
+
+     # ---- BODY SECTION ----
+     with st.container():
+         st.subheader("Collecting monolingual data (Odia or any Indic Languages)")
+
+         # dividing the body section into 3 columns for url, attach button and enter button
+         col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
+         # url/xml
+         with col1:
+
+             url_or_xml = st.text_input(label='', placeholder="Enter URL")
+             is_a_sitemap = check_sitemap(url_or_xml)
+
+         # attached files
+         with col2:
+
+             documents = st.file_uploader("", type=["pdf", "txt", "docx"], accept_multiple_files=True)
+             if not documents:
+                 documents = None
+             else:
+                 for doc in documents:
+                     if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
+                         # if documents is not the relevant type
+                         st.error("Unsupported file: " + doc.name)
+
+         # Initialize state of button Enter
+         with col3:
+             st.write('##')
+             if "button_enter" not in st.session_state:
+                 st.session_state.button_enter = False
+
+             if st.button("Enter"):
+                 st.session_state.button_enter = True
+                 # st.write("session state true")
+
+         if "extracted" not in st.session_state:
+             st.session_state.extracted = False
+         data = ""
+
+         # the enter button
+         if st.session_state.button_enter:
+             # check if it is a sitemap or not
+             if is_a_sitemap:
+                 if "Initial" not in st.session_state:
+                     st.session_state.Initial = True
+                 # check whether its the initial state
+                 if st.session_state.Initial == True:
+                     # print("\n\n\n\n1)Initial State", st.session_state.Initial, "\n\n\n\n\n")
+                     xml = url_or_xml
+                     st.write("It is a sitemap")
+                     stored_sitemap_urls = extract_urls_from_sitemaps(xml)
+                     print('\nno. of urls: ', len(stored_sitemap_urls))
+
+                     if stored_sitemap_urls:
+                         print(stored_sitemap_urls)
+                         for sitemap_url in stored_sitemap_urls:
+
+                             if valid_url(sitemap_url):
+                                 print(sitemap_url)
+                                 # using justext to extract data
+                                 data = data + extract_data_from_url_(sitemap_url)
+                             else:
+                                 st.error("Couldn't extract data from " + sitemap_url)
+
+                         if "sitemap_data" not in st.session_state:
+                             st.session_state.sitemap_data = data
+                         # print("\n\n\nst.session.data ", st.session_state.sitemap_data)
+                         # print("\n\n\n\nRUNNING \n\n\n\n")
+                         st.session_state.Initial = False
+                         print("\n\n\n\n2)Initial State", st.session_state.Initial, "\n\n\n\n\n")
+                         st.session_state.extracted = True
+                         # st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
+
+                     else:
+                         st.error("Error: Invalid sitemap.")
+
+
+             else:
+                 url = url_or_xml
+                 st.session_state.extracted, data = run_function(url, documents)
+
+             if st.session_state.extracted:
+                 if is_a_sitemap:
+                     st.text_area("Extracted Text", value=st.session_state.sitemap_data, height=300)
+                 col1, col2 = st.columns([0.5, 0.5])
+
+                 with col1:
+                     saved_button = False
+                     if st.button("Save", key="b_save"):
+                         file_name = "output.txt"
+
+                         # Define the folder path
+                         folder_path = "extracted data"
+
+                         # Create the folder if it doesn't exist
+                         os.makedirs(folder_path, exist_ok=True)
+
+                         # Define the file path
+                         file_path = os.path.join(folder_path, file_name)
+                         if is_a_sitemap:
+                             saved_data = st.session_state.sitemap_data
+                             # Save string variable to file
+                             with open(file_path, "w", encoding="utf-8") as file:
+                                 file.write(saved_data)
+                         else:
+                             with open(file_path, "w", encoding="utf-8") as file:
+                                 file.write(data)
+                         saved_button = True
+
+                 with col2:
+                     if st.button("Clear"):
+                         st.session_state.button_enter = False
+                         st.session_state.Initial = True
+                         st.session_state.extracted = False
+                         if 'sitemap_data' in st.session_state:
+                             del st.session_state['sitemap_data']
+                         st.session_state.button_enter = False
+                         st.experimental_rerun()
+
+                 if saved_button:
+                     # Confirmation message
+                     st.success(f"File saved as {file_name} in the current directory.")
+
+             else:
+                 st.warning("Data not extracted")
+
+
+ if __name__ == "__main__":
+     main()
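One forward-compatibility note on the Clear handler in main(): st.experimental_rerun has since been deprecated in favour of st.rerun (Streamlit 1.27+), so depending on the Streamlit version pinned for this Space the call may need to become:

    st.rerun()  # replaces st.experimental_rerun() on newer Streamlit releases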