Spaces:

OdiaGenAI
/

web_scrapper_odia

Runtime error

App Files Files Community

web_scrapper_odia / app.py

sam2ai

Update app.py

efcdc3b over 1 year ago

raw

history blame

5.49 kB

	# installed pip packages
	# pip install streamlit
	# pip install beautifulsoup4
	# pip install docx2txt
	# pip install pypdf2
	# pip install pdfplumber

	import streamlit as st

	# File Processing pkgs
	from PIL import Image
	import requests
	from bs4 import BeautifulSoup
	import json
	import docx2txt
	# import textract
	from PyPDF2 import PdfFileReader
	import pdfplumber


	# ---- LOAD ASSETS ----
	# Image handed to Streamlit as the page icon; opened from a relative path.
	img_page_icon = Image.open("web_icon.jpeg")

	# Page-wide configuration (title, icon, wide layout).
	# Emoji shortcodes for icons: https://www.webfx.com/tools/emoji-cheat-sheet/
	st.set_page_config(
	    page_title="OdiaGenAI ",
	    page_icon=img_page_icon,
	    layout="wide",
	)

	# Load CSS file
	def load_css(file_path):
	    """Inject the contents of a CSS file into the Streamlit page.

	    The file is read as UTF-8 (instead of the platform's default encoding,
	    which varies between machines) and emitted inside a ``<style>`` element
	    through ``st.markdown``.

	    Args:
	        file_path: Path of the stylesheet to read.

	    Raises:
	        OSError: If the file cannot be opened.
	    """
	    with open(file_path, encoding="utf-8") as css_file:
	        css = css_file.read()
	    # unsafe_allow_html is required so the <style> tag is not escaped.
	    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)

	# Apply the app's stylesheet: styles.css is resolved relative to the
	# working directory and passed to load_css().
	load_css('styles.css')



	# ---- HEADER SECTION ----
	# Greeting, a two-part project tagline and the page title. The tagline and
	# title are emitted as raw HTML (hence unsafe_allow_html=True); the 'text'
	# and 'title' class names presumably match rules in the loaded stylesheet
	# -- confirm against styles.css.
	with st.container():
	st.subheader("Hi, username :wave:")
	# "##" renders as an empty markdown heading, i.e. it is used as a spacer.
	st.write("##")
	st.markdown("<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
	unsafe_allow_html=True)
	st.markdown("<h5>Generative AI and LLM for the Odia Language.</h5>", unsafe_allow_html=True)
	# st.title("Odia Generative AI")

	# HTML title used in place of the st.title() call disabled above.
	st.markdown("<h1 class='title'>Odia Generative AI</h1>", unsafe_allow_html=True)



	# ---- BODY SECTION ----
	# Opens the body container and prints the section heading for the
	# data-collection tool.
	with st.container():
	st.subheader("Collecting monolingual data (Odia or any Indic Languages)")

	# ----- FUNCTIONS ----
	# function to get the page count of a pdf using PyPDF2 (text extraction is currently commented out)
	def read_pdf(file):
	    """Return the number of pages in a PDF, as reported by PyPDF2.

	    Despite the name, no text is extracted: the per-page text extraction
	    that used to live here is disabled, so only the page count is returned.

	    Args:
	        file: PDF source handed straight to ``PdfFileReader``.

	    Returns:
	        The reader's ``numPages`` value.
	    """
	    # NOTE(review): PdfFileReader / numPages look like the legacy PyPDF2
	    # names; newer releases are believed to use PdfReader / len(reader.pages).
	    # Confirm against the installed PyPDF2 version.
	    reader = PdfFileReader(file)
	    return reader.numPages

	# function to run the enter button
	def run_function(url, documents):
	    """Extract text from a news URL or from uploaded documents and display it.

	    A non-empty ``url`` takes precedence: the page is downloaded, the headline
	    is read from its JSON-LD ``<script>`` tag and the body from the ``<p>``
	    tags inside ``div.oi-article-lt``.  Otherwise every uploaded document is
	    read according to its MIME type: plain text is decoded as UTF-8, PDFs go
	    through pdfplumber, images are skipped, and anything else is handed to
	    docx2txt.  The collected text is shown in a Streamlit text area; problems
	    are reported with ``st.error`` instead of being raised.

	    Args:
	        url: Address of the article to scrape; falsy to use ``documents``.
	        documents: Uploaded file objects exposing ``name``, ``type``,
	            ``size`` and ``read()``; may be ``None`` or an empty list.
	    """
	    if url:
	        try:
	            # A timeout keeps a stalled server from hanging the app forever.
	            response = requests.get(url, timeout=30)
	        except requests.exceptions.RequestException:
	            st.error("Error: An exception occurred while fetching content from the URL.")
	            return
	        if response.status_code != 200:
	            st.error("Error: Unable to fetch content from the provided URL.")
	            return

	        soup = BeautifulSoup(response.text, 'html.parser')
	        # The headline lives in the JSON-LD metadata, the story in the article div.
	        heading = soup.find('script', type='application/ld+json')
	        body = soup.find('div', class_='oi-article-lt')
	        if heading is None or body is None:
	            # Pages from other sites/layouts lack these elements; report it
	            # instead of crashing on a None attribute access.
	            st.error("Error: The page does not have the expected article structure.")
	            return
	        try:
	            headline = json.loads(heading.string)['headline']
	        except (KeyError, TypeError, ValueError):
	            # Empty script tag, malformed JSON, or JSON without a 'headline'.
	            st.error("Error: Unable to read the headline from the page.")
	            return

	        paragraphs = '\n'.join(p.get_text(strip=True) for p in body.find_all('p'))
	        news = f"{headline}\n\n{paragraphs}"

	        # Display the extracted text content from url
	        st.text_area("Extracted Text", value=news, height=200)

	    # Truthiness check: an empty upload list must fall through to the error
	    # branch rather than rendering an empty text area.
	    elif documents:
	        parts = []
	        for document in documents:
	            st.write({
	                "filename": document.name,
	                "filetype": document.type,
	                "filesize": document.size,
	            })
	            mime = document.type or ""

	            if mime == "text/plain":
	                # Uploaded content arrives as bytes.
	                parts.append(str(document.read(), "utf-8"))

	            elif mime == "application/pdf":
	                try:
	                    with pdfplumber.open(document) as pdf:
	                        # extract_text() may give None for a page without
	                        # text in some pdfplumber versions; treat as empty so
	                        # one such page does not discard the whole document.
	                        pages = [page.extract_text() or "" for page in pdf.pages]
	                except Exception as exc:  # UI boundary: report, keep going
	                    st.error(f"Error: Unable to extract text from {document.name}: {exc}")
	                else:
	                    parts.append("".join(text + "\n" for text in pages))

	            elif mime.startswith("image/"):
	                # Images are not documents docx2txt can open.
	                st.error(f"Error: Text extraction from images is not supported ({document.name}).")

	            else:
	                # Remaining uploads are treated as docx files.
	                try:
	                    parts.append(docx2txt.process(document))
	                except Exception as exc:  # UI boundary: report, keep going
	                    st.error(f"Error: Unable to extract text from {document.name}: {exc}")

	        # Display the extracted text content from file
	        st.text_area("Extracted Text", value="".join(parts), height=200)

	    else:
	        st.error("Error: An error occurred while fetching content .")



	# Input row: three columns with relative widths 0.6 / 0.2 / 0.2 holding the
	# URL box, the file uploader and the Enter button.
	col1, col2, col3 = st.columns([0.6, 0.2, 0.2])
	with col1:
	url = st.text_input(label='', placeholder="Enter URL")

	with col2:
	# NOTE(review): the uploader accepts png/jpg/jpeg in addition to
	# pdf/txt/docx, and with accept_multiple_files=True it presumably returns a
	# list (empty when nothing is uploaded) -- confirm run_function copes.
	documents = st.file_uploader("", type=["png", "jpg", "jpeg", "pdf", "txt", "docx"], accept_multiple_files=True)

	with col3:
	b = st.button("Enter")

	# Run the extraction only when the Enter button was pressed.
	if b:
	run_function(url, documents)