Spaces:
Runtime error
Runtime error
# Required pip packages:
#   pip install streamlit
#   pip install beautifulsoup4
#   pip install docx2txt
#   pip install pypdf2
#   pip install pdfplumber
import streamlit as st | |
# File Processing pkgs | |
from PIL import Image | |
import requests | |
from bs4 import BeautifulSoup | |
import json | |
import docx2txt | |
# import textract | |
from PyPDF2 import PdfFileReader | |
import pdfplumber | |
# ---- LOAD ASSETS ----
# Image used as the page icon.
img_page_icon = Image.open("web_icon.jpeg")

# Page-wide settings: title, icon and wide layout.
# Find more emojis here: https://www.webfx.com/tools/emoji-cheat-sheet/
st.set_page_config(
    page_title="OdiaGenAI ",
    page_icon=img_page_icon,
    layout="wide",
)
def load_css(file_path):
    """Inject the contents of a CSS file into the page inside a <style> tag.

    Args:
        file_path: Path of the stylesheet to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read with an explicit encoding so the result does not depend on the
    # host's locale default.
    with open(file_path, encoding="utf-8") as css_file:
        css = css_file.read()
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
# Apply the project stylesheet.
load_css('styles.css')

# ---- HEADER SECTION ----
# HTML fragments rendered, in this order, below the greeting.
_header_html = (
    "<h5 class='text'>OdiaGenAI is a collaborative initiative that conducts research on </h5>",
    "<h5>Generative AI and LLM for the Odia Language.</h5>",
    "<h1 class='title'>Odia Generative AI</h1>",
)

with st.container():
    st.subheader("Hi, username :wave:")
    st.write("##")
    for _fragment in _header_html:
        st.markdown(_fragment, unsafe_allow_html=True)

# ---- BODY SECTION ----
with st.container():
    st.subheader("Collecting monolingual data (Odia or any Indic Languages)")
# ----- FUNCTIONS ----
def read_pdf(file):
    """Return the number of pages in a PDF, read with PyPDF2.

    Note that this returns the page count, not the document text.

    Args:
        file: The PDF to open, passed straight to the PyPDF2 reader.

    Returns:
        int: Number of pages in the document.
    """
    try:
        # Prefer the current reader class: the legacy PdfFileReader/numPages
        # API raises DeprecationError from PyPDF2 3.0 onward.
        from PyPDF2 import PdfReader
    except ImportError:
        # Older PyPDF2 releases only ship the legacy API.
        return PdfFileReader(file).numPages
    return len(PdfReader(file).pages)
# Seconds to wait for the remote server; without a timeout requests.get()
# can block the script indefinitely on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 15


def _extract_article_text(html):
    """Return "<headline>\\n\\n<paragraphs>" parsed from an article page.

    Raises:
        ValueError: If the JSON-LD headline or the article body is missing.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # The headline is read from the page's JSON-LD metadata <script> tag.
    heading = soup.find('script', type='application/ld+json')
    if heading is None or not heading.string:
        raise ValueError("JSON-LD metadata not found")
    try:
        headline = json.loads(heading.string)['headline']
    except (ValueError, KeyError, TypeError) as err:
        # ValueError covers malformed JSON; TypeError covers non-dict JSON.
        raise ValueError("headline not found in JSON-LD metadata") from err
    if not isinstance(headline, str):
        raise ValueError("headline in JSON-LD metadata is not text")

    # The article text is the <p> tags inside the 'oi-article-lt' div.
    body = soup.find('div', class_='oi-article-lt')
    if body is None:
        raise ValueError("article body not found")
    paragraphs = '\n'.join(p.get_text(strip=True) for p in body.find_all('p'))
    return headline + '\n\n' + paragraphs


def _extract_pdf_text(document):
    """Return the text of every page of a PDF, each page followed by a newline."""
    with pdfplumber.open(document) as pdf:
        # Guard against extract_text() yielding None for a page without
        # text, which would otherwise abort the whole document.
        return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)


def _extract_document_text(document):
    """Return the text of one uploaded file, dispatching on its MIME type.

    Raises:
        ValueError: If the file is an image, or is not valid UTF-8 text.
    """
    mime_type = document.type or ""
    if mime_type == "text/plain":
        return document.read().decode("utf-8")
    if mime_type == "application/pdf":
        return _extract_pdf_text(document)
    if mime_type.startswith("image/"):
        # Images must not fall through to the docx parser.
        raise ValueError("image files contain no extractable text")
    # Anything else is treated as a docx file, as before.
    return docx2txt.process(document)


def run_function(url, documents):
    """Extract text from a URL or from uploaded documents and display it.

    The URL takes precedence: when it is non-empty the uploaded documents
    are ignored. Results and errors are rendered through Streamlit; nothing
    is returned.

    Args:
        url: Address of the article page to fetch; may be empty.
        documents: Uploaded files (each exposing ``name``, ``type``,
            ``size`` and ``read()``), or None / an empty list.
    """
    if url:
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        except requests.exceptions.RequestException:
            st.error("Error: An exception occurred while fetching content from the URL.")
            return
        if response.status_code != 200:
            st.error("Error: Unable to fetch content from the provided URL.")
            return
        try:
            news = _extract_article_text(response.text)
        except ValueError as err:
            # The page was fetched but does not have the expected markup.
            st.error(f"Error: Unable to extract article content from the provided URL ({err}).")
            return
        st.text_area("Extracted Text", value=news, height=200)
        return

    # Truthiness check: a multi-file uploader yields an empty list (not
    # None) when nothing has been uploaded.
    if documents:
        extracted = []
        for document in documents:
            st.write({
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            })
            try:
                extracted.append(_extract_document_text(document))
            except Exception as err:  # per-file boundary: report and continue
                st.error(f"Error: Could not extract text from {document.name}: {err}")
        st.text_area("Extracted Text", value="".join(extracted), height=200)
        return

    st.error("Error: An error occurred while fetching content .")
# Input row: URL box, file uploader and submit button, side by side.
col1, col2, col3 = st.columns([0.6, 0.2, 0.2])

url = col1.text_input(label='', placeholder="Enter URL")
documents = col2.file_uploader(
    "",
    type=["png", "jpg", "jpeg", "pdf", "txt", "docx"],
    accept_multiple_files=True,
)
b = col3.button("Enter")

# Run the extraction only when the button was pressed.
if b:
    run_function(url, documents)