# Olive_Farm / app.py
# sam2ai's picture
# Synced repo using 'sync_with_huggingface' Github Action
# 363527f
import streamlit as st
import requests
import justext
import pdfplumber
import docx2txt
import json
import ast
import os
import re
import openai
import json
from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate
# Page configuration; issued before any other Streamlit element is drawn.
st.set_page_config(page_title="LLM instruction Generator")

# CSS rule applied to the sidebar logo image.
_SIDEBAR_LOGO_STYLE = """
<style>
[data-testid=stImage]{
display: block;
margin-top: -20px;
margin-left: auto;
margin-right: auto;
}
</style>
"""

# Product description, README pointer and contributor list shown in the sidebar.
_SIDEBAR_ABOUT_HTML = """
<style>
.sidebar-text {
text-align: justify;
font-size: 14px;
padding-bottom: 16px;
}
.list {
font-size: 14px !important;
}
</style>
<div class="sidebar-text">
OliveFarm is a cutting-edge web application crafted by the innovative minds at
<a href="https://www.odiagenai.org/" target="_blank">OdiaGenAI.</a>
It's designed to effortlessly generate LLM (Language Model) instruction sets in Indic languages.
Presently, it offers support for Hindi and Odia, with seamless scalability to incorporate
additional languages on the horizon.
</div>
<div class="sidebar-text">
This versatile tool accommodates inputs from a variety of sources, including (URLs, PDF documents, and plain text).
</div>
<div class="sidebar-text">
Additionally, OliveFarm features a collection of pre-existing templates, powered by ChatGPT,
to streamline the process of generating instruction sets. Experience the future of
Indic language instruction with OliveFarm!
</div>
<div>
Please follow the
<a href="https://github.com/OdiaGenAI/Olive_Farm/blob/main/README.md" target="_blank">GitHub README</a>
instructions to generate the instruction set.
</div>
<div class="sidebar-text">
Contributors:
</div>
<ul>
<li class="list">AR Kamaldeen</li>
<li class="list">SK Shahid</li>
<li class="list">Sambit Sekhar</li>
<li class="list">Parul Agarwal</li>
<li class="list">Dr. Shantipriya Parida</li>
</ul>
"""

# Copyright footer rendered at the bottom of the sidebar.
_SIDEBAR_COPYRIGHT_HTML = """
<style>
.copyright {
text-align: center;
font-size: 14px;
}
</style>
<div class="copyright">
© 2023 Odia Generative AI
</div>
"""

# sidebar content: logo styling, logo, description, then the footer
with st.sidebar:
    st.markdown(_SIDEBAR_LOGO_STYLE, unsafe_allow_html=True)
    st.image(image="olive_farm.png", width=100)
    st.markdown(_SIDEBAR_ABOUT_HTML, unsafe_allow_html=True)
    st.markdown(_SIDEBAR_COPYRIGHT_HTML, unsafe_allow_html=True)
# function for the odia stoplists justext
def odia_stoplist():
    """Return the Odia stop-word list used by jusText as a ``frozenset``.

    The raw list below contains many entries with a trailing ``" |"``
    (it looks like a sentence terminator left over from however the list
    was produced).  jusText compares individual whitespace-separated tokens
    against the stoplist, so an entry such as ``"ନୁହେଁ |"`` could never match
    and the word was silently not treated as a stop word.  The entries are
    therefore normalised (pipe removed, surrounding whitespace stripped)
    before the set is built.

    Returns:
        frozenset[str]: the de-duplicated, normalised stop words.
    """
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ",
    ]
    # Drop the stray "|" artefact and surrounding whitespace; skip anything
    # that ends up empty so the set never contains "".
    cleaned = (word.replace("|", "").strip() for word in odia_stopwords)
    return frozenset(word for word in cleaned if word)
# function to extract data from url using justext
def extract_data_from_url(url, language):
    """Download *url* and return its non-boilerplate text, or ``None`` on failure.

    Args:
        url: address of the page to fetch.
        language: one of ``"English"``, ``"Hindi"`` or ``"Odia"``; selects the
            jusText stoplist and thresholds.

    Returns:
        The extracted paragraphs as one string, each paragraph preceded by a
        newline, or ``None`` when the request fails, the language is not
        supported, or no content paragraph is found.  Every failure is
        reported to the user through ``st.error``; nothing is raised.
    """
    try:
        # A timeout keeps a stalled server from freezing the Streamlit script.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            st.error("Request failed ")
            return None

        page = response.content
        if language == "English":
            paragraphs = justext.justext(page, justext.get_stoplist("English"))
        elif language == "Hindi":
            # Positional tuning values passed straight to jusText (presumably
            # length_low, length_high, stopwords_low, stopwords_high,
            # max_link_density, max_heading_distance, no_headings -- confirm
            # against the installed jusText version).
            paragraphs = justext.justext(
                page, justext.get_stoplist("Hindi"), 70, 140, 0.0, 0.02, 0.5, 150, False
            )
        elif language == "Odia":
            paragraphs = justext.justext(
                page, odia_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False
            )
        else:
            # Previously this fell through and surfaced as an unbound-variable
            # error message; report the real cause instead.
            st.error(f"Unsupported language: {language!r}")
            return None

        # Keep only the paragraphs jusText classified as real content.
        para = "".join(
            "\n" + paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        )
        if para == "":
            st.error("Unable to extract data from the URL")
            return None
        return para
    except Exception as err:
        # UI boundary: show any network/parsing problem instead of crashing.
        st.error(err)
        return None
# function to extract data from documents
def extract_data_from_documents(documents):
    """Concatenate the text of the uploaded txt / pdf / docx files.

    Args:
        documents: iterable of uploaded-file objects exposing ``name``,
            ``type``, ``size`` and a file-like interface, or ``None``.

    Returns:
        The combined text of all recognised documents (``""`` when the
        iterable is empty), or ``None`` after showing an error when
        *documents* is ``None``.  Files with an unrecognised MIME type are
        listed but contribute no text.
    """
    if documents is None:
        st.error("Error: An error occurred while fetching content.")
        return None

    parts = []
    for document in documents:
        document_details = {
            "filename": document.name,
            "filetype": document.type,
            "filesize": document.size,
        }
        st.write(document_details)

        # Extract content from the txt file
        if document.type == "text/plain":
            # The upload is read as bytes and decoded as UTF-8.
            parts.append(str(document.read(), "utf-8"))

        # Extract content from the pdf file
        elif document.type == "application/pdf":
            try:
                with pdfplumber.open(document) as pdf:
                    # A page may yield no text (None on some pdfplumber
                    # versions); treat that as an empty page, not a crash.
                    all_text = "".join(
                        (page.extract_text() or "") + "\n" for page in pdf.pages
                    )
                parts.append(all_text)
            except Exception as err:
                # The original handler caught requests' RequestException,
                # which PDF parsing does not raise, so a broken PDF took the
                # whole app down.  Report it and continue with other files.
                st.error(f"Unable to read PDF '{document.name}': {err}")

        # Extract content from the docx file
        elif (
            document.type
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ):
            parts.append(docx2txt.process(document))

    # return extract data
    return "".join(parts)
# Validate the dropdown / numeric inputs of the first form
def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
    """Return True only when every compulsory input has a truthy value.

    An empty selection (``""``), ``None`` or ``0`` for any of the four
    arguments makes the result False.
    """
    required_inputs = (language, promptType, noOfQuestions, instructionFormat)
    return all(required_inputs)
# Boolean session-state flags that record how far the workflow has progressed.
_STAGE_FLAGS = ("extract_button", "submit", "generated", "selected", "answered")
# Per-run data kept in session state; removed by the "Clear" button.
_RUN_DATA_KEYS = ("openAiKey", "extractedData", "result", "selected_items", "answers")


def _init_session_state():
    """Create every stage flag (as False) the first time a session runs."""
    for flag in _STAGE_FLAGS:
        if flag not in st.session_state:
            st.session_state[flag] = False


def _reset_session_state():
    """Put the session back to its initial state (used by the "Clear" button)."""
    for flag in _STAGE_FLAGS:
        st.session_state[flag] = False
    # Re-arm the one-shot "Generate ..." buttons if they were ever created.
    for gate in ("Initial", "Initial2"):
        if gate in st.session_state:
            st.session_state[gate] = True
    for key in _RUN_DATA_KEYS:
        if key in st.session_state:
            del st.session_state[key]


def _parse_instruction_lines(content):
    """Split a model reply into instructions, dropping blank lines and "1." numbering."""
    return [
        re.sub(r'^\s*\d+\.\s*', '', line)
        for line in content.split('\n')
        if line.strip()
    ]


def _build_jsonl_records(questions, answers):
    """Pair each selected question with the answer at the same position."""
    return [
        {
            "Instruction": question,
            "Output": answers[index] if index < len(answers) else 'No answer found',
            "Input": "",
        }
        for index, question in enumerate(questions)
    ]


def main():
    """Render the page body and drive the instruction / answer workflow.

    Streamlit re-executes this function on every interaction, so progress is
    tracked with flags in ``st.session_state``:

    1. form 1 collects language, prompt type, count, format and the OpenAI key;
    2. form 2 collects the source (text, URL or documents) and triggers
       extraction;
    3. "Generate Instructions" asks the chat model for instructions;
    4. the user ticks the instructions to keep;
    5. "Generate Answers" asks the model for answers, which are shown and
       offered as a JSONL download.

    "Clear" resets everything and reruns the script.

    NOTE(review): the nesting of the later stages was reconstructed from a
    copy of this file that had lost its indentation -- confirm it matches the
    intended layout.
    """
    # setting up the initial session_states
    _init_session_state()

    st.subheader("LLM Instructions")

    # form to get the inputs
    with st.form(key="form1"):
        st.write("#")
        # dropdown for language
        language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))
        # dropdown for prompt type
        promptType = st.selectbox(
            "Select the Prompt type", ("", "Input text", "Url", "Document")
        )
        # inputs for number
        noOfQuestions = st.number_input(
            "Number of questions to generate:", min_value=1, max_value=20, value=10
        )
        # dropdown for the instruction format
        instructionFormat = st.selectbox(
            "Format of instruction:", ("Imperative sentence", "Question")
        )
        # input text for openAiKey
        openAiKey = st.text_input(label="Input the openai key", type="password")
        st.session_state["openAiKey"] = openAiKey
        st.write("##")
        # form submit button and setting up the session_state
        if st.form_submit_button():
            st.session_state.submit = True

    # Defaults so the extraction step below never reads an unbound name when
    # a widget was not rendered (incomplete dropdowns) or no prompt type matched.
    inputText = ""
    url = ""
    documents = None
    extractedData = None

    if st.session_state.submit:
        inputs_valid = valid_drop_down(
            language, promptType, noOfQuestions, instructionFormat
        )
        # extends the prompt form to extract the data
        with st.expander(label="prompt"):
            with st.form(key="form2"):
                if inputs_valid:
                    if promptType == "Input text":
                        inputText = st.text_area(
                            label="For Instructions",
                            placeholder="Please enter your text here",
                        )
                    elif promptType == "Url":
                        url = st.text_input(
                            label="For URL", placeholder="Please enter your text here"
                        )
                    elif promptType == "Document":
                        documents = st.file_uploader(
                            label="For Documents ( pdf / txt / docx )",
                            type=["pdf", "txt", "docx"],
                            accept_multiple_files=True,
                        )
                else:
                    st.warning("Please select a language and a prompt type above.")
                # The button is always rendered (a Streamlit form needs one),
                # but only starts extraction when the dropdowns are complete.
                if st.form_submit_button() and inputs_valid:
                    st.session_state.extract_button = True

        # extracting data
        if st.session_state.extract_button:
            if promptType == "Input text":
                extractedData = inputText
            elif promptType == "Url":
                extractedURLData = extract_data_from_url(url, language)
                if extractedURLData is not None:
                    extractedData = extractedURLData
                    st.text_area("Extracted Text:", value=extractedData, height=200)
            elif promptType == "Document":
                if not documents:
                    # An empty upload list is passed on as None so the
                    # extractor reports the problem to the user.
                    documents = None
                else:
                    for doc in documents:
                        if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                            # if documents is not the relevant type
                            st.error("Unsupported file: " + doc.name)
                extractedData = extract_data_from_documents(documents)

        # if the values are extracted running the custom prompt by creating an instance
        if extractedData:
            # ----------------------------- RUNNING THE PROMPT -----------------------------
            st.session_state["extractedData"] = extractedData

            # "Initial" gates the one-shot "Generate Instructions" button.
            if "Initial" not in st.session_state:
                st.session_state.Initial = True

            if st.session_state.Initial:
                openai.api_key = st.session_state["openAiKey"]
                my_prompt_template = InstructionGenerationTemplate()
                # providing the rules for the instructions to be generated
                additional_rules = (
                    "\n"
                    "- You do not need to provide a response to the generated examples.\n"
                    "- You must return the response in the specified language.\n"
                    "- Each generated instruction can be either an imperative sentence or a question.\n"
                )
                try:
                    if st.button("Generate Instructions"):
                        prompt = my_prompt_template.format(
                            num_questions=noOfQuestions,
                            context=extractedData,
                            instruction_format=instructionFormat,
                            lang=language,
                            additional_rules=additional_rules,
                        )
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "system", "content": prompt},
                            ],
                        )
                        if "result" not in st.session_state:
                            content = response.choices[0].message.content
                            st.session_state["result"] = _parse_instruction_lines(content)
                        st.session_state.generated = True
                        st.session_state.Initial = False
                except Exception as err:
                    # UI boundary: show API / template errors instead of crashing.
                    st.error(err)

            if st.session_state.generated:
                # displaying the generated instructions
                st.write("Generated Instructions")
                result = st.session_state["result"]
                # An explicit key per checkbox keeps two identical generated
                # lines from producing clashing widgets.
                selected_items = [
                    f" {value} "
                    for number, value in enumerate(result, start=1)
                    if st.checkbox(f"Q{number} : {value}", key=f"instruction_{number}")
                ]
                # Display the selected items as a list
                if selected_items:
                    st.write("Selected Items:")
                    st.write(selected_items)
                    st.session_state["selected_items"] = selected_items
                    st.session_state.selected = True
                else:
                    st.write("No items selected.")

            # ----------------------------- RUNNING THE PROMPT FOR ANSWER GENERATION -----------------------------
            if st.session_state.selected:
                # "Initial2" gates the one-shot "Generate Answers" button.
                if "Initial2" not in st.session_state:
                    st.session_state.Initial2 = True

                if st.session_state.Initial2:
                    openai.api_key = st.session_state["openAiKey"]
                    my_prompt_template2 = AnswerGenerationTemplate()
                    # providing the rules for the answers to be generated
                    additional_rules = (
                        "\n"
                        "Each generated answer should be within the <ans>Answer</ans> tag "
                        "and the question should be within the <ques>Question</ques> tag.\n"
                    )
                    question = st.session_state["selected_items"]
                    try:
                        if st.button("Generate Answers"):
                            prompt = my_prompt_template2.format(
                                questions=question,
                                additional_rules=additional_rules,
                            )
                            response = openai.ChatCompletion.create(
                                model="gpt-3.5-turbo",
                                messages=[
                                    {"role": "system", "content": prompt},
                                ],
                            )
                            if "answers" not in st.session_state:
                                content = response.choices[0].message.content
                                # Answers are expected inside <ans>...</ans> tags.
                                st.session_state["answers"] = re.findall(
                                    r'<ans>(.*?)</ans>', content, re.DOTALL
                                )
                            st.session_state.answered = True
                            st.session_state.Initial2 = False
                    except Exception as e:
                        st.error(e)

            if st.session_state.answered:
                # displaying the generated Answers
                questions = st.session_state["selected_items"]
                answers = st.session_state["answers"]
                st.write("Generated Questions and Answers")
                jsonl_data = _build_jsonl_records(questions, answers)
                st.write(jsonl_data)
                # One JSON object per line; ensure_ascii=False keeps Indic text readable.
                jsonl_string = '\n'.join(
                    json.dumps(item, ensure_ascii=False) for item in jsonl_data
                )
                if st.download_button(
                    label="Save as jsonl", data=jsonl_string, mime="application/json"
                ):
                    st.success("Successfully saved")

    if st.button("Clear"):
        _reset_session_state()
        # st.experimental_rerun is deprecated in newer Streamlit releases;
        # prefer st.rerun when the installed version provides it.
        rerun = getattr(st, "rerun", None) or st.experimental_rerun
        rerun()
# Build the page only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()