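# NOTE: the lines below appear to come from a hosted "Spaces" page capture;
# the page reported the status "Runtime error" at capture time.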
import streamlit as st | |
import requests | |
import justext | |
import pdfplumber | |
import docx2txt | |
import json | |
import ast | |
import os | |
import re | |
import openai | |
import json | |
from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate | |
# CSS that centres the sidebar logo and pulls it towards the top.
_SIDEBAR_IMAGE_CSS = """
<style>
[data-testid=stImage]{
display: block;
margin-top: -20px;
margin-left: auto;
margin-right: auto;
}
</style>
"""

# Product description, README pointer and contributor list shown in the sidebar.
_SIDEBAR_ABOUT_HTML = """
<style>
.sidebar-text {
text-align: justify;
font-size: 14px;
padding-bottom: 16px;
}
.list {
font-size: 14px !important;
}
</style>
<div class="sidebar-text">
OliveFarm is a cutting-edge web application crafted by the innovative minds at
<a href="https://www.odiagenai.org/" target="_blank">OdiaGenAI.</a>
It's designed to effortlessly generate LLM (Language Model) instruction sets in Indic languages.
Presently, it offers support for Hindi and Odia, with seamless scalability to incorporate
additional languages on the horizon.
</div>
<div class="sidebar-text">
This versatile tool accommodates inputs from a variety of sources, including (URLs, PDF documents, and plain text).
</div>
<div class="sidebar-text">
Additionally, OliveFarm features a collection of pre-existing templates, powered by ChatGPT,
to streamline the process of generating instruction sets. Experience the future of
Indic language instruction with OliveFarm!
</div>
<div>
Please follow the
<a href="https://github.com/OdiaGenAI/Olive_Farm/blob/main/README.md" target="_blank">GitHub README</a>
instructions to generate the instruction set.
</div>
<div class="sidebar-text">
Contributors:
</div>
<ul>
<li class="list">AR Kamaldeen</li>
<li class="list">SK Shahid</li>
<li class="list">Sambit Sekhar</li>
<li class="list">Parul Agarwal</li>
<li class="list">Dr. Shantipriya Parida</li>
</ul>
"""

# Centred copyright footer.
_SIDEBAR_COPYRIGHT_HTML = """
<style>
.copyright {
text-align: center;
font-size: 14px;
}
</style>
<div class="copyright">
© 2023 Odia Generative AI
</div>
"""

# Set the browser-tab title for the app.
st.set_page_config(page_title="LLM instruction Generator")

# Sidebar content: logo, description, contributors and copyright notice.
# NOTE(review): the copyright block is assumed to belong to the sidebar — the
# original indentation was lost, confirm against the deployed layout.
with st.sidebar:
    st.markdown(_SIDEBAR_IMAGE_CSS, unsafe_allow_html=True)
    st.image(image="olive_farm.png", width=100)
    st.markdown(_SIDEBAR_ABOUT_HTML, unsafe_allow_html=True)
    st.markdown(_SIDEBAR_COPYRIGHT_HTML, unsafe_allow_html=True)
# Odia stop-word list used by jusText when classifying paragraphs
def odia_stoplist():
    """Return the Odia stop words as a frozenset for use as a jusText stoplist.

    Several entries used to carry a stray trailing " |" (most likely a
    sentence terminator left over from machine translation).  A stop word is
    only useful if it can equal a single token of the page text, so those
    entries could never match; they are stored here without the suffix.

    Returns:
        frozenset[str]: the unique stop words (duplicates in the literal
        below are collapsed by the frozenset).
    """
    # NOTE(review): the multi-word entries ("ମୁଁ ନିଜେ", "ଆମେ ନିଜେ",
    # "କରିବା ଉଚିତ") are kept as they were; presumably they cannot match a
    # single token either — confirm before relying on them.
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି", "ଥିଲା", "ଥିଲା", "ହୁଅ", "ହୋଇସାରିଛି", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ",
    ]
    # Defensive normalisation: guarantees no entry keeps a trailing " |" or
    # surrounding whitespace even if the literal above is edited later.
    return frozenset(word.rstrip(" |").strip() for word in odia_stopwords)
# Fetch a web page and keep only its non-boilerplate text (via jusText)
def extract_data_from_url(url, language, timeout=30):
    """Download *url* and return its main text content.

    Args:
        url: address of the page to fetch.
        language: one of "English", "Hindi" or "Odia"; selects the stop-word
            list and the jusText tuning used to classify paragraphs.
        timeout: seconds to wait for the HTTP response, so an unresponsive
            server cannot hang the app indefinitely.

    Returns:
        The non-boilerplate paragraphs, each preceded by a newline, as one
        string; ``None`` when the language is unsupported, the request fails,
        or nothing could be extracted.  Every failure is also reported to the
        user through ``st.error``.
    """
    # Positional jusText tuning shared by the Indic languages.
    # NOTE(review): per jusText's signature these should map to length_low,
    # length_high, stopwords_low, stopwords_high, max_link_density,
    # max_heading_distance and no_headings — confirm against the installed version.
    indic_tuning = (70, 140, 0.0, 0.02, 0.5, 150, False)

    try:
        # Resolve the stop words first: an unsupported language is rejected
        # before any network traffic instead of failing later on an
        # undefined variable.
        if language == "English":
            stoplist, tuning = justext.get_stoplist("English"), ()
        elif language == "Hindi":
            stoplist, tuning = justext.get_stoplist("Hindi"), indic_tuning
        elif language == "Odia":
            stoplist, tuning = odia_stoplist(), indic_tuning
        else:
            st.error(f"Unsupported language: {language!r}")
            return None

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            st.error("Request failed ")
            return None

        paragraphs = justext.justext(response.content, stoplist, *tuning)
        # Each kept paragraph is prefixed with a newline, so the result starts
        # with "\n" exactly as callers have always received it.
        para = "".join(
            "\n" + paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        )
        if not para:
            st.error("Unable to extract data from the URL")
            return None
        return para
    except Exception as err:
        # UI boundary: network and parsing errors are shown to the user rather
        # than crashing the Streamlit script.
        st.error(err)
        return None
# Read the text content of uploaded txt / pdf / docx documents
def extract_data_from_documents(documents):
    """Concatenate the text of every uploaded document.

    Args:
        documents: iterable of uploaded file objects exposing ``name``,
            ``type``, ``size`` and ``read()``, or ``None`` when nothing was
            uploaded.

    Returns:
        The combined text of all readable documents (possibly empty), or
        ``None`` when *documents* is ``None``.  A document that cannot be read
        is reported through ``st.error`` and skipped.
    """
    if documents is None:
        st.error("Error: An error occurred while fetching content.")
        return None

    chunks = []
    for document in documents:
        st.write(
            {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            }
        )

        if document.type == "text/plain":
            try:
                chunks.append(document.read().decode("utf-8"))
            except UnicodeDecodeError as err:
                st.error(f"Unable to decode {document.name} as UTF-8: {err}")

        elif document.type == "application/pdf":
            try:
                with pdfplumber.open(document) as pdf:
                    # extract_text() may yield None for a page without a text
                    # layer; treat that as an empty page instead of failing.
                    pages = [(page.extract_text() or "") + "\n" for page in pdf.pages]
            except Exception as err:
                # The previous handler caught requests' RequestException,
                # which PDF parsing does not raise; report the real failure.
                st.error(f"Unable to read PDF {document.name}: {err}")
            else:
                chunks.append("".join(pages))

        elif (
            document.type
            == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ):
            chunks.append(docx2txt.process(document))

    return "".join(chunks)
# Validate the form inputs: language, prompt type, question count and format
def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
    """Return True only when every compulsory form input holds a truthy value."""
    required_inputs = (language, promptType, noOfQuestions, instructionFormat)
    return all(map(bool, required_inputs))
# Session-state flags that drive the multi-step flow; each one starts as False.
_FLOW_FLAGS = ("extract_button", "submit", "generated", "selected", "answered")

# Session-state entries holding per-run data; removed when the user clears the page.
_RUN_DATA_KEYS = (
    "openAiKey",
    "extractedData",
    "result",
    "selected_items",
    "answered_items",
    "answers",
)

_SUPPORTED_EXTENSIONS = ("pdf", "txt", "docx")

# Extra rules passed to the instruction-generation prompt template.
_INSTRUCTION_RULES = """
- You do not need to provide a response to the generated examples.
- You must return the response in the specified language.
- Each generated instruction can be either an imperative sentence or a question.
"""

# Extra rules passed to the answer-generation prompt template; the <ans> tags
# are what the answers are parsed from afterwards.
_ANSWER_RULES = """
Each generated answer should be within the <ans>Answer</ans> tag and the question should be within the <ques>Question</ques> tag.
"""


def _ask_chat_model(prompt):
    """Send *prompt* as a single system message and return the reply text."""
    # NOTE(review): openai.ChatCompletion is the pre-1.0 openai interface —
    # confirm the pinned openai version still provides it.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": prompt},
        ],
    )
    return response.choices[0].message.content


def _parse_numbered_lines(content):
    """Split a reply into its non-empty lines, dropping any leading "N. "."""
    return [re.sub(r'^\s*\d+\.\s*', '', line) for line in content.split('\n') if line]


def _extract_prompt_data(promptType, language, inputText, url, documents):
    """Return the source text for the chosen prompt type (falsy on failure)."""
    if promptType == "Input text":
        return inputText

    if promptType == "Url":
        extracted = extract_data_from_url(url, language)
        if extracted is None:
            return False
        st.text_area("Extracted Text:", value=extracted, height=200)
        return extracted

    if promptType == "Document":
        if not documents:
            documents = None
        else:
            for doc in documents:
                if doc.name.split(".")[-1].lower() not in _SUPPORTED_EXTENSIONS:
                    st.error("Unsupported file: " + doc.name)
        return extract_data_from_documents(documents)

    return None


def _render_instruction_generation(extractedData, noOfQuestions, instructionFormat, language):
    """Show the "Generate Instructions" button until instructions exist."""
    if "Initial" not in st.session_state:
        st.session_state.Initial = True
    if not st.session_state.Initial:
        return

    openai.api_key = st.session_state["openAiKey"]
    my_prompt_template = InstructionGenerationTemplate()
    try:
        if st.button("Generate Instructions"):
            prompt = my_prompt_template.format(
                num_questions=noOfQuestions,
                context=extractedData,
                instruction_format=instructionFormat,
                lang=language,
                additional_rules=_INSTRUCTION_RULES,
            )
            st.session_state["result"] = _parse_numbered_lines(_ask_chat_model(prompt))
            st.session_state.generated = True
            # Hide the button on later reruns so the instructions stay stable.
            st.session_state.Initial = False
    except Exception as err:
        st.error(err)


def _render_instruction_selection():
    """List the generated instructions as checkboxes and record the selection."""
    st.write("Generated Instructions")
    instructions = st.session_state["result"]
    selected_items = [
        f" {text} "
        for number, text in enumerate(instructions, start=1)
        if st.checkbox(f"Q{number} : {text}")
    ]
    if selected_items:
        st.write("Selected Items:")
        st.write(selected_items)
        st.session_state["selected_items"] = selected_items
        st.session_state.selected = True
    else:
        st.write("No items selected.")


def _render_answer_generation():
    """Show the "Generate Answers" button until answers exist."""
    if "Initial2" not in st.session_state:
        st.session_state.Initial2 = True
    if not st.session_state.Initial2:
        return

    openai.api_key = st.session_state["openAiKey"]
    my_prompt_template2 = AnswerGenerationTemplate()
    question = st.session_state["selected_items"]
    try:
        if st.button("Generate Answers"):
            prompt = my_prompt_template2.format(
                questions=question,
                additional_rules=_ANSWER_RULES,
            )
            content = _ask_chat_model(prompt)
            st.session_state["answers"] = re.findall(r'<ans>(.*?)</ans>', content, re.DOTALL)
            # Remember exactly which questions were answered: pairing is done
            # by position, so a later change of the checkboxes must not shift
            # answers onto other questions.
            st.session_state["answered_items"] = list(question)
            st.session_state.answered = True
            st.session_state.Initial2 = False
    except Exception as e:
        st.error(e)


def _render_results():
    """Show the question/answer pairs and offer them as a JSONL download."""
    if "answered_items" in st.session_state:
        questions = st.session_state["answered_items"]
    else:
        questions = st.session_state["selected_items"]
    answers = st.session_state["answers"]

    st.write("Generated Questions and Answers")
    jsonl_data = [
        {
            "Instruction": question,
            "Output": answers[index] if index < len(answers) else 'No answer found',
            "Input": "",
        }
        for index, question in enumerate(questions)
    ]
    st.write(jsonl_data)
    jsonl_string = '\n'.join(json.dumps(item, ensure_ascii=False) for item in jsonl_data)
    if st.download_button(
        label="Save as jsonl",
        data=jsonl_string,
        file_name="instructions.jsonl",
        mime="application/json",
    ):
        st.success("Successfully saved")


def _reset_session():
    """Return the session to its initial state and rerun the script."""
    for flag in _FLOW_FLAGS:
        st.session_state[flag] = False
    for flag in ("Initial", "Initial2"):
        if flag in st.session_state:
            st.session_state[flag] = True
    for key in _RUN_DATA_KEYS:
        if key in st.session_state:
            del st.session_state[key]
    # Prefer st.rerun when the installed Streamlit has it; fall back to the
    # older experimental name otherwise.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def main():
    """Render the instruction-generation page.

    The page is a sequence of steps, each gated by a session-state flag:
    settings form -> prompt form (text / URL / documents) -> instruction
    generation -> instruction selection -> answer generation -> JSONL export.
    A "Clear" button resets every step.
    """
    # setting up the initial session_states
    for flag in _FLOW_FLAGS:
        if flag not in st.session_state:
            st.session_state[flag] = False

    st.subheader("LLM Instructions")

    # form to get the inputs
    with st.form(key="form1"):
        st.write("#")
        language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))
        promptType = st.selectbox(
            "Select the Prompt type", ("", "Input text", "Url", "Document")
        )
        noOfQuestions = st.number_input(
            "Number of questions to generate:", min_value=1, max_value=20, value=10
        )
        instructionFormat = st.selectbox(
            "Format of instruction:", ("Imperative sentence", "Question")
        )
        openAiKey = st.text_input(label="Input the openai key", type="password")
        st.session_state["openAiKey"] = openAiKey
        st.write("##")
        if st.form_submit_button():
            st.session_state.submit = True

    if st.session_state.submit:
        inputs_valid = valid_drop_down(
            language, promptType, noOfQuestions, instructionFormat
        )
        # Bound up front so the extraction step never meets an undefined name
        # when a prompt widget was not rendered in this run.
        inputText = url = documents = None

        # extends the prompt form to extract the data
        with st.expander(label="prompt"):
            with st.form(key="form2"):
                if inputs_valid:
                    if promptType == "Input text":
                        inputText = st.text_area(
                            label="For Instructions",
                            placeholder="Please enter your text here",
                        )
                    elif promptType == "Url":
                        url = st.text_input(
                            label="For URL", placeholder="Please enter your text here"
                        )
                    elif promptType == "Document":
                        documents = st.file_uploader(
                            label="For Documents ( pdf / txt / docx )",
                            type=["pdf", "txt", "docx"],
                            accept_multiple_files=True,
                        )
                else:
                    st.warning("Please select a language and a prompt type first.")
                # The submit button is always rendered so the form is never
                # left without one; it only takes effect for valid inputs.
                if st.form_submit_button() and inputs_valid:
                    st.session_state.extract_button = True

        # extracting data (skipped while the settings are invalid: the flag
        # may still be set from an earlier, valid submission)
        if st.session_state.extract_button and inputs_valid:
            extractedData = _extract_prompt_data(
                promptType, language, inputText, url, documents
            )

            # if the values are extracted, run the prompts
            if extractedData:
                st.session_state["extractedData"] = extractedData
                _render_instruction_generation(
                    extractedData, noOfQuestions, instructionFormat, language
                )

                if st.session_state.generated:
                    _render_instruction_selection()

                    if st.session_state.selected:
                        _render_answer_generation()

                        if st.session_state.answered:
                            _render_results()

    # NOTE(review): the Clear button is placed at the top level of the page —
    # the original nesting was lost, confirm the intended position.
    if st.button("Clear"):
        _reset_session()
# Start the app only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()