Spaces:

OdiaGenAI
/

Olive_Farm

Running

File size: 22,614 Bytes

import streamlit as st
import requests
import justext
import pdfplumber
import docx2txt
import json
import ast
import os
import re
import openai
import json

from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate


# Set the page title for the app.
st.set_page_config(page_title="LLM instruction Generator")


# Sidebar content: logo, project description, contributor list and copyright.
with st.sidebar:
    # Inline CSS targeting the image element: block display, auto left/right
    # margins and a -20px top margin.
    st.markdown("""
    <style>
    [data-testid=stImage]{
        display: block;
        margin-top: -20px;
        margin-left: auto;
        margin-right: auto;  
    }            
    </style>
    """, unsafe_allow_html=True)
    # Logo file is referenced by a relative path ("olive_farm.png").
    st.image(image="olive_farm.png", width=100)

    # Project description, README link and contributor list rendered as raw
    # HTML (unsafe_allow_html=True) together with the CSS classes it uses.
    st.markdown("""
        <style>
            .sidebar-text {
                text-align: justify;
                font-size: 14px;
                padding-bottom: 16px;
            }
            .list {
                font-size: 14px !important;
            }
           
        </style>
        <div class="sidebar-text">
            OliveFarm is a cutting-edge web application crafted by the innovative minds at 
            <a href="https://www.odiagenai.org/" target="_blank">OdiaGenAI.</a>
            It's designed to effortlessly generate LLM (Language Model) instruction sets in Indic languages. 
            Presently, it offers support for Hindi and Odia, with seamless scalability to incorporate 
            additional languages on the horizon.
        </div>
        <div class="sidebar-text">
            This versatile tool accommodates inputs from a variety of sources, including (URLs, PDF documents, and plain text).         
        </div>
        <div class="sidebar-text">
            Additionally, OliveFarm features a collection of pre-existing templates, powered by ChatGPT, 
            to streamline the process of generating instruction sets. Experience the future of 
            Indic language instruction with OliveFarm!
        </div>
        <div>
            Please follow the  
            <a href="https://github.com/OdiaGenAI/Olive_Farm/blob/main/README.md" target="_blank">GitHub README</a>
            instructions to generate the instruction set.
        </div>
        <div class="sidebar-text">
            Contributors:
        </div>
        <ul>
            <li class="list">AR Kamaldeen</li>
            <li class="list">SK Shahid</li>
            <li class="list">Sambit Sekhar</li>
            <li class="list">Parul Agarwal</li>
            <li class="list">Dr. Shantipriya Parida</li>
        </ul>
    """, unsafe_allow_html=True)

    # Centered copyright footer at the bottom of the sidebar.
    st.markdown(
    """
        <style>
            .copyright {
                text-align: center;
                font-size: 14px;
            }
        </style>
        <div class="copyright">
            © 2023 Odia Generative AI
        </div>
    """
    , unsafe_allow_html=True)

# function for the odia stoplists justext
def odia_stoplist():
    """Return the Odia stopword set passed to jusText as its stoplist.

    jusText counts stopwords by testing each whitespace-separated word of a
    paragraph for membership in the stoplist, so an entry that contains a
    space or a trailing " |" can never match.  The raw entries below are
    therefore normalised into individual tokens: danda-like separators
    ("|" and "।") are dropped and multi-word phrases are split into words.

    Returns:
        frozenset[str]: unique, non-empty Odia stopword tokens, none of which
        contains whitespace or a "|" character.
    """
    raw_entries = (
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ",
    )

    tokens = set()
    for entry in raw_entries:
        # Turn sentence-end markers into separators, then split phrases into
        # the single words jusText will actually compare against.
        cleaned = entry.replace("|", " ").replace("।", " ")
        tokens.update(cleaned.split())
    return frozenset(tokens)


# function to extract data from url using justext
def extract_data_from_url(url, language, timeout=30):
    """Download ``url`` and return its non-boilerplate text.

    The page is fetched with ``requests`` and passed to jusText with a
    stoplist chosen by ``language``.  Every paragraph jusText does not flag
    as boilerplate is kept; each kept paragraph is prefixed with a newline.

    Args:
        url: Address of the page to fetch.
        language: One of "English", "Hindi" or "Odia".
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot block the app indefinitely.

    Returns:
        The extracted text as a string, or ``None`` when the request fails,
        the language is unsupported, nothing could be extracted, or any
        exception is raised.  In every ``None`` case an error is shown with
        ``st.error``.
    """
    # Tuning values passed positionally to justext.justext for Hindi and Odia.
    # NOTE(review): presumably length/stopword-density thresholds, link
    # density, heading distance and the no_headings flag -- confirm against
    # the jusText signature.
    tuned_args = (70, 140, 0.0, 0.02, 0.5, 150, False)

    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            st.error("Request failed ")
            return None

        page = response.content
        if language == "English":
            paragraphs = justext.justext(page, justext.get_stoplist("English"))
        elif language == "Hindi":
            paragraphs = justext.justext(
                page, justext.get_stoplist("Hindi"), *tuned_args
            )
        elif language == "Odia":
            paragraphs = justext.justext(page, odia_stoplist(), *tuned_args)
        else:
            # Report the real problem instead of failing later on an
            # unbound ``paragraphs`` variable.
            st.error(f"Unsupported language: {language}")
            return None

        para = "".join(
            "\n" + paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        )
        if para == "":
            st.error("Unable to extract data from the URL")
            return None
        return para
    except Exception as err:
        # UI boundary: surface network/parsing failures to the user rather
        # than crashing the app.
        st.error(err)
        return None
        
    


# function to extract data from documents
def extract_data_from_documents(documents):
    """Concatenate the text of uploaded txt, pdf and docx documents.

    For each document its name, type and size are written to the page, then
    its text is extracted according to ``document.type``.  Documents of any
    other type contribute nothing.

    Args:
        documents: Iterable of uploaded file objects exposing ``name``,
            ``type``, ``size`` and ``read()``, or ``None``.

    Returns:
        All extracted text joined in upload order (possibly ""), or ``None``
        after showing an error when ``documents`` is ``None``.
    """
    if documents is None:
        st.error("Error: An error occurred while fetching content.")
        return None

    docx_type = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    pieces = []
    for document in documents:
        st.write(
            {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            }
        )

        # Extract content from the txt file
        if document.type == "text/plain":
            # read() yields bytes; decode them as UTF-8
            pieces.append(str(document.read(), "utf-8"))

        # Extract content from the pdf file
        elif document.type == "application/pdf":
            try:
                page_texts = []
                with pdfplumber.open(document) as pdf:
                    for page in pdf.pages:
                        # extract_text() may return None for a page without
                        # extractable text in some pdfplumber versions;
                        # treat that as an empty page.
                        page_texts.append((page.extract_text() or "") + "\n")
                # Only keep the PDF's text once the whole file parsed.
                pieces.extend(page_texts)
            except Exception as err:
                # PDF parsing does not raise requests exceptions, so catch
                # broadly here and tell the user which file failed.
                st.error(f"Unable to read PDF '{document.name}': {err}")

        # Extract content from the docx file
        elif document.type == docx_type:
            pieces.append(docx2txt.process(document))

    return "".join(pieces)


# function for the keyboard



# Validate the required form inputs (language, prompt type, count, format)
def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
    """Return True only when every required form value is truthy.

    An empty string, ``None`` or ``0`` for any of the four inputs makes the
    result False.
    """
    required_inputs = (language, promptType, noOfQuestions, instructionFormat)
    # Evaluate the truthiness of every input before combining them.
    flags = [bool(value) for value in required_inputs]
    return all(flags)


def main():
    """Render the page and drive the instruction / answer generation flow.

    The script is re-executed on every interaction, so progress is tracked
    with boolean flags in ``st.session_state``:

    1. form1 collects language, prompt type, question count, instruction
       format and the OpenAI key (``submit``).
    2. form2 collects the text, URL or documents (``extract_button``).
    3. The source text is extracted and "Generate Instructions" sends it to
       the chat model; the numbered reply is split into instructions
       (``generated``).
    4. Ticked instructions are remembered (``selected``) and
       "Generate Answers" asks the model for answers wrapped in
       ``<ans>`` tags (``answered``).
    5. Instruction/answer pairs are shown and offered as a JSONL download.

    "Clear" resets all flags, drops the stored values and reruns the script.
    """
    # setting up the initial session_states
    for flag in ("extract_button", "submit", "generated", "selected", "answered"):
        if flag not in st.session_state:
            st.session_state[flag] = False

    # These names are only assigned when the matching widget is rendered or a
    # branch succeeds; give them safe defaults so later reads cannot hit an
    # unbound local (e.g. "Document" submitted without any file).
    inputText = ""
    url = ""
    documents = None
    extractedData = None

    st.subheader("LLM Instructions")

    # form to get the inputs
    with st.form(key="form1"):
        st.write("#")

        # dropdown for language
        language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))

        # dropdown for prompt type
        promptType = st.selectbox(
            "Select the Prompt type", ("", "Input text", "Url", "Document")
        )
        # inputs for number
        noOfQuestions = st.number_input(
            "Number of questions to generate:", min_value=1, max_value=20, value=10
        )

        # dropdown for instruction format
        instructionFormat = st.selectbox(
            "Format of instruction:", ("Imperative sentence", "Question")
        )

        # input text for openAiKey; kept in session state for the later steps
        openAiKey = st.text_input(label="Input the openai key", type="password")
        st.session_state["openAiKey"] = openAiKey

        st.write("##")

        # form submit button and setting up the session_state
        if st.form_submit_button():
            st.session_state.submit = True

    if st.session_state.submit:
        # extends the prompt form to extract the data
        with st.expander(label="prompt"):
            with st.form(key="form2"):
                # only offer the source input once the drop-downs are valid
                if valid_drop_down(
                    language, promptType, noOfQuestions, instructionFormat
                ):
                    if promptType == "Input text":
                        inputText = st.text_area(
                            label="For Instructions",
                            placeholder="Please enter your text here",
                        )

                    elif promptType == "Url":
                        url = st.text_input(
                            label="For URL", placeholder="Please enter your text here"
                        )
                    elif promptType == "Document":
                        documents = st.file_uploader(
                            label="For Documents ( pdf / txt / docx )",
                            type=["pdf", "txt", "docx"],
                            accept_multiple_files=True,
                        )

                    if st.form_submit_button():
                        st.session_state.extract_button = True

    # extracting data
    if st.session_state.extract_button:
        if promptType == "Input text":
            extractedData = inputText

        elif promptType == "Url":
            extractedURLData = extract_data_from_url(url, language)
            if extractedURLData is not None:
                extractedData = extractedURLData
                st.text_area("Extracted Text:", value=extractedData, height=200)

        elif promptType == "Document":
            if not documents:
                # Nothing uploaded: report it instead of falling through
                # with no extracted data defined.
                st.error("Please upload at least one document.")
            else:
                for doc in documents:
                    if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                        # if documents is not the relevant type
                        st.error("Unsupported file: " + doc.name)

                extractedData = extract_data_from_documents(documents)

        # if the values are extracted running the custom prompt by creating an instance
        if extractedData:

            # -----------------------------    RUNNING THE PROMPT   -----------------------------
            st.session_state["extractedData"] = extractedData

            # "Initial" stays True until instructions have been requested once
            if "Initial" not in st.session_state:
                st.session_state.Initial = True

            if st.session_state.Initial:
                openai.api_key = st.session_state["openAiKey"]
                my_prompt_template = InstructionGenerationTemplate()

                # providing the rules for the instructions to be generated
                additional_rules = """
                - You do not need to provide a response to the generated examples.
                - You must return the response in the specified language.
                - Each generated instruction can be either an imperative sentence or a question.
                """
                try:
                    if st.button("Generate Instructions"):
                        prompt = my_prompt_template.format(
                            num_questions=noOfQuestions,
                            context=extractedData,
                            instruction_format=instructionFormat,
                            lang=language,
                            additional_rules=additional_rules,
                        )
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "system", "content": prompt},
                            ],
                        )

                        if "result" not in st.session_state:
                            content = response.choices[0].message.content
                            # one instruction per non-empty line, with any
                            # leading "N." numbering stripped
                            responses_list = [
                                re.sub(r'^\s*\d+\.\s*', '', resp)
                                for resp in content.split('\n')
                                if resp
                            ]
                            st.session_state["result"] = responses_list
                            st.session_state.generated = True
                        st.session_state.Initial = False
                except Exception as err:
                    # UI boundary: show API/template failures to the user
                    st.error(err)

            if st.session_state.generated:
                # displaying the generated instructions
                st.write("Generated Instructions")
                result = st.session_state["result"]
                # one checkbox per instruction; ticked ones are collected
                selected_items = [
                    f" {value} "
                    for key, value in enumerate(result, start=1)
                    if st.checkbox(f"Q{key} : {value}")
                ]
                # Display the selected items as a list
                if selected_items:
                    st.write("Selected Items:")
                    st.write(selected_items)
                    st.session_state["selected_items"] = selected_items
                    st.session_state.selected = True
                else:
                    st.write("No items selected.")

            # -----------------------------    RUNNING THE PROMPT FOR ANSWER GENERATION  -----------------------------

            if st.session_state.selected:

                # "Initial2" stays True until answers have been requested once
                if "Initial2" not in st.session_state:
                    st.session_state.Initial2 = True

                if st.session_state.Initial2:
                    openai.api_key = st.session_state["openAiKey"]
                    my_prompt_template2 = AnswerGenerationTemplate()

                    # providing the rules for the answers to be generated
                    additional_rules = """
                        Each generated answer should be within the <ans>Answer</ans> tag and the question should be within the <ques>Question</ques> tag.
                    """

                    question = st.session_state["selected_items"]
                    try:
                        if st.button("Generate Answers"):
                            prompt = my_prompt_template2.format(
                                questions=question,
                                additional_rules=additional_rules,
                            )
                            response = openai.ChatCompletion.create(
                                model="gpt-3.5-turbo",
                                messages=[
                                    {"role": "system", "content": prompt},
                                ],
                            )

                            if "answers" not in st.session_state:
                                content = response.choices[0].message.content
                                # answers are whatever sits between <ans> tags;
                                # DOTALL lets an answer span several lines
                                responses_list = re.findall(
                                    r'<ans>(.*?)</ans>', content, re.DOTALL
                                )
                                st.session_state["answers"] = responses_list
                                st.session_state.answered = True
                            st.session_state.Initial2 = False
                    except Exception as e:
                        # UI boundary: show API/template failures to the user
                        st.error(e)

                if st.session_state.answered:
                    # displaying the generated Answers
                    questions = st.session_state["selected_items"]
                    answers = st.session_state["answers"]
                    answers_dict = {i + 1: value for i, value in enumerate(answers)}

                    st.write("Generated Questions and Answers")
                    # pair the n-th selected instruction with the n-th answer;
                    # a missing answer becomes a placeholder
                    jsonl_data = [
                        {
                            "Instruction": question,
                            "Output": answers_dict.get(i, 'No answer found'),
                            "Input": "",
                        }
                        for i, question in enumerate(questions, start=1)
                    ]

                    st.write(jsonl_data)
                    # one JSON object per line; keep Indic text unescaped
                    jsonl_string = '\n'.join(
                        json.dumps(item, ensure_ascii=False) for item in jsonl_data
                    )

                    if st.download_button(label="Save as jsonl", data=jsonl_string, mime="application/json"):
                        st.success("Successfully saved")

        if st.button("Clear"):
            # reset every progress flag
            for flag in ("extract_button", "submit", "generated", "selected", "answered"):
                st.session_state[flag] = False

            # re-arm the two one-shot generation steps if they were created
            if "Initial" in st.session_state:
                st.session_state.Initial = True
            if "Initial2" in st.session_state:
                st.session_state.Initial2 = True

            # drop every stored value, including the API key
            for key in ("openAiKey", "extractedData", "result", "selected_items", "answers"):
                if key in st.session_state:
                    del st.session_state[key]
            st.experimental_rerun()


# Call main() only when this file is executed as the main module, not when it
# is imported.
if __name__ == "__main__":
    main()