File size: 22,614 Bytes
11fa0f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00fc1ec
 
 
 
 
 
83e653d
00fc1ec
 
 
 
 
 
 
 
83e653d
 
 
 
 
 
 
 
 
 
 
 
eaeff06
 
83e653d
 
 
 
 
 
 
 
 
 
 
 
eaeff06
 
 
 
 
83e653d
 
 
 
 
 
 
363527f
83e653d
 
00fc1ec
83e653d
00fc1ec
 
83e653d
 
 
 
 
 
 
 
 
00fc1ec
 
11fa0f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99defd8
11fa0f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83e653d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11fa0f1
 
83e653d
11fa0f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f964c86
11fa0f1
 
 
83e653d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f964c86
83e653d
 
 
 
f964c86
 
 
 
83e653d
 
 
 
 
11fa0f1
 
 
 
 
 
 
 
 
 
 
 
 
83e653d
11fa0f1
 
 
 
83e653d
11fa0f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83e653d
11fa0f1
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
import streamlit as st
import requests
import justext
import pdfplumber
import docx2txt
import json
import ast
import os
import re
import openai
import json

from custom_prompt_template import InstructionGenerationTemplate, AnswerGenerationTemplate


# Page-level configuration; sets the page title shown for the app.
st.set_page_config(page_title="LLM instruction Generator")


# sidebar content
with st.sidebar:
    # Injected CSS: render the sidebar image as a block, centre it with auto
    # side margins and pull it up by 20px.
    st.markdown("""
    <style>
    [data-testid=stImage]{
        display: block;
        margin-top: -20px;
        margin-left: auto;
        margin-right: auto;  
    }            
    </style>
    """, unsafe_allow_html=True)
    # Logo file is referenced by a relative path ("olive_farm.png").
    st.image(image="olive_farm.png", width=100)

    # Project description, README link and contributor list, written as raw
    # HTML together with the CSS classes it uses (hence unsafe_allow_html).
    st.markdown("""
        <style>
            .sidebar-text {
                text-align: justify;
                font-size: 14px;
                padding-bottom: 16px;
            }
            .list {
                font-size: 14px !important;
            }
           
        </style>
        <div class="sidebar-text">
            OliveFarm is a cutting-edge web application crafted by the innovative minds at 
            <a href="https://www.odiagenai.org/" target="_blank">OdiaGenAI.</a>
            It's designed to effortlessly generate LLM (Language Model) instruction sets in Indic languages. 
            Presently, it offers support for Hindi and Odia, with seamless scalability to incorporate 
            additional languages on the horizon.
        </div>
        <div class="sidebar-text">
            This versatile tool accommodates inputs from a variety of sources, including (URLs, PDF documents, and plain text).         
        </div>
        <div class="sidebar-text">
            Additionally, OliveFarm features a collection of pre-existing templates, powered by ChatGPT, 
            to streamline the process of generating instruction sets. Experience the future of 
            Indic language instruction with OliveFarm!
        </div>
        <div>
            Please follow the  
            <a href="https://github.com/OdiaGenAI/Olive_Farm/blob/main/README.md" target="_blank">GitHub README</a>
            instructions to generate the instruction set.
        </div>
        <div class="sidebar-text">
            Contributors:
        </div>
        <ul>
            <li class="list">AR Kamaldeen</li>
            <li class="list">SK Shahid</li>
            <li class="list">Sambit Sekhar</li>
            <li class="list">Parul Agarwal</li>
            <li class="list">Dr. Shantipriya Parida</li>
        </ul>
    """, unsafe_allow_html=True)

    # Centred copyright footer at the bottom of the sidebar.
    st.markdown(
    """
        <style>
            .copyright {
                text-align: center;
                font-size: 14px;
            }
        </style>
        <div class="copyright">
            © 2023 Odia Generative AI
        </div>
    """
    , unsafe_allow_html=True)

# function for the odia stoplists justext
def odia_stoplist():
    """Return the Odia stop words as an immutable set.

    The result is passed to ``justext.justext`` for Odia pages in place of a
    stoplist obtained from ``justext.get_stoplist``. The literal below repeats
    some entries; building a ``frozenset`` collapses them.
    """
    return frozenset((
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ",
    ))


# function to extract data from url using justext
def extract_data_from_url(url, language, timeout=30):
    """Download a web page and return its non-boilerplate text.

    The page is fetched with ``requests`` and split into paragraphs by
    justext, using the stoplist and thresholds chosen for ``language``.
    Every failure is reported to the user through ``st.error`` instead of
    being raised.

    Args:
        url: Address of the page to fetch.
        language: "English", "Hindi" or "Odia".
        timeout: Seconds to wait for the server before giving up, so an
            unresponsive host cannot block the app indefinitely.

    Returns:
        The kept paragraphs joined into one string, each preceded by a
        newline, or None when the language is unsupported, the request
        fails, or no paragraph survives boilerplate removal.
    """
    # Positional justext tuning arguments shared by Hindi and Odia; English
    # runs with justext's defaults.
    relaxed_thresholds = (70, 140, 0.0, 0.02, 0.5, 150, False)

    try:
        if language == "English":
            stoplist = justext.get_stoplist("English")
            thresholds = ()
        elif language == "Hindi":
            stoplist = justext.get_stoplist("Hindi")
            thresholds = relaxed_thresholds
        elif language == "Odia":
            stoplist = odia_stoplist()
            thresholds = relaxed_thresholds
        else:
            # Without this branch an unknown language only surfaced as an
            # unbound-variable error further down.
            st.error(f"Unsupported language: {language}")
            return None

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            st.error("Request failed ")
            return None

        paragraphs = justext.justext(response.content, stoplist, *thresholds)
        para = "".join(
            "\n" + paragraph.text
            for paragraph in paragraphs
            if not paragraph.is_boilerplate
        )
        if para == "":
            st.error("Unable to extract data from the URL")
            return None
        return para
    except Exception as err:
        # UI boundary: show network/parsing problems rather than crash the app.
        st.error(err)
        return None
        
    


# function to extract data from documents
def extract_data_from_documents(documents):
    """Concatenate the text content of uploaded txt, pdf and docx files.

    For each document its name, type and size are written to the page, then
    its text is appended to the result according to its MIME type. Documents
    of any other type contribute nothing.

    Args:
        documents: Iterable of uploaded-file objects exposing ``name``,
            ``type``, ``size`` and ``read()``, or None.

    Returns:
        The combined text of all readable documents (possibly empty), or
        None when ``documents`` is None.
    """
    if documents is None:
        st.error("Error: An error occurred while fetching content.")
        return None

    docx_type = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    data = ""
    for document in documents:
        st.write(
            {
                "filename": document.name,
                "filetype": document.type,
                "filesize": document.size,
            }
        )

        if document.type == "text/plain":
            # Uploaded content arrives as bytes; decode it as UTF-8.
            data += str(document.read(), "utf-8")

        elif document.type == "application/pdf":
            try:
                with pdfplumber.open(document) as pdf:
                    # extract_text() may yield None for a page without a text
                    # layer; treat that as an empty page instead of failing.
                    data += "".join(
                        (page.extract_text() or "") + "\n" for page in pdf.pages
                    )
            except Exception as err:
                # A broken PDF should not abort the remaining documents.
                st.error(f"Unable to read PDF {document.name}: {err}")

        elif document.type == docx_type:
            data += docx2txt.process(document)

    return data


# function for the keyboard



# Check the inputs for language, promptType
def valid_drop_down(language, promptType, noOfQuestions, instructionFormat):
    """Return True only when every compulsory form input has a truthy value.

    An empty selection (``""``), ``None`` or ``0`` for any of the four
    arguments makes the result False.
    """
    required_inputs = (language, promptType, noOfQuestions, instructionFormat)
    return all(bool(item) for item in required_inputs)


def main():
    """Render the instruction-generation workflow.

    Streamlit re-runs this function on every interaction, so progress is kept
    in ``st.session_state`` flags that gate each stage:

    1. ``form1`` collects language, prompt type, number of questions,
       instruction format and the OpenAI key (flag ``submit``).
    2. ``form2`` collects the text, URL or documents to use as context
       (flag ``extract_button``).
    3. "Generate Instructions" sends the context to gpt-3.5-turbo and stores
       the returned lines (flag ``generated``); ticking instructions sets
       ``selected``.
    4. "Generate Answers" asks the model to answer the ticked instructions
       (flag ``answered``) and offers the pairs as a JSONL download.

    "Clear" resets every flag, drops the stored data and re-runs the script.
    """
    # Create the stage flags on the first run so later reads never fail.
    for flag in ("extract_button", "submit", "generated", "selected", "answered"):
        if flag not in st.session_state:
            st.session_state[flag] = False

    st.subheader("LLM Instructions")

    # form to get the inputs
    with st.form(key="form1"):
        st.write("#")

        # dropdown for language
        language = st.selectbox("Select a language", ("", "English", "Hindi", "Odia"))

        # dropdown for prompt type
        promptType = st.selectbox(
            "Select the Prompt type", ("", "Input text", "Url", "Document")
        )
        # number of instructions to request
        noOfQuestions = st.number_input(
            "Number of questions to generate:", min_value=1, max_value=20, value=10
        )

        # dropdown for the instruction format
        instructionFormat = st.selectbox(
            "Format of instruction:", ("Imperative sentence", "Question")
        )

        # input text for openAiKey, kept in session state for the later stages
        openAiKey = st.text_input(label="Input the openai key", type="password")
        st.session_state["openAiKey"] = openAiKey

        st.write("##")

        # form submit button and setting up the session_state
        if st.form_submit_button():
            st.session_state.submit = True

    # Defaults so the extraction stage can run even on a pass where form2 did
    # not render its input widget (e.g. form1 was re-submitted with a blank
    # selection while extract_button was still set).
    inputText = None
    url = None
    documents = None

    if st.session_state.submit:
        # extends the prompt form to extract the data
        with st.expander(label="prompt"):
            with st.form(key="form2"):
                # only offer an input widget once the drop-downs are valid
                if valid_drop_down(
                    language, promptType, noOfQuestions, instructionFormat
                ):
                    if promptType == "Input text":
                        inputText = st.text_area(
                            label="For Instructions",
                            placeholder="Please enter your text here",
                        )

                    elif promptType == "Url":
                        url = st.text_input(
                            label="For URL", placeholder="Please enter your text here"
                        )
                    elif promptType == "Document":
                        documents = st.file_uploader(
                            label="For Documents ( pdf / txt / docx )",
                            type=["pdf", "txt", "docx"],
                            accept_multiple_files=True,
                        )

                    if st.form_submit_button():
                        st.session_state.extract_button = True

    # extracting data
    if st.session_state.extract_button:
        # Stays None when nothing could be extracted, so the check below is
        # always evaluated against a bound name.
        extractedData = None

        if promptType == "Input text":
            extractedData = inputText

        elif promptType == "Url":
            # extract_data_from_url reports its own errors and returns None.
            extractedData = extract_data_from_url(url, language)
            if extractedData is not None:
                st.text_area("Extracted Text:", value=extractedData, height=200)

        elif promptType == "Document":
            if not documents:
                st.error("Please upload at least one document.")
            else:
                for doc in documents:
                    if doc.name.split(".")[-1].lower() not in ["pdf", "txt", "docx"]:
                        # if documents is not the relevant type
                        st.error("Unsupported file: " + doc.name)

                extractedData = extract_data_from_documents(documents)

        # if the values are extracted running the custom prompt by creating an instance
        if extractedData:

            # -----------------------------    RUNNING THE PROMPT   -----------------------------
            st.session_state["extractedData"] = extractedData

            # "Initial" is True until instructions were generated once; it
            # hides the generate button afterwards.
            if "Initial" not in st.session_state:
                st.session_state.Initial = True

            if st.session_state.Initial:
                openai.api_key = st.session_state["openAiKey"]
                my_prompt_template = InstructionGenerationTemplate()

                # providing the rules for the instructions to be generated
                additional_rules = """
                - You do not need to provide a response to the generated examples.
                - You must return the response in the specified language.
                - Each generated instruction can be either an imperative sentence or a question.
                """
                try:
                    if st.button("Generate Instructions"):
                        prompt = my_prompt_template.format(
                            num_questions=noOfQuestions,
                            context=extractedData,
                            instruction_format=instructionFormat,
                            lang=language,
                            additional_rules=additional_rules,
                        )
                        response = openai.ChatCompletion.create(
                            model="gpt-3.5-turbo",
                            messages=[
                                {"role": "system", "content": prompt},
                            ],
                        )

                        if "result" not in st.session_state:
                            content = response.choices[0].message.content
                            # one instruction per non-empty line, with any
                            # leading "1. " style numbering stripped
                            responses_list = [
                                re.sub(r'^\s*\d+\.\s*', '', resp)
                                for resp in content.split('\n')
                                if resp
                            ]
                            st.session_state["result"] = responses_list
                            st.session_state.generated = True
                        st.session_state.Initial = False
                except Exception as err:
                    st.error(err)

            if st.session_state.generated:
                # displaying the generated instructions, one checkbox each
                st.write("Generated Instructions")
                result = st.session_state["result"]
                selected_items = [
                    f" {value} "
                    for number, value in enumerate(result, start=1)
                    if st.checkbox(f"Q{number} : {value}")
                ]
                # Display the selected items as a list
                if selected_items:
                    st.write("Selected Items:")
                    st.write(selected_items)
                    st.session_state["selected_items"] = selected_items
                    st.session_state.selected = True
                else:
                    st.write("No items selected.")

            # -----------------------------    RUNNING THE PROMPT FOR ANSWER GENERATION  -----------------------------

            if st.session_state.selected:

                # "Initial2" plays the same role as "Initial" for answers.
                if "Initial2" not in st.session_state:
                    st.session_state.Initial2 = True

                if st.session_state.Initial2:
                    openai.api_key = st.session_state["openAiKey"]
                    my_prompt_template2 = AnswerGenerationTemplate()

                    # providing the rules for the answers to be generated
                    additional_rules = """
                        Each generated answer should be within the <ans>Answer</ans> tag and the question should be within the <ques>Question</ques> tag.
                    """

                    question = st.session_state["selected_items"]
                    try:
                        if st.button("Generate Answers"):
                            prompt = my_prompt_template2.format(
                                questions=question,
                                additional_rules=additional_rules,
                            )
                            response = openai.ChatCompletion.create(
                                model="gpt-3.5-turbo",
                                messages=[
                                    {"role": "system", "content": prompt},
                                ],
                            )

                            if "answers" not in st.session_state:
                                content = response.choices[0].message.content
                                # answers are whatever the model wrapped in
                                # <ans>...</ans>, possibly spanning lines
                                responses_list = re.findall(
                                    r'<ans>(.*?)</ans>', content, re.DOTALL
                                )
                                st.session_state["answers"] = responses_list
                                st.session_state.answered = True
                            st.session_state.Initial2 = False
                    except Exception as e:
                        st.error(e)

                if st.session_state.answered:
                    # displaying the generated Answers
                    questions = st.session_state["selected_items"]
                    answers = st.session_state["answers"]
                    # 1-based position -> answer, so a question without a
                    # matching answer gets the fallback text
                    answers_dict = dict(enumerate(answers, start=1))

                    st.write("Generated Questions and Answers")
                    # Create a list of dictionaries
                    jsonl_data = [
                        {
                            "Instruction": question,
                            "Output": answers_dict.get(i, 'No answer found'),
                            "Input": "",
                        }
                        for i, question in enumerate(questions, start=1)
                    ]

                    st.write(jsonl_data)
                    jsonl_string = '\n'.join(
                        json.dumps(item, ensure_ascii=False) for item in jsonl_data
                    )

                    if st.download_button(label="Save as jsonl", data=jsonl_string, mime="application/json"):
                        st.success("Successfully saved")

        if st.button("Clear"):
            # back to the first stage
            st.session_state.extract_button = False
            st.session_state.submit = False
            st.session_state.generated = False
            st.session_state.selected = False
            st.session_state.answered = False

            # show both generate buttons again
            if "Initial" in st.session_state:
                st.session_state.Initial = True
            if "Initial2" in st.session_state:
                st.session_state.Initial2 = True

            # drop the key and all generated data
            for key in ("openAiKey", "extractedData", "result", "selected_items", "answers"):
                if key in st.session_state:
                    del st.session_state[key]

            # Use st.rerun where it exists; experimental_rerun is looked up
            # only when it does not.
            rerun = getattr(st, "rerun", None) or st.experimental_rerun
            rerun()


# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()