# NOTE: web-viewer residue (file-size banner, commit hashes and line-number gutter) was removed here; the module source starts below.
import whisper
import os
import random
import openai
import yt_dlp
from pytube import YouTube, extract
import pandas as pd
import plotly_express as px
import nltk
import plotly.graph_objects as go
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import streamlit as st
import en_core_web_lg
import validators
import re
import itertools
import numpy as np
from bs4 import BeautifulSoup   
import base64, time
from annotated_text import annotated_text
import pickle, math
import wikipedia
from pyvis.network import Network
import torch
from pydub import AudioSegment
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain

from langchain.callbacks import StreamlitCallbackHandler
from langchain.agents import OpenAIFunctionsAgent, AgentExecutor
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import (
    AgentTokenBufferMemory,
)
from langchain.prompts import MessagesPlaceholder

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from langchain.prompts import PromptTemplate

# Fetch the NLTK 'punkt' data at import time; sent_tokenize (imported below) is used by the chunking helpers.
nltk.download('punkt')


from nltk import sent_tokenize

# Value of the OPEN_AI_KEY environment variable (None when unset).
# NOTE(review): not referenced in the visible part of this file - confirm where it is consumed.
OPEN_AI_KEY = os.environ.get('OPEN_AI_KEY')
# Timestamp taken once when the module is executed; used in the summary download filename.
time_str = time.strftime("%d%m%Y-%H%M%S")
# HTML container with a single `{}` placeholder; filled via .format() by highlight_entities.
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; 
margin-bottom: 2.5rem">{}</div> """


###################### Functions #######################################################################################

#load all required models and cache
@st.cache_resource
def load_models():

    '''Build every shared model/pipeline once and let Streamlit cache them.

    Returns a 5-tuple, in this order:
    (sentiment pipeline, summarization pipeline, NER pipeline,
    cross-encoder, sentence-transformer).
    '''
    finbert_id = "nickmuchi/quantized-optimum-finbert-tone"
    conll_ner_id = "xlm-roberta-large-finetuned-conll03-english"

    # Model weights first, then their tokenizers
    finbert_model = ORTModelForSequenceClassification.from_pretrained(finbert_id)
    conll_model = AutoModelForTokenClassification.from_pretrained(conll_ner_id)
    finbert_tokenizer = AutoTokenizer.from_pretrained(finbert_id)
    conll_tokenizer = AutoTokenizer.from_pretrained(conll_ner_id)

    # Task pipelines built on top of the loaded models
    sentiment = pipeline("text-classification", model=finbert_model, tokenizer=finbert_tokenizer)
    summarizer = pipeline(
        "summarization",
        model="philschmid/flan-t5-base-samsum",
        clean_up_tokenization_spaces=True,
    )
    entity_tagger = pipeline(
        "ner",
        model=conll_model,
        tokenizer=conll_tokenizer,
        grouped_entities=True,
    )

    # Alternative cross-encoder: cross-encoder/ms-marco-MiniLM-L-12-v2
    reranker = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1')
    sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

    return sentiment, summarizer, entity_tagger, reranker, sentence_encoder

@st.cache_resource
def load_asr_model(model_name):

    '''Load the open source whisper model in cases where the API is not working.

    model_name is passed straight to whisper.load_model().

    Cached with st.cache_resource instead of st.cache_data: the return value
    is a model object, and cache_data would serialise it and hand back a
    fresh copy on every cache hit, whereas cache_resource shares the single
    loaded instance.
    '''
    model = whisper.load_model(model_name)

    return model

@st.cache_resource
def get_spacy():
    '''Load the en_core_web_lg spaCy pipeline (cached by Streamlit as a resource).'''
    return en_core_web_lg.load()
    
# Shared spaCy pipeline, loaded once through the cached get_spacy() helper.
nlp = get_spacy()    

# Unpack the cached models into module-level names; the functions below read them as globals.
sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert  = load_models()

@st.cache_data
def get_yt_audio(url):

    '''Download a YouTube video with pytube and return (downloaded_path, title).

    The stream chosen is the first progressive mp4 stream after ordering by
    resolution, descending; the file is saved by pytube's download() with its
    default arguments.
    '''
    video = YouTube(url)
    video_title = video.title

    # Narrow to progressive mp4 streams, highest resolution first
    progressive_mp4 = video.streams.filter(progressive=True, file_extension='mp4')
    best_stream = progressive_mp4.order_by('resolution').desc().first()

    downloaded_path = best_stream.download()

    return downloaded_path, video_title

@st.cache_data
def get_yt_audio_dl(url):

    '''Back up for when pytube is down: fetch the audio track with yt_dlp.

    Downloads the best available audio for `url`, converts it to mp3 through
    the FFmpegExtractAudio post-processor, and returns
    (path_to_mp3, title). The title is None when the metadata has no
    'title' entry.

    NOTE(review): the output path is fixed ('output/audio.mp3'), so each call
    overwrites the previous download while this function's cached return
    value for an earlier URL still points at that path - confirm this is
    acceptable for callers.
    '''

    temp_audio_file = os.path.join('output', 'audio')

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': temp_audio_file,
        'quiet': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:

        # A single call both returns the metadata and performs the download;
        # previously the metadata was extracted twice (once with
        # download=False and again inside ydl.download()).
        info = ydl.extract_info(url, download=True)
        title = info.get('title')

    # The post-processor writes the converted audio next to the template name
    audio_file = temp_audio_file + '.mp3'

    return audio_file, title

    
@st.cache_data
def load_whisper_api(audio):

    '''Send an audio file to the Open AI "whisper-1" model and return the response.

    audio: path of the audio file to open in binary mode.
    Returns whatever openai.Audio.translate returns; callers in this file
    read its 'text' entry.

    Note that this uses the *translate* endpoint, not transcribe.
    The cache key is the path string only, so a different file written to
    the same path yields the cached result of the earlier file.
    '''
    # Context manager guarantees the handle is closed even if the API call
    # raises (the handle used to be left open).
    with open(audio, "rb") as file:
        transcript = openai.Audio.translate("whisper-1", file)

    return transcript

@st.cache_data
def transcribe_yt_video(link, py_tube=True):
    '''Download the audio of a YouTube video and transcribe it with the Whisper API.

    link: URL of the YouTube video.
    py_tube: when True the file is fetched with pytube (mp4); when False the
        yt_dlp fallback is used (mp3).

    Returns (text, title). Files whose size (rounded to 0.1 MB) is at most
    25 MB are sent in one request; larger files are cut into 20-minute
    chunks, each chunk is transcribed separately and the pieces are joined
    with ','.

    Side effects: stores the downloaded path in st.session_state['audio'],
    writes chunk files under 'output/', and emits Streamlit status messages.
    '''

    # The two download back-ends differ only in the helper called and in the
    # container format of the file they produce.
    if py_tube:
        audio_file, title = get_yt_audio(link)
        audio_format = 'mp4'
    else:
        audio_file, title = get_yt_audio_dl(link)
        audio_format = 'mp3'

    print(f'audio_file:{audio_file}')

    st.session_state['audio'] = audio_file

    print(f"audio_file_session_state:{st.session_state['audio'] }")

    #Get size of audio file in MB
    audio_size = round(os.path.getsize(st.session_state['audio'])/(1024*1024),1)

    #Small enough for a single Whisper API request
    if audio_size <= 25:

        st.info("`Transcribing YT audio...`")

        #Use whisper API
        results = load_whisper_api(st.session_state['audio'])['text']

    else:

        st.warning('File size larger than 24mb, applying chunking and transcription',icon="⚠️")

        song = AudioSegment.from_file(st.session_state['audio'], format=audio_format)

        # PyDub handles time in milliseconds
        twenty_minutes = 20 * 60 * 1000

        video_id = extract.video_id(link)

        # Chunk files are exported here; make sure the folder exists first
        os.makedirs('output', exist_ok=True)

        transcriptions = []

        # Slicing with a step yields consecutive twenty-minute segments
        for i, chunk in enumerate(song[::twenty_minutes]):
            chunk_path = f'output/chunk_{i}_{video_id}.{audio_format}'
            chunk.export(chunk_path, format=audio_format)
            transcriptions.append(load_whisper_api(chunk_path)['text'])

        results = ','.join(transcriptions)

    st.info("`YT Video transcription process complete...`")

    return results, title

@st.cache_data
def inference(link, upload):
    '''Convert Youtube video or Audio upload to text.

    link: YouTube URL; used when it passes validators.url().
    upload: audio file to transcribe when no valid link is given.
        NOTE(review): it is handed to os.path.getsize(), open() and
        AudioSegment.from_file(), so it is presumably a filesystem path -
        confirm against the caller.

    Returns (text, title). For uploads the title is the fixed string
    "Transcribed Earnings Audio". Returns None when neither input is usable.

    Fixes relative to the previous version:
    * the upload branch referenced an undefined name (`_upload`) and
      therefore always raised NameError;
    * only the YouTube branch falls back to yt_dlp - upload failures used to
      be reported as "PyTube Error" and trigger a YouTube retry;
    * chunk files of large uploads get a per-call suffix, because
      load_whisper_api caches by path and fixed names returned the
      transcript of an earlier upload.
    '''

    if link and validators.url(link):

        st.info("`Downloading YT audio...`")

        try:

            return transcribe_yt_video(link)

        except Exception as e:

            st.error(f'''PyTube Error: {e}, 
                    Using yt_dlp module, might take longer than expected''',icon="🚨")

            return transcribe_yt_video(link, py_tube=False)

    if upload:

        #Get size of audio file in MB
        audio_size = round(os.path.getsize(upload)/(1024*1024),1)

        #Small enough for a single Whisper API request
        if audio_size <= 25:

            st.info("`Transcribing uploaded audio...`")

            #Use whisper API
            results = load_whisper_api(upload)['text']

        else:

            st.write('File size larger than 24mb, applying chunking and transcription')

            song = AudioSegment.from_file(upload)

            # PyDub handles time in milliseconds
            twenty_minutes = 20 * 60 * 1000

            # Chunk files are exported here; make sure the folder exists first
            os.makedirs('output', exist_ok=True)

            # Unique per call so cached transcripts of older chunks are never reused
            run_tag = time.time_ns()

            transcriptions = []

            st.info("`Transcribing uploaded audio...`")

            for i, chunk in enumerate(song[::twenty_minutes]):
                chunk_path = f'output/chunk_{i}_{run_tag}.mp4'
                chunk.export(chunk_path, format='mp4')
                transcriptions.append(load_whisper_api(chunk_path)['text'])

            results = ','.join(transcriptions)

        st.info("`Uploaded audio transcription process complete...`")

        return results, "Transcribed Earnings Audio"

    # Neither a valid link nor an upload was supplied
    return None

@st.cache_data
def clean_text(text):
    '''Strip non-ASCII characters, URLs, @mentions and #hashtags, then squeeze whitespace.'''

    # Drop every character that cannot be represented in ASCII
    cleaned = text.encode("ascii", "ignore").decode()

    # Applied in order; each match is replaced by a single space
    noise_patterns = (
        r"https*\S+",   # url
        r"@\S+",        # mentions
        r"#\S+",        # hashtags
        r"\s{2,}",      # runs of whitespace
    )
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, " ", cleaned)

    return cleaned

@st.cache_data
def chunk_long_text(text,threshold,window_size=3,stride=2):
    '''Preprocess text and chunk for sentiment analysis.

    text: cleaned text to split.
    threshold: maximum number of whitespace-separated words per piece;
        sentences with at least this many words are cut into consecutive
        pieces of `threshold` words.
    window_size: number of consecutive pieces joined into one passage.
    stride: step between the starts of successive passages.

    Returns the list of passages.

    Fix: a sentence whose word count was an exact multiple of `threshold`
    used to contribute an extra empty piece ('').
    '''

    #Convert cleaned text into sentences
    sentences = sent_tokenize(text)
    out = []

    #Limit the length of each sentence to a threshold
    for sentence in sentences:
        words = sentence.split()
        if len(words) < threshold:
            out.append(sentence)
        else:
            # Stepping up to len(words) never yields an empty trailing slice
            out.extend(' '.join(words[i:i+threshold]) for i in range(0, len(words), threshold))

    #Combine pieces into windows of size window_size (slicing clamps at the end)
    passages = [" ".join(out[start_idx:start_idx+window_size])
                for start_idx in range(0, len(out), stride)]

    return passages

@st.cache_data
def sentiment_pipe(earnings_text):
    '''Classify the sentiment of each chunk of the text.

    The text is chunked with chunk_long_text(earnings_text, 150, 1, 1) and the
    chunks are passed to the sentiment pipeline.
    Returns (pipeline_output, chunks).
    '''
    chunks = chunk_long_text(earnings_text, 150, 1, 1)
    predictions = sent_pipe(chunks)

    return predictions, chunks

@st.cache_data
def chunk_and_preprocess_text(text, model_name= 'philschmid/flan-t5-base-samsum'):

    '''Chunk and preprocess text for summarization.

    Sentences are packed greedily into chunks whose token count (according
    to the tokenizer of `model_name`) stays within
    tokenizer.max_len_single_sentence. A sentence that does not fit starts a
    new chunk. Returns the list of chunk strings; each sentence in a chunk
    is followed by a single space.

    Fixes: the final chunk was lost whenever the last sentence overflowed
    the current chunk, and an empty chunk was emitted when the very first
    sentence was already over the limit.
    '''

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentences = sent_tokenize(text)
    max_tokens = tokenizer.max_len_single_sentence

    length = 0      # tokens accumulated in the chunk being built
    chunk = ""
    chunks = []

    for sentence in sentences:
        # Tokenize once per sentence and reuse the count in both branches
        sentence_length = len(tokenizer.tokenize(sentence))

        if length + sentence_length <= max_tokens:
            chunk += sentence + " "
            length += sentence_length
        else:
            # Close the current chunk (if it holds anything) and start a new
            # one with the overflow sentence
            if chunk:
                chunks.append(chunk)
            chunk = sentence + " "
            length = sentence_length

    # Flush whatever is left, regardless of which branch handled the last sentence
    if chunk:
        chunks.append(chunk)

    return chunks

@st.cache_data
def summarize_text(text_to_summarize,max_len,min_len):
    '''Run the summarization pipeline and join the summaries with spaces.

    max_len / min_len are forwarded as max_length / min_length; decoding uses
    4 beams with early stopping and no sampling.
    '''
    generation_options = dict(
        max_length=max_len,
        min_length=min_len,
        do_sample=False,
        early_stopping=True,
        num_beams=4,
    )
    outputs = sum_pipe(text_to_summarize, **generation_options)

    return ' '.join(item['summary_text'] for item in outputs)

@st.cache_data
def get_all_entities_per_sentence(text):
    '''Return, for every spaCy sentence, the entity strings found in it.

    `text` is joined with ''.join() before parsing. For each sentence the
    result holds the spaCy entities first, followed by the "word" field of
    every hit returned by the NER pipeline for that sentence.
    '''
    parsed = nlp(''.join(text))

    per_sentence = []
    for sent in parsed.sents:
        # spaCy entities
        spacy_names = [str(ent) for ent in sent.ents]
        # XLM-RoBERTa entities
        xlm_names = [str(hit["word"]) for hit in ner_pipe(str(sent))]
        per_sentence.append(spacy_names + xlm_names)

    return per_sentence

@st.cache_data
def get_all_entities(text):
    '''Return one flat list with the entities of every sentence in `text`.'''
    flat_entities = []
    for sentence_entities in get_all_entities_per_sentence(text):
        flat_entities.extend(sentence_entities)
    return flat_entities

def _drop_contained_entities(entities):
    '''Remove entities that are a case-insensitive substring of a different entity in the list.'''
    contained = {
        entity for entity in entities
        if any(entity != other and entity.lower() in other.lower() for other in entities)
    }
    return [entity for entity in entities if entity not in contained]


@st.cache_data
def get_and_compare_entities(article_content,summary_output):
    '''Split the summary's entities into those supported by the article and those not.

    A summary entity counts as matched when it is a case-insensitive
    substring of some article entity, or when the inner product of its sbert
    embedding with an article entity's embedding exceeds 0.9.

    Returns (matched_entities, unmatched_entities); both lists are
    de-duplicated (first occurrence kept) and entities contained in another
    entity of the same list are dropped.

    Embeddings are now computed once per entity instead of once per
    (summary entity, article entity) pair; the comparison itself is unchanged.
    '''

    entities_article = list(itertools.chain.from_iterable(get_all_entities_per_sentence(article_content)))
    entities_summary = list(itertools.chain.from_iterable(get_all_entities_per_sentence(summary_output)))

    article_lowered = [art_entity.lower() for art_entity in entities_article]
    article_embeddings = None   # filled lazily: only needed when a substring test fails

    matched_entities = []
    unmatched_entities = []
    for entity in entities_summary:
        lowered = entity.lower()
        if any(lowered in art_lowered for art_lowered in article_lowered):
            matched_entities.append(entity)
            continue

        if article_embeddings is None:
            # Duplicates cannot change the outcome of any(), so encode each distinct entity once
            article_embeddings = [sbert.encode(art_entity, show_progress_bar=False)
                                  for art_entity in dict.fromkeys(entities_article)]

        entity_embedding = sbert.encode(entity, show_progress_bar=False)
        if any(np.inner(entity_embedding, art_embedding) > 0.9 for art_embedding in article_embeddings):
            matched_entities.append(entity)
        else:
            unmatched_entities.append(entity)

    # De-duplicate while keeping first-seen order
    matched_entities = list(dict.fromkeys(matched_entities))
    unmatched_entities = list(dict.fromkeys(unmatched_entities))

    return _drop_contained_entities(matched_entities), _drop_contained_entities(unmatched_entities)

def _wrap_entity(text, entity, opening_tag, closing_tag):
    '''Wrap occurrences of `entity` in `text` with the given tags, skipping text inside an rgb(...) value.'''
    # re.escape: entity text is data, not a pattern (e.g. "U.S.", "C++", "AT&T (").
    pattern = '(' + re.escape(entity) + ')' + r'(?![^rgb\(]*\))'
    replacement = opening_tag + entity + closing_tag
    # A callable replacement is inserted verbatim, so backslashes in the entity are not treated as escapes
    return re.sub(pattern, lambda _match: replacement, text)


@st.cache_data
def highlight_entities(article_content,summary_output):
    '''Return the summary as HTML with its entities highlighted.

    Entities that get_and_compare_entities() reports as matched against the
    article are marked green, the unmatched ones red. The marked-up text is
    parsed with BeautifulSoup and placed into HTML_WRAPPER.

    Fix: entity text used to be interpolated unescaped into the regular
    expression, so entities containing regex metacharacters raised re.error
    or matched the wrong text.
    '''

    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
    markdown_end = "</mark>"

    matched_entities, unmatched_entities = get_and_compare_entities(article_content,summary_output)

    for entity in matched_entities:
        summary_output = _wrap_entity(summary_output, entity, markdown_start_green, markdown_end)

    for entity in unmatched_entities:
        summary_output = _wrap_entity(summary_output, entity, markdown_start_red, markdown_end)

    soup = BeautifulSoup(summary_output, features="html.parser")

    return HTML_WRAPPER.format(soup)

def summary_downloader(raw_text):
    '''Render a heading and a download link that carries `raw_text` as a base64 data URI.

    The suggested filename embeds the module-level `time_str` timestamp.
    '''
    encoded = base64.b64encode(raw_text.encode()).decode()
    new_filename = f"new_text_file_{time_str}_.txt"

    st.markdown("#### Download Summary as a File ###")

    link_html = f'<a href="data:file/txt;base64,{encoded}" download="{new_filename}">Click to Download!!</a>'
    st.markdown(link_html, unsafe_allow_html=True)

@st.cache_data
def generate_eval(raw_text, N, chunk):
    '''Generate sample question/answer pairs from random excerpts of the text.

    raw_text: text to draw excerpts from.
    N: number of excerpts (one QAGenerationChain run per excerpt).
    chunk: excerpt length in characters.

    Returns the flattened list of everything the chain produced. An excerpt
    whose generation fails is reported with a Streamlit warning and skipped.

    Fix: when the text is shorter than `chunk`, random.randint(0, n-chunk)
    raised ValueError; the start index is now clamped to 0, so the whole
    text is used as the excerpt.
    '''

    update = st.empty()
    ques_update = st.empty()
    update.info("`Generating sample questions ...`")

    # Highest start index that still leaves `chunk` characters (0 for short texts)
    last_start = max(0, len(raw_text) - chunk)
    starting_indices = [random.randint(0, last_start) for _ in range(N)]
    sub_sequences = [raw_text[i:i+chunk] for i in starting_indices]

    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
    eval_set = []

    for i, b in enumerate(sub_sequences):
        try:
            qa = chain.run(b)
            eval_set.append(qa)
            ques_update.info(f"Creating Question: {i+1}")

        except Exception as e:
            # Best effort: report the failed excerpt and carry on with the rest
            print(e)
            st.warning(f'Error in generating Question: {i+1}...', icon="⚠️")
            continue

    eval_set_full = list(itertools.chain.from_iterable(eval_set))

    # Remove the temporary status messages
    update.empty()
    ques_update.empty()

    return eval_set_full

@st.cache_resource
def create_prompt_and_llm():
    '''Create the agent prompt and the chat model.

    Returns (prompt, llm): the prompt is built by
    OpenAIFunctionsAgent.create_prompt from the system message below plus a
    "history" messages placeholder; the llm is a streaming ChatOpenAI
    ("gpt-4", temperature 0).

    Fix: the adjacent string literals of the system message were
    concatenated without separating spaces (e.g. "about that.Do not ..."),
    and two garbled phrases were corrected.
    '''

    llm = ChatOpenAI(temperature=0, streaming=True, model="gpt-4")

    # Every fragment except the last ends with a space so sentences stay separated
    message = SystemMessage(
        content=(
            "You are a helpful chatbot who is tasked with answering questions accurately about earnings call transcript provided. "
            "Unless otherwise explicitly stated, it is probably fair to assume that questions are about the earnings call transcript. "
            "If there is any ambiguity, you probably assume they are about that. "
            "Do not use any information not provided in the earnings context and remember you are to speak like a finance expert. "
            "If you don't know the answer, just say 'There is no relevant answer in the given earnings call transcript'; "
            "don't try to make up an answer"
        )
    )

    prompt = OpenAIFunctionsAgent.create_prompt(
        system_message=message,
        extra_prompt_messages=[MessagesPlaceholder(variable_name="history")],
    )

    return prompt, llm
    
@st.cache_resource
def gen_embeddings(embedding_model):

    '''Generate the embeddings object for the given model name.

    The wrapper class is picked from a substring of `embedding_model`:
    'hkunlp' -> HuggingFaceInstructEmbeddings (with finance-specific
    query/embed instructions), 'mpnet' -> HuggingFaceEmbeddings,
    'FlagEmbedding' -> HuggingFaceBgeEmbeddings (normalized embeddings).

    Raises ValueError for any other name; previously that case fell through
    to `return embeddings` and failed with an UnboundLocalError.
    '''

    if 'hkunlp' in embedding_model:

        embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model,
                                           query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
                                           embed_instruction='Represent the Financial paragraph for retrieval: ')

    elif 'mpnet' in embedding_model:

        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    elif 'FlagEmbedding' in embedding_model:

        encode_kwargs = {'normalize_embeddings': True}
        embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model,
                                                   encode_kwargs = encode_kwargs
                                                   )

    else:

        raise ValueError(
            f"Unsupported embedding model {embedding_model!r}: expected a name containing "
            "'hkunlp', 'mpnet' or 'FlagEmbedding'"
        )

    return embeddings
    
@st.cache_data
def create_vectorstore(corpus, title, embedding_model, chunk_size=1000, overlap=50):

    '''Split the corpus and index the pieces in a FAISS vector store.

    corpus: text to index.
    title: not used inside this function.
    embedding_model: model name handed to gen_embeddings().
    chunk_size / overlap: forwarded to RecursiveCharacterTextSplitter.

    Each piece is stored with metadata {"source": <position of the piece>}.
    '''
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    pieces = splitter.split_text(corpus)

    embedder = gen_embeddings(embedding_model)
    source_tags = [{"source": position} for position, _ in enumerate(pieces)]

    return FAISS.from_texts(pieces, embedder, metadatas=source_tags)

@st.cache_data
def create_memory_and_agent(query,_docsearch):

    '''Build the retrieval agent and its token-buffer memory.

    query: not used in the body.
    _docsearch: vector store; turned into a retriever returning the top 4
        documents and exposed to the agent as the "earnings_call_search" tool.

    Returns (memory, agent_executor).
    '''
    retriever = _docsearch.as_retriever(search_kwargs={"k": 4})

    search_tool = create_retriever_tool(
        retriever,
        "earnings_call_search",
        "Searches and returns documents using the earnings context provided as a source, relevant to the user input question.",
    )
    toolbox = [search_tool]

    prompt, llm = create_prompt_and_llm()

    functions_agent = OpenAIFunctionsAgent(llm=llm, tools=toolbox, prompt=prompt)
    executor = AgentExecutor(
        agent=functions_agent,
        tools=toolbox,
        verbose=True,
        return_intermediate_steps=True,
    )

    return AgentTokenBufferMemory(llm=llm), executor

@st.cache_data
def gen_sentiment(text):
    '''Return the label of the first prediction the sentiment pipeline gives for `text`.'''
    first_prediction = sent_pipe(text)[0]
    return first_prediction['label']

@st.cache_data
def gen_annotated_text(df):
    '''Turn each dataframe row into a (text, label, colour) tuple.

    The first column is read as the text and the second as the label.
    'Positive' maps to '#8fce00', 'Negative' to '#f44336', and any other
    label to '#000000'.
    '''
    colour_by_label = {'Positive': '#8fce00', 'Negative': '#f44336'}

    # itertuples() puts the index at position 0, so the columns start at 1
    return [
        (row[1], row[2], colour_by_label.get(row[2], '#000000'))
        for row in df.itertuples()
    ]
    
    
def display_df_as_table(model,top_k,score='score'):
    '''Build a Score/Text dataframe from the first `top_k` hits.

    Each hit is a mapping; hit[score] becomes the Score column (rounded to
    two decimals) and hit['corpus_id'] indexes the module-level `passages`
    sequence to obtain the Text column.
    '''
    rows = []
    for hit in model[0:top_k]:
        rows.append((hit[score], passages[hit['corpus_id']]))

    table = pd.DataFrame(rows, columns=['Score','Text'])
    table['Score'] = table['Score'].round(2)

    return table

      
def make_spans(text,results):
    '''Pair each sentence of `text` with the 'label' of the corresponding result.

    Sentences come from `sent_tokenizer`.
    NOTE(review): `sent_tokenizer` is not defined in the visible part of this
    file (only nltk's `sent_tokenize` is imported) - confirm it exists.
    '''
    labels = [result['label'] for result in results]
    return list(zip(sent_tokenizer(text), labels))

##Fiscal Sentiment by Sentence
def fin_ext(text):
    '''Classify every sentence of `text` with `remote_clx` and return the (sentence, label) spans.

    NOTE(review): `remote_clx` and `sent_tokenizer` are not defined in the
    visible part of this file - confirm they exist.
    '''
    sentences = sent_tokenizer(text)
    return make_spans(text, remote_clx(sentences))

## Knowledge Graphs code

def get_article(url):
    '''Create an Article for `url`, download and parse it, and return the Article object.'''
    fetched = Article(url)
    # download() must run before parse()
    fetched.download()
    fetched.parse()
    return fetched